scrapy主动退出爬虫的代码片段(python3)

问题:在运行scrapy的过程中,如果想主动退出该怎么做?

背景:比如说我只要爬取当日的新闻,那么在遍历的时候,如果出现了超过1条不是当日的新闻,那么就不爬取了,就主动退出爬虫,这个时候该怎么做呢?

IDE:pycharm

版本:python3

框架:scrapy

系统:windows10

代码如下:

# -*- coding: utf-8 -*-
import scrapy
from torrentSpider.items.NavigationItem import NavigationItem
from torrentSpider.items.TorrentItem import TorrentItem
import time
import random
import logging
import os


class XxxSpider(scrapy.Spider):
    """Crawl listing pages, follow every detail link, and stop on stale entries.

    ``parse`` walks a listing page, schedules one request per detail link and
    then follows the "next page" link. ``parse_links`` fills a ``TorrentItem``
    from a detail page and yields it only when its update time equals today's
    date; once more than ``max_stale_items`` non-matching pages have been
    seen, the spider asks the engine to shut down.
    """

    name = "xxx_spider"
    # Use the registered domain rather than 'www.xxx.com': links are built
    # from web_pre_url (bare 'xxx.com' host), and the next-page request is not
    # sent with dont_filter, so a 'www.'-only entry makes the offsite filter
    # drop it. 'xxx.com' also covers the 'www.' sub-domain of start_urls.
    allowed_domains = ['xxx.com']
    start_urls = ['http://www.xxx.com/1.html']

    # Prefix prepended to every relative href found on the site.
    web_pre_url = 'http://xxx.com'
    # Number of detail pages seen so far whose update time is not today.
    count = 0
    # The spider is closed once ``count`` exceeds this value.
    max_stale_items = 1

    # (item field, XPath) pairs read from a detail page, in assignment order.
    # NOTE(review): these absolute paths contain /tbody/, which browsers often
    # insert but raw HTML may lack - confirm against the real response body.
    _detail_xpaths = (
        # Title
        ('torrent_title', '/html/body/div[2]/table[1]/tbody/tr/td/font/text()'),
        # Movie name
        ('torrent_name', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[1]/text()'),
        # Director
        ('torrent_director', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[2]/text()'),
        # Cast
        ('torrent_actor', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/span/font[2]/text()'),
        # Language
        ('torrent_language', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[3]/text()'),
        # Genre
        ('torrent_type', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[4]/text()'),
        # Region
        ('torrent_region', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[5]/text()'),
        # Update time
        ('torrent_update_time', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[6]/text()'),
        # Status
        ('torrent_status', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[7]/text()'),
        # Release date
        ('torrent_show_time', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[8]/text()'),
        # Plot summary
        ('torrent_introduction', '/html/body/div[2]/table[2]/tbody/tr/td/div[2]/text()'),
        # Resource address
        ('torrent_url', '//*[@id="plist"]/table[2]/tbody/tr[2]/td/ul/li/input/@value'),
    )

    def parse(self, response):
        """Handle a listing page.

        Yields one ``scrapy.Request`` per detail link (callback
        ``parse_links``) followed by a request for the next listing page
        (callback ``parse``) when a pagination link is present.
        """
        # Random 0-5 second pause before processing the page.
        # NOTE(review): time.sleep blocks Scrapy's single-threaded event loop,
        # stalling every in-flight download; the DOWNLOAD_DELAY /
        # RANDOMIZE_DOWNLOAD_DELAY settings are the non-blocking alternative.
        time.sleep(random.randint(0, 5))

        # The site title is the same for every navigation entry: query it once.
        # extract_first() returns None instead of raising when nothing matches.
        site_title = response.xpath('//*[@id="logoSea"]/div[1]/a/img/@alt').extract_first()

        # NOTE(review): the navigation items built below are never yielded or
        # stored, so they have no effect on the crawl output - confirm whether
        # they were meant to be yielded.
        # Main navigation: entries are read from li[2] up to li[len].
        navigation_type_number = response.xpath('//*[@id="hypoNav"]/div/ul/li/em/a/text()').extract()
        for n_k in range(1, len(navigation_type_number)):
            nav_li = '//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a'
            navigation_item = NavigationItem()
            # Site title
            navigation_item['navigation_title'] = site_title
            # Navigation category name
            navigation_item['navigation_type'] = response.xpath(nav_li + '/text()').extract_first()
            # Navigation link
            navigation_item['navigation_url'] = response.xpath(nav_li + '/@href').extract_first()

        # Sub navigation: entries are read from li[1] up to li[len - 1].
        # NOTE(review): unlike the loop above, this one never reaches the last
        # <li>; confirm that skipping it is intended.
        sub_navigation_type_number = response.xpath('//*[@id="nodeNav"]/div/ul/li/em/a/span/text()').extract()
        for sub_k in range(1, len(sub_navigation_type_number)):
            sub_nav_li = '//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a'
            sub_navigation_item = NavigationItem()
            # Site title
            sub_navigation_item['navigation_title'] = site_title
            # Sub navigation category name
            sub_navigation_item['sub_navigation_type'] = response.xpath(sub_nav_li + '/span/text()').extract_first()
            # Sub navigation link
            sub_navigation_item['sub_navigation_url'] = response.xpath(sub_nav_li + '/@href').extract_first()

        # Number of rows in the listing table; len() on the selector list
        # avoids serialising every <tr> just to count them.
        row_count = len(response.xpath('/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr'))
        # NOTE(review): XPath positions are 1-based, so range(1, row_count)
        # visits tr[1]..tr[row_count - 1] and never the last row - confirm the
        # last row is not a movie entry.
        for i_k in range(1, row_count):
            # Relative link to the detail page of this row.
            str_sub_url = '/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr[' + str(i_k) + ']/td[1]/a/@href'
            href = response.xpath(str_sub_url).extract_first()
            if not href:
                # A row without a link must not abort the whole page.
                continue
            yield scrapy.Request(url=self.web_pre_url + href, callback=self.parse_links, dont_filter=True)

        # Follow pagination: with a single match use it, otherwise use the
        # second match.
        next_link = response.xpath('//*[@class="pagegbk"]/@href').extract()
        if next_link:
            target = next_link[0] if len(next_link) == 1 else next_link[1]
            yield scrapy.Request(self.web_pre_url + target, callback=self.parse)

    def parse_links(self, response):
        """Handle a detail page.

        Yields the populated ``TorrentItem`` when its update time equals
        today's date (``%Y-%m-%d``, local time). Otherwise increments
        ``count`` and, once it exceeds ``max_stale_items``, asks the engine
        to close the spider.
        """
        torrent_item = TorrentItem()
        for field_name, field_xpath in self._detail_xpaths:
            torrent_item[field_name] = self.check_xpath_value(response, field_xpath)

        # Today's date in the format the comparison below expects.
        current_date = time.strftime('%Y-%m-%d', time.localtime())
        self.logger.debug('current_date = %s', current_date)
        self.logger.debug('torrent_update_time = %s', torrent_item['torrent_update_time'])

        # Compare on the stripped text so surrounding whitespace in the page
        # does not turn a same-day entry into a mismatch.
        if torrent_item['torrent_update_time'].strip() == current_date:
            yield torrent_item
            return

        # Not updated today: count it and stop once the limit is exceeded.
        self.count += 1
        if self.count > self.max_stale_items:
            # Shutdown is graceful, not immediate: requests already handed to
            # the downloader still complete. The reason text mentions 10, but
            # the effective limit is max_stale_items.
            self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

    @staticmethod
    def check_xpath_value(response, xpath_url):
        """Return the first match of ``xpath_url``, or ``"null"`` if unusable.

        The first extracted value is returned unchanged (not stripped) when it
        contains non-whitespace text; the string ``"null"`` is returned when
        there is no match or the first match is blank.
        """
        xpath_value = response.xpath(xpath_url).extract()
        if xpath_value and xpath_value[0].strip() != '':
            return xpath_value[0]
        return "null"

注意以上代码中标红的地方:

self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

1,此行代码是写在spider文件中的

2,虽然这一行代码会停止爬虫,但是这一行代码的停止并不是立即停止

原因是:当我们不更改爬虫的settings.py文件的时候,默认配置是:

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

含义就是:Scrapy downloader 并发请求(concurrent requests)的最大值,默认: 16

那么问题就来了:按照以上的写法,在调用停止的那一刻,已经有十几个请求被交给下载器并发执行了(并发上限默认是16),你停止之后,这十几个请求依旧会执行完,所以并不是立即停止。如果想改变这一点,就必须修改此项配置,设为:

CONCURRENT_REQUESTS = 1

 

具体scrapy爬虫原理请自行百度,并请自行调试,谢谢~

 

posted @ 2019-01-16 18:05  me-ht  阅读(1391)  评论(0)  编辑  收藏  举报