scrapy 插件(扩展,信号 signals 与 logging)

注意:插件在配置文件中,后面的数值(优先级)不是很重要,除非某些插件是依赖于某些插件的时候

一、主要作用:信号(signals)

1、方法

import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(object):
    """Scrapy extension that logs spider open/close events and item progress.

    Enabled through the ``MYEXT_ENABLED`` setting; emits a progress line
    every ``MYEXT_ITEMCOUNT`` scraped items (default 1000).
    """

    def __init__(self, item_count):
        # Emit a progress log line every `item_count` scraped items.
        self.item_count = item_count
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings and wire up signals."""
        # Bail out early unless the extension is explicitly enabled.
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # Read the reporting interval from settings and build the instance.
        extension = cls(crawler.settings.getint('MYEXT_ITEMCOUNT', 1000))

        # Subscribe each handler to the crawler's signal dispatcher.
        wiring = (
            (extension.spider_opened, signals.spider_opened),
            (extension.spider_closed, signals.spider_closed),
            (extension.item_scraped, signals.item_scraped),
        )
        for handler, signal in wiring:
            crawler.signals.connect(handler, signal=signal)

        return extension

    def spider_opened(self, spider):
        # Fired once when the spider starts crawling.
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        # Fired once when the spider finishes.
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Count scraped items; log at every multiple of item_count.
        self.items_scraped += 1
        if not self.items_scraped % self.item_count:
            logger.info("scraped %d items", self.items_scraped)
信号 item_dropped(统计被丢弃的 item)示例

 

import logging
from collections import defaultdict
from datetime import datetime
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging(object):
    """Scrapy extension that logs spider lifecycle events, item throughput
    (scraped and dropped), and the per-minute error rate of responses.

    Enabled through the ``MYEXT_ENABLED`` setting; progress lines are
    emitted every ``MYEXT_ITEMCOUNT`` items (default 1000).
    """

    def __init__(self, item_count):
        # Log interval for scraped/dropped item counters.
        self.item_count = item_count
        self.items_scraped = 0
        self.items_dropped = 0
        # Responses received, bucketed per minute ('%Y%m%d%H%M' key).
        self.stats = defaultdict(int)
        # Error responses (401/403/404/500) in the same minute buckets.
        self.error_stats = defaultdict(int)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension from crawler settings and connect signals."""
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # get the number of items from settings
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(ext.response_received, signal=signals.response_received)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        # Lazy %-style args: formatting only happens if the level is enabled.
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Count scraped items; log at every multiple of item_count.
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

    def item_dropped(self, item, spider, response, exception):
        # Count dropped items; log at every multiple of item_count.
        self.items_dropped += 1
        if self.items_dropped % self.item_count == 0:
            logger.info("dropped %d items", self.items_dropped)

    def response_received(self, response, request, spider):
        # Bucket responses into the current minute and warn whenever the
        # error share of that minute exceeds 20%.
        now = datetime.now().strftime('%Y%m%d%H%M')
        self.stats[now] += 1
        if response.status in (401, 403, 404, 500):
            self.error_stats[now] += 1
        # FIX: was `logging.warning(...)` (root logger) with eager %-format;
        # use the module-level logger with lazy arguments instead.
        if self.error_stats[now] / float(self.stats[now]) > 0.2:
            logger.warning(
                'received %s response and %s of them is none 200 %s',
                self.stats[now], self.error_stats[now], now)

item_dropped和item_scraped用法相同,具体看官网

2、配置文件

EXTENSIONS = {
   # 'scrapy.extensions.telnet.TelnetConsole': None,
   'toscrapy.mysingles.MySingle': 555,
}

注意:插件后面的数值,不重要

二、Telnet

1、线上部署爬虫,将Telnet设置成None

EXTENSIONS = {
   'scrapy.extensions.telnet.TelnetConsole': None,
}

2、使用

telnet ip port

3、est()

作用:查看当前爬虫的状态

posted @ 2019-11-12 00:10  市丸银  阅读(244)  评论(0编辑  收藏  举报