Scrapy extensions (signals and logging)
Note: the numeric order value assigned to an extension in the settings file is not very important, unless one extension depends on another.
I. Main purpose: signals
1. Handler methods
import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(object):

    def __init__(self, item_count):
        self.item_count = item_count
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # get the number of items from settings
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)
import logging
from collections import defaultdict
from datetime import datetime
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(object):

    def __init__(self, item_count):
        self.item_count = item_count
        self.items_scraped = 0
        self.items_dropped = 0
        self.stats = defaultdict(int)        # responses received per minute
        self.error_stats = defaultdict(int)  # error responses per minute

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # get the number of items from settings
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(ext.response_received, signal=signals.response_received)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

    def item_dropped(self, item, spider, response, exception):
        self.items_dropped += 1
        if self.items_dropped % self.item_count == 0:
            logger.info("dropped %d items", self.items_dropped)

    def response_received(self, response, request, spider):
        # bucket responses per minute and warn when the error rate exceeds 20%
        now = datetime.now().strftime('%Y%m%d%H%M')
        self.stats[now] += 1
        if response.status in (401, 403, 404, 500):
            self.error_stats[now] += 1
        if self.error_stats[now] / float(self.stats[now]) > 0.2:
            logger.warning('received %s responses in minute %s and %s of them are non-200',
                           self.stats[now], now, self.error_stats[now])
item_dropped is used the same way as item_scraped; see the official docs for details.
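Note that item_dropped only fires when an item pipeline raises DropItem. A minimal sketch of such a pipeline, just to trigger the item_dropped handler above (the pipeline class and the 'name' field are hypothetical):

from scrapy.exceptions import DropItem

class DropUnnamedPipeline(object):
    # hypothetical pipeline: raising DropItem fires signals.item_dropped
    def process_item(self, item, spider):
        if not item.get('name'):
            raise DropItem('missing name in %r' % item)
        return item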
2. Settings file
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
    'toscrapy.mysingles.MySingle': 555,
}
Note: the order value after an extension is not important.
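The from_crawler checks in the examples above also read two project settings. A minimal sketch of the corresponding entries in settings.py (the setting names come straight from the code above):

# settings.py
MYEXT_ENABLED = True    # without this, from_crawler raises NotConfigured and the extension is skipped
MYEXT_ITEMCOUNT = 1000  # optional; the code above falls back to 1000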
II. Telnet
1. When deploying spiders to production, disable the Telnet console by setting it to None:
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}
2. Usage
telnet <ip> <port>
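With the default settings the console binds to 127.0.0.1 on the first free port in the 6023-6073 range, so a local session usually starts with the command below; recent Scrapy versions additionally require the username and password printed in the crawl log:

telnet localhost 6023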
3. est()
Purpose: inspect the current status of the running crawler engine.
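A minimal sketch of an interactive session; besides est(), the console also exposes objects such as engine, spider, stats, and settings:

>>> est()              # print a report of the engine status
>>> stats.get_stats()  # dump the current values from the stats collector
>>> engine.pause()     # pause the running engine
>>> engine.unpause()   # resume it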