scrapy-scheduler
# Role of the scheduler: it controls how Request objects are stored and
# retrieved, and it also provides filtering of duplicate Requests.


class Scheduler(object):
    """Hold the dupefilter, queue classes and options used for scheduling."""

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None):
        """Store the collaborators handed in by the caller.

        Args:
            dupefilter: object kept as ``self.df``.
            jobdir: value passed to ``self._dqdir`` to derive ``self.dqdir``.
            dqclass: disk queue class.
            mqclass: memory queue class.
            logunser: flag kept as ``self.logunser``.
            stats: object kept as ``self.stats``.
            pqclass: priority queue class.
        """
        self.df = dupefilter
        # _dqdir is defined elsewhere in this class; it is called right after
        # self.df is set and before any other attribute exists.
        self.dqdir = self._dqdir(jobdir)

        # Queue classes.
        self.pqclass = pqclass  # priority queue
        self.dqclass = dqclass  # disk (serialized) queue, used to resume an interrupted crawl
        self.mqclass = mqclass  # in-memory queue

        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from ``crawler.settings`` and ``crawler.stats``.

        The dupefilter class and the three queue classes are resolved with
        ``load_object`` from the ``DUPEFILTER_CLASS``,
        ``SCHEDULER_PRIORITY_QUEUE``, ``SCHEDULER_DISK_QUEUE`` and
        ``SCHEDULER_MEMORY_QUEUE`` settings. The dupefilter instance is
        created through the loaded class's ``from_settings``.
        """
        conf = crawler.settings

        filter_factory = load_object(conf['DUPEFILTER_CLASS'])
        request_filter = filter_factory.from_settings(conf)

        priority_queue_cls = load_object(conf['SCHEDULER_PRIORITY_QUEUE'])
        disk_queue_cls = load_object(conf['SCHEDULER_DISK_QUEUE'])
        memory_queue_cls = load_object(conf['SCHEDULER_MEMORY_QUEUE'])

        # The SCHEDULER_DEBUG value is passed as the default for
        # LOG_UNSERIALIZABLE_REQUESTS.
        debug_flag = conf.getbool('SCHEDULER_DEBUG')
        log_unserializable = conf.getbool('LOG_UNSERIALIZABLE_REQUESTS', debug_flag)

        job_directory = job_dir(conf)
        return cls(
            request_filter,
            jobdir=job_directory,
            logunser=log_unserializable,
            stats=crawler.stats,
            pqclass=priority_queue_cls,
            dqclass=disk_queue_cls,
            mqclass=memory_queue_cls,
        )