Modifying Scrapy's deduplication (dupefilter) strategy
1. First, create a custom duplication.py file. The numbers in the docstrings below mark the order in which Scrapy invokes each method:
class RepeatFilter(object):

    def __init__(self):
        """
        2. Initialize the object.
        """
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        """
        1. Create the instance (called first, with the project settings).
        :param settings:
        :return:
        """
        print('......')
        return cls()

    def request_seen(self, request):
        """
        4. Check whether this request has already been seen.
        :param request:
        :return: True if the URL was visited before (drop the request), False otherwise.
        """
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):  # can return a deferred
        """
        3. Called when crawling starts.
        :return:
        """
        print('open')

    def close(self, reason):  # can return a deferred
        """
        5. Called when crawling stops.
        :param reason:
        :return:
        """
        print('close')

    def log(self, request, spider):
        # log that a request has been filtered
        pass
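Note that the filter above deduplicates on the raw URL string, so http://example.com/?a=1&b=2 and http://example.com/?b=2&a=1 count as two different pages. A minimal sketch of a variant that hashes requests with Scrapy's own fingerprint helper instead (assuming an older Scrapy version where scrapy.utils.request.request_fingerprint is still available; newer releases replace it with a REQUEST_FINGERPRINTER_CLASS component):

from scrapy.utils.request import request_fingerprint

class FingerprintRepeatFilter(RepeatFilter):
    """Hypothetical variant: dedup on the request fingerprint, not the raw URL."""

    def request_seen(self, request):
        # request_fingerprint normalizes the URL (e.g. query-parameter order)
        # and folds in the method and body, so equivalent requests hash identically.
        fp = request_fingerprint(request)
        if fp in self.visited_set:
            return True
        self.visited_set.add(fp)
        return False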
2. In settings.py, register the filter by adding the line below (day96 is the project package in this example; adjust the path to match your own project). This replaces Scrapy's default scrapy.dupefilters.RFPDupeFilter:
DUPEFILTER_CLASS = 'day96.duplication.RepeatFilter'
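To confirm the filter is active, yield the same URL twice from a spider: the second request should be dropped by request_seen. A minimal sketch (the spider name and URL are placeholders):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'                        # hypothetical spider name
    start_urls = ['http://example.com']  # placeholder URL

    def parse(self, response):
        # This duplicate request is dropped by RepeatFilter.request_seen().
        yield scrapy.Request(response.url, callback=self.parse)
        # Passing dont_filter=True bypasses the dupefilter entirely.
        yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)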