Scrapy request deduplication
Custom deduplication:
- Write a custom dedup filter class.
- Point to it in the settings file (replacing the default scrapy.dupefilter.RFPDupeFilter), e.g. DUPEFILTER_CLASS = 'sp2.rep.RepeatUrl'
By default, Scrapy uses scrapy.dupefilter.RFPDupeFilter for deduplication. The relevant settings are:
from scrapy.dupefilter import RFPDupeFilter
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory for the log of seen requests, e.g. /root/"  # final path will be /root/requests.seen
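Note that RFPDupeFilter does not compare raw URL strings; it reduces each request to a canonical fingerprint, so URLs that differ only in query-string order count as the same request. A minimal sketch of that behavior, assuming an older Scrapy release matching the scrapy.dupefilter import path above:

# Two requests whose query parameters differ only in order produce the
# same fingerprint, so the second one would be filtered as a duplicate.
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

fp1 = request_fingerprint(Request('http://example.com/?a=1&b=2'))
fp2 = request_fingerprint(Request('http://example.com/?b=2&a=1'))
assert fp1 == fp2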
#DUPEFILTER_CLASS = 'sp2.rep.RepeatUrl'
# Custom URL deduplication (goes in sp2/rep.py, per DUPEFILTER_CLASS above)
class RepeatUrl:
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called when the filter is initialized.
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been visited.
        :param request:
        :return: True if already visited; False otherwise
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when the crawl starts.
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when the spider finishes crawling.
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log a duplicate request.
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
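Because request_seen above compares raw URL strings, http://example.com/?a=1&b=2 and http://example.com/?b=2&a=1 are treated as two different pages. A hedged variant that dedupes on Scrapy's fingerprint instead (FingerprintRepeatUrl is a hypothetical name, not part of the original project; same older-Scrapy assumption as before):

from scrapy.utils.request import request_fingerprint

class FingerprintRepeatUrl(RepeatUrl):
    def request_seen(self, request):
        fp = request_fingerprint(request)  # canonical hash of the request
        if fp in self.visited_url:
            return True                    # duplicate: the scheduler drops it
        self.visited_url.add(fp)
        return False

Either class is enabled the same way: point DUPEFILTER_CLASS at it in the settings file.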