# 重写 dupefilter 以解决 scrapy-redis 和 scrapy-splash 的冲突问题

from __future__ import absolute_import

from copy import deepcopy

from scrapy.utils.request import request_fingerprint
from scrapy.utils.url import canonicalize_url

from scrapy_splash.utils import dict_hash

from scrapy_redis.dupefilter import RFPDupeFilter


def splash_request_fingerprint(request, include_headers=None):
    """Return a request fingerprint that also accounts for Splash options.

    Computes the regular Scrapy fingerprint first; if the request carries a
    ``'splash'`` meta key, the Splash options are folded into the hash as
    well (with the Splash target URL canonicalized) so that two requests
    rendering the same page with the same options dedupe against each other.
    """
    base_fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        # Plain (non-Splash) request: the standard fingerprint suffices.
        return base_fp

    # Work on a copy so the request's own meta is never mutated.
    options = deepcopy(request.meta['splash'])
    splash_args = options.setdefault('args', {})

    # Canonicalize the rendered URL (keeping fragments, which matter to
    # Splash) so equivalent URLs hash identically.
    if 'url' in splash_args:
        splash_args['url'] = canonicalize_url(
            splash_args['url'], keep_fragments=True)

    return dict_hash(options, base_fp)


class SplashAwareDupeFilter(RFPDupeFilter):
    """Redis-backed dupefilter whose fingerprints are Splash-aware.

    Overrides scrapy-redis's fingerprinting so that requests routed through
    Splash are deduplicated by their Splash options as well, resolving the
    DUPEFILTER_CLASS conflict between scrapy-redis and scrapy-splash.
    """

    def request_fingerprint(self, request):
        """Delegate fingerprinting to the Splash-aware helper."""
        return splash_request_fingerprint(request)

# 使用方法：在项目目录下新建一个 .py 文件（例如 dupefilter.py），将上面的代码复制进去，
# 然后在 settings.py 中添加 DUPEFILTER_CLASS = '项目名.dupefilter.SplashAwareDupeFilter'
# （注意这里是模块的点号路径，不要带 .py 后缀），并删除 scrapy-redis 和 scrapy-splash
# 各自设置的 DUPEFILTER_CLASS。其他设置并不冲突，可以不改。

 

 

posted on 2021-07-23 18:56  CJTARRR  阅读(287)  评论(0编辑  收藏  举报