Installation

pip3 install scrapy-redis

 

Goal: help developers build distributed crawler programs on top of Scrapy.

class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.fingerprints = set()

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # Turn the request object into a unique fingerprint string.
        fp = self.request_fingerprint(request)
        # If the fingerprint is already in the set, return True: the request has been seen before.
        if fp in self.fingerprints:
            return True
        # Not seen before: record the fingerprint.
        self.fingerprints.add(fp)

    def request_fingerprint(self, request):
        return request_fingerprint(request)
How the dedup rule is implemented in Scrapy
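To make the fingerprinting above concrete, here is a minimal sketch (assuming a Scrapy version in which scrapy.utils.request.request_fingerprint is still available, as in the code quoted here): the fingerprint is a SHA1 over the request method, the canonicalized URL and the body, so query-parameter order does not matter.

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Two URLs that differ only in query-parameter order...
r1 = Request('https://dig.chouti.com/r/ask/hot/12?a=1&b=2')
r2 = Request('https://dig.chouti.com/r/ask/hot/12?b=2&a=1')

# ...canonicalize to the same URL and therefore share one fingerprint.
print(request_fingerprint(r1) == request_fingerprint(r2))  # True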
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        # self.server = the Redis connection
        self.server = server
        # self.key = dupefilter:<timestamp>, e.g. dupefilter:123912873234
        self.key = key

    @classmethod
    def from_settings(cls, settings):
        # Read the settings and connect to Redis.
        server = get_redis_from_settings(settings)

        # key = dupefilter:<timestamp>, e.g. dupefilter:123912873234
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        # self.server is the Redis connection.
        # Add the fingerprint to the Redis set: 1 = newly added, 0 = already present.
        added = self.server.sadd(self.key, fp)
        return added == 0

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason=''):
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)
How the dedup rule is implemented in scrapy-redis
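The whole dedup check therefore boils down to one SADD round-trip. A minimal sketch of that semantics with redis-py (assumes a Redis server on localhost; the key and fingerprint are illustrative):

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

key = 'dupefilter:demo'  # illustrative key, not the real timestamped one
fp = 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'  # an illustrative request fingerprint

print(conn.sadd(key, fp))  # 1 -> first time seen, the request will be scheduled
print(conn.sadd(key, fp))  # 0 -> already seen, request_seen() returns True
conn.delete(key)  # clean up the demo key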
# Scrapy keeps pending requests either in an in-memory queue (e.g. self.q = deque())
# or in an on-disk queue backed by file operations.

SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'


class Scheduler(object):
    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])

        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        self.spider = spider
        self.mqs = self.pqclass(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        # request.dont_filter=False:
        #   self.df.request_seen(request) decides:
        #     - True: already visited, drop the request
        #     - False: not visited yet, schedule it
        # request.dont_filter=True: skip the dedup check and enqueue everything.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        # Past this point the request is accepted and pushed into a queue.
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                msg = ("Unable to serialize request: %(request)s - reason:"
                       " %(reason)s - no more unserializable requests will be"
                       " logged (stats being collected)")
                logger.warning(msg, {'request': request, 'reason': e},
                               exc_info=True, extra={'spider': self.spider})
                self.logunser = False
            self.stats.inc_value('scheduler/unserializable',
                                 spider=self.spider)
            return
        else:
            return True

    def _mqpush(self, request):
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = self.pqclass(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
How the scheduler is implemented in Scrapy
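One detail worth calling out in _mqpush/_dqpush above is the negated priority: queuelib's PriorityQueue pops the numerically smallest priority first, so pushing with -request.priority makes requests with higher Scrapy priority come out first. A minimal standalone sketch (assuming queuelib, which Scrapy depends on, is installed; the names here are illustrative):

from queuelib.pqueue import PriorityQueue
from queuelib.queue import FifoMemoryQueue


def qfactory(priority):
    # One in-memory FIFO bucket per priority value, mirroring Scheduler._newmq.
    return FifoMemoryQueue()


pq = PriorityQueue(qfactory)
pq.push(b'normal request', 0)       # Scrapy priority 0  -> pushed with priority 0
pq.push(b'important request', -10)  # Scrapy priority 10 -> pushed with priority -10

print(pq.pop())  # b'important request' pops first (smallest number wins)
print(pq.pop())  # b'normal request'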
# Requests are pickled and stored in Redis, in a list or a sorted set.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'


class Scheduler(object):
    def __init__(self, server,
                 persist=False,
                 flush_on_start=False,
                 queue_key=defaults.SCHEDULER_QUEUE_KEY,
                 queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
                 dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
                 dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
                 idle_before_close=0,
                 serializer=None):

        if idle_before_close < 0:
            raise TypeError("idle_before_close cannot be negative")

        self.server = server
        self.persist = persist
        self.flush_on_start = flush_on_start
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_cls = dupefilter_cls
        self.dupefilter_key = dupefilter_key
        self.idle_before_close = idle_before_close
        self.serializer = serializer
        self.stats = None

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        kwargs = {
            'persist': settings.getbool('SCHEDULER_PERSIST'),
            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
            'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for this settings to note that are
            # specific to scrapy-redis.
            'queue_key': 'SCHEDULER_QUEUE_KEY',
            'queue_cls': 'SCHEDULER_QUEUE_CLASS',
            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
            # We use the default setting name to keep compatibility.
            'dupefilter_cls': 'DUPEFILTER_CLASS',
            'serializer': 'SCHEDULER_SERIALIZER',
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # Support serializer as a path to a module.
        if isinstance(kwargs.get('serializer'), six.string_types):
            kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        return cls(server=server, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        # FIXME: for now, stats are only supported from this constructor
        instance.stats = crawler.stats
        return instance

    def open(self, spider):
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {'spider': spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)

        try:
            self.df = load_object(self.dupefilter_cls)(
                server=self.server,
                key=self.dupefilter_key % {'spider': spider.name},
                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                             self.dupefilter_cls, e)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.flush()

    def flush(self):
        self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        return True

    def next_request(self):
        block_pop_timeout = self.idle_before_close
        request = self.queue.pop(block_pop_timeout)
        if request and self.stats:
            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
        return request

    def has_pending_requests(self):
        return len(self) > 0
How the scheduler is implemented in scrapy-redis
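With the default key templates, a spider named chouti (the example used later in this post) keeps its pending requests under chouti:requests and its fingerprints under chouti:dupefilter. A minimal sketch for peeking at those keys with redis-py while a crawl is running (assumes Redis on localhost and the default key settings):

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# SCHEDULER_QUEUE_KEY and SCHEDULER_DUPEFILTER_KEY interpolate the spider name.
spider_name = 'chouti'
queue_key = '%(spider)s:requests' % {'spider': spider_name}
dupe_key = '%(spider)s:dupefilter' % {'spider': spider_name}

print(conn.type(queue_key))  # b'list' for FifoQueue/LifoQueue, b'zset' for PriorityQueue
print(conn.scard(dupe_key))  # number of request fingerprints recorded so far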
class Base(object):
    """Per-spider base queue class"""

    def __init__(self, server, spider, key, serializer=None):
        """Initialize per-spider redis queue.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        spider : Spider
            Scrapy spider instance.
        key: str
            Redis key where to put and get messages.
        serializer : object
            Serializer object with ``loads`` and ``dumps`` methods.

        """
        if serializer is None:
            # Backward compatibility.
            # TODO: deprecate pickle.
            serializer = picklecompat
        if not hasattr(serializer, 'loads'):
            raise TypeError("serializer does not implement 'loads' function: %r"
                            % serializer)
        if not hasattr(serializer, 'dumps'):
            raise TypeError("serializer does not implement 'dumps' function: %r"
                            % serializer)

        self.server = server
        self.spider = spider
        self.key = key % {'spider': spider.name}
        self.serializer = serializer

    def _encode_request(self, request):
        """Encode a request object"""
        obj = request_to_dict(request, self.spider)
        return self.serializer.dumps(obj)

    def _decode_request(self, encoded_request):
        """Decode a request previously encoded"""
        obj = self.serializer.loads(encoded_request)
        return request_from_dict(obj, self.spider)

    def __len__(self):
        """Return the length of the queue"""
        raise NotImplementedError

    def push(self, request):
        """Push a request"""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request"""
        raise NotImplementedError

    def clear(self):
        """Clear queue/stack"""
        self.server.delete(self.key)


class FifoQueue(Base):
    """Per-spider FIFO queue"""

    def __len__(self):
        """Return the length of the queue"""
        return self.server.llen(self.key)

    def push(self, request):
        """Push a request"""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop a request"""
        if timeout > 0:
            data = self.server.brpop(self.key, timeout)
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.rpop(self.key)
        if data:
            return self._decode_request(data)


class PriorityQueue(Base):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __len__(self):
        """Return the length of the queue"""
        return self.server.zcard(self.key)

    def push(self, request):
        """Push a request"""
        data = self._encode_request(request)
        score = -request.priority
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', self.key, score, data)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout is not supported in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        results, count = pipe.execute()
        if results:
            return self._decode_request(results[0])


class LifoQueue(Base):
    """Per-spider LIFO queue."""

    def __len__(self):
        """Return the length of the stack"""
        return self.server.llen(self.key)

    def push(self, request):
        """Push a request"""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop a request"""
        if timeout > 0:
            data = self.server.blpop(self.key, timeout)
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.lpop(self.key)

        if data:
            return self._decode_request(data)


# TODO: Deprecate the use of these names.
SpiderQueue = FifoQueue
SpiderStack = LifoQueue
SpiderPriorityQueue = PriorityQueue
# Depth and priority during a crawl are handled by Scrapy's spider middleware (DepthMiddleware).
Related source code
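The PriorityQueue.pop above avoids a race between reading and removing the head of the sorted set by wrapping ZRANGE and ZREMRANGEBYRANK in a single MULTI/EXEC transaction. A standalone sketch of that pattern (assumes Redis on localhost and redis-py 3.x; the key and members are illustrative):

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
key = 'demo:requests'  # illustrative sorted-set key

# Lower score = higher Scrapy priority, because the scheduler stores score = -priority.
conn.zadd(key, {'normal request': 0, 'important request': -10})  # redis-py 3.x signature

# Atomically read and remove the head element (lowest score) in one transaction.
pipe = conn.pipeline()
pipe.multi()
pipe.zrange(key, 0, 0).zremrangebyrank(key, 0, 0)
results, count = pipe.execute()

print(results)  # [b'important request']
conn.delete(key)  # clean up the demo key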

 

Using the scrapy-redis components

# ############ Redis connection settings #################
REDIS_HOST = '127.0.0.1'  # Redis host
REDIS_PORT = 6379  # Redis port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}  # Redis connection kwargs. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

# Redis-based dedup rule
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
1: Use only the dedup rule
# ############ Redis connection settings #################
REDIS_HOST = '127.0.0.1'  # Redis host
REDIS_PORT = 6379  # Redis port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}  # Redis connection kwargs. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

# Custom scheduler, driven by the engine
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'  # Default is PriorityQueue (sorted set, breadth-first by default); alternatives: FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # Serializer for requests stored in Redis. Default: pickle
SCHEDULER_PERSIST = True  # Keep the request queue and dedup records on close. True = keep, False = flush
SCHEDULER_FLUSH_ON_START = False  # Flush the request queue and dedup records on start. True = flush, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10  # Max seconds to block when popping from an empty queue (returns nothing if still empty)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key for the dedup records, e.g. chouti:dupefilter
SCHEDULER_DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'  # Dedup filter class used by the scheduler
DUPEFILTER_DEBUG = False
2: Use only its scheduler
# ############ Redis connection settings #################
REDIS_HOST = '127.0.0.1'  # Redis host
REDIS_PORT = 6379  # Redis port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}  # Redis connection kwargs. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

# Custom scheduler, driven by the engine
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'  # Default is PriorityQueue (sorted set, breadth-first by default); alternatives: FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # Serializer for requests stored in Redis. Default: pickle
SCHEDULER_PERSIST = True  # Keep the request queue and dedup records on close. True = keep, False = flush
SCHEDULER_FLUSH_ON_START = False  # Flush the request queue and dedup records on start. True = flush, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10  # Max seconds to block when popping from an empty queue (returns nothing if still empty)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key for the dedup records, e.g. chouti:dupefilter
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # Dedup filter class used by the scheduler
DUPEFILTER_DEBUG = False
3: Dedup + scheduler
# ############ Redis connection settings #################
REDIS_HOST = '127.0.0.1'  # Redis host
REDIS_PORT = 6379  # Redis port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}  # Redis connection kwargs. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
4: Use the built-in scrapy-redis pipeline for persistence: it stores each item in a Redis list
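By default, RedisPipeline serializes each item with Scrapy's JSON encoder and pushes it onto the list '%(spider)s:items'. A minimal sketch for reading the stored items back (assumes Redis on localhost, a spider named chouti and the default items key):

import json

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# Default key template used by scrapy_redis.pipelines.RedisPipeline: '%(spider)s:items'
items_key = 'chouti:items'

for raw in conn.lrange(items_key, 0, 9):  # first 10 serialized items
    print(json.loads(raw))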
REDIS_HOST = '127.0.0.1'  # Redis host
REDIS_PORT = 6379  # Redis port
# REDIS_URL = 'redis://user:pass@hostname:9001'       # Connection URL (takes precedence over the settings above)
REDIS_PARAMS = {}  # Redis connection kwargs. Default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Redis client class to use. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Custom scheduler, driven by the engine
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'  # Default is PriorityQueue (sorted set, breadth-first by default); alternatives: FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # Serializer for requests stored in Redis. Default: pickle
SCHEDULER_PERSIST = True  # Keep the request queue and dedup records on close. True = keep, False = flush
SCHEDULER_FLUSH_ON_START = False  # Flush the request queue and dedup records on start. True = flush, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10  # Max seconds to block when popping from an empty queue (returns nothing if still empty)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key for the dedup records, e.g. chouti:dupefilter
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # Dedup filter class used by the scheduler
DUPEFILTER_DEBUG = False
5: Configuration enabling all of the features above
# Settings:
REDIS_START_URLS_BATCH_SIZE = 1
# REDIS_START_URLS_AS_SET = True # Read start URLs from a Redis set
REDIS_START_URLS_AS_SET = False  # Read start URLs from a Redis list

# Spider:
from scrapy_redis.spiders import RedisSpider
from scrapy.http import Request
from ..items import WenwenItem


class ChoutiSpider(RedisSpider):
    name = 'chouti'
    allowed_domains = ['chouti.com']

    def parse(self, response):
        # As depth increases, the priority keeps decreasing.
        print(response)


# Seed the start URL:
import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# Key for the start URLs: chouti:start_urls
conn.lpush("chouti:start_urls", 'https://dig.chouti.com/r/ask/hot/12')
Make scrapy-redis fetch its start URLs from Redis instead of generating them via start_requests
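If REDIS_START_URLS_AS_SET were set to True instead, the spider would pop start URLs from a Redis set, so the seeding side would use SADD rather than LPUSH. A minimal sketch (same host and key as above):

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)

# With REDIS_START_URLS_AS_SET = True, seed the start URLs into a set instead of a list.
conn.sadd("chouti:start_urls", 'https://dig.chouti.com/r/ask/hot/12')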
PS: Depth and priority are related:
  DEPTH_PRIORITY = 1
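The interaction works roughly like this: Scrapy's DepthMiddleware lowers a request's priority by depth * DEPTH_PRIORITY, and the Redis PriorityQueue stores score = -priority, so with DEPTH_PRIORITY = 1 shallower requests get smaller scores and are popped first, which makes the crawl breadth-first. A small sketch of the arithmetic (illustrative values only, assuming a base priority of 0):

# Assumed behaviour of scrapy.spidermiddlewares.depth.DepthMiddleware:
# request.priority -= depth * DEPTH_PRIORITY
DEPTH_PRIORITY = 1


def sorted_set_score(depth, base_priority=0):
    """Score of a request in the Redis sorted set (score = -priority)."""
    priority = base_priority - depth * DEPTH_PRIORITY
    return -priority


for depth in (0, 1, 2, 3):
    print(depth, sorted_set_score(depth))
# Prints 0 0, 1 1, 2 2, 3 3: shallower requests get lower scores and are popped first.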