scrapy-redis
1.scrapy-redis(queue源码)
- 在scrapy-redis中有三种方法来存放Request对象
- # 用下面的配置项来指定使用哪种方式存放 Request,源码路径:scrapy_redis/queue.py
在 settings.py 中写上 SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' # 默认使用优先级队列 PriorityQueue(有序集合);其他可选:FifoQueue(列表)、LifoQueue(列表)
class FifoQueue(Base):
    """Per-spider FIFO queue (first in, first out) stored in a Redis list.

    Requests are pushed onto the left end of the list and popped from the
    right end, so the oldest request is always returned first.
    """

    def __len__(self):
        """Return the length of the queue."""
        return self.server.llen(self.key)

    def push(self, request):
        """Encode *request* and push it onto the left end of the list."""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop the oldest request from the right end of the list.

        Args:
            timeout: When greater than 0, a blocking pop (``brpop``) is
                issued with this timeout; otherwise a non-blocking
                ``rpop`` is used.

        Returns:
            The decoded request, or ``None`` when nothing was popped.
        """
        if timeout > 0:
            data = self.server.brpop(self.key, timeout)
            # The blocking pop answers with a tuple whose second item is the
            # payload; keep only the payload.
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.rpop(self.key)
        if data:
            return self._decode_request(data)
        return None
class PriorityQueue(Base):
    """Per-spider priority queue abstraction using redis' sorted set.

    This is the queue class the notes above configure as the default.
    Requests are scored with the negated priority, so the member at rank 0
    is the request with the highest priority.
    """

    def __len__(self):
        """Return the length of the queue."""
        return self.server.zcard(self.key)

    def push(self, request):
        """Encode *request* and add it to the sorted set."""
        data = self._encode_request(request)
        # Negate the priority: rank 0 (lowest score) is what pop() takes.
        score = -request.priority
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', self.key, score, data)

    def pop(self, timeout=0):
        """Pop the request at rank 0 of the sorted set.

        Args:
            timeout: Accepted for interface parity with the other queue
                classes; it is not supported here and is ignored.

        Returns:
            The decoded request, or ``None`` when the set is empty.
        """
        # Use atomic range/remove using multi/exec.
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        # execute() yields one reply per queued command: the members read by
        # zrange and the number of members removed (unused here).
        results, _removed = pipe.execute()
        if results:
            return self._decode_request(results[0])
        return None
class LifoQueue(Base):
    """Per-spider LIFO queue (a stack) stored in a Redis list.

    Requests are pushed onto and popped from the same (left) end of the
    list, so the most recently pushed request is returned first.
    """

    def __len__(self):
        """Return the length of the stack."""
        return self.server.llen(self.key)

    def push(self, request):
        """Encode *request* and push it onto the left end of the list."""
        self.server.lpush(self.key, self._encode_request(request))

    def pop(self, timeout=0):
        """Pop the most recently pushed request from the left end of the list.

        Args:
            timeout: When greater than 0, a blocking pop (``blpop``) is
                issued with this timeout; otherwise a non-blocking
                ``lpop`` is used.

        Returns:
            The decoded request, or ``None`` when nothing was popped.
        """
        if timeout > 0:
            data = self.server.blpop(self.key, timeout)
            # The blocking pop answers with a tuple whose second item is the
            # payload; keep only the payload.
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.lpop(self.key)
        if data:
            return self._decode_request(data)
        return None