scrapy-redis source code analysis
# scrapy-redis ---> queue.py --> class FifoQueue (first-in-first-out queue) and LifoQueue (last-in-first-out, i.e. a stack)
# self.server is the redis connection held by the parent class Base ---> LifoQueue._encode_request ---> Base._encode_request --> serializer ---> picklecompat
# picklecompat --> /usr/local/lib/python3.6/site-packages/scrapy_redis/picklecompat.py
# def loads(s):
#     return pickle.loads(s)
#
# def dumps(obj):
#     return pickle.dumps(obj, protocol=-1)
# i.e. the serializer is simply pickle

# Where server comes from
# /usr/local/lib/python3.6/site-packages/scrapy_redis/connection.py
# def get_redis ---> defaults.REDIS_CLS --> /usr/local/lib/python3.6/site-packages/scrapy_redis/defaults.py
# REDIS_CLS is redis.StrictRedis (import redis)
# redis.Redis ---> /usr/local/lib/python3.6/site-packages/redis/client.py ---> Redis inherits from StrictRedis
# def get_redis
#     if url:
#         return redis_cls.from_url(url, **kwargs)
#     else:
#         return redis_cls(**kwargs)
# With a url, instantiate from the url; without one, just call the class directly --> either way we get server

# The scheduler
# /usr/local/lib/python3.6/site-packages/scrapy_redis/scheduler.py
# def enqueue_request  ---> self.queue.push(request)                      puts a request in
# def next_request     ---> request = self.queue.pop(block_pop_timeout)   takes a request out
# def enqueue_request  ---> self.df.request_seen(request)                 checks whether the request has been seen before
# /usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py  def request_seen
# self.server.sadd adds the fingerprint to a set; the return value decides: 0 means nothing was added because it was already in the set (seen before), 1 means it had not been seen and was added just now
# /usr/local/lib/python3.6/site-packages/scrapy_redis/queue.py  LifoQueue pop/push

# What each module does
# dupefilter.py, pipelines.py, scheduler.py    the work done inside the components
# defaults.py                                  default values
# picklecompat.py                              what to serialize with
# scheduler.py                                 calls queue.py
# spiders.py                                   the spider; def start_requests --> self.next_requests
#     use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
#     fetch_one = self.server.spop if use_set else self.server.lpop
#     i.e. it fetches the start URLs from redis, so when using these spiders the start URLs must be stored in redis beforehand; without them we add the start URLs by hand
# utils.py                                     shared helpers
# Connecting to redis: if a url is configured, the url takes priority; otherwise host/port are used

# Put the following into settings.py (see http://www.cnblogs.com/wupeiqi/articles/6912807.html)
# REDIS_HOST = 'localhost'                          # host name
# REDIS_PORT = 6379                                 # port
# REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes priority over the settings above)
# REDIS_PARAMS = {}                                 # redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used for the connection; default: redis.StrictRedis
# REDIS_ENCODING = "utf-8"                          # redis encoding; default: 'utf-8'

# Scheduler settings
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # PriorityQueue is the default; options: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # key under which the scheduler stores requests in redis
# v = '%(spider)s:requests'
# val = v % {'spider': 'hahah'}
# val  # 'hahah:requests'
# Every spider has its own scrapy-redis queue, i.e. its own key in redis
# renjian:requests: {}/[]  -- whether it is a sorted set {} or a list [] depends on the choice of PriorityQueue (sorted set) vs FifoQueue/LifoQueue (list)
# This is where the pending tasks live:
# renjian:requests: ['http://www.baidu.com', ]
# jianren:requests: ['http://www.chouti.com', ]
# name = 'renjian' in /Users/shuanggai/PycharmProjects/git/python/D20171113/scrapy-redis/day115/dabo/dabo/spiders/renjian.py is what gets formatted into the key
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for the data stored in redis; pickle by default
# SCHEDULER_PERSIST = True            # whether to keep the scheduler queue and dedup records on close; True = keep, False = flush. Flush while testing, keep in production
# SCHEDULER_FLUSH_ON_START = True     # whether to flush the scheduler queue and dedup records on start; True = flush, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10    # maximum time to block when fetching from the scheduler and the queue is empty (if there is still nothing at the end, no request is returned)
# /usr/local/lib/python3.6/site-packages/scrapy_redis/scheduler.py --> def next_request --> self.idle_before_close (idle_before_close=0 by default)
# /usr/local/lib/python3.6/site-packages/scrapy_redis/queue.py --> def pop --> data = self.server.blpop(self.key, timeout)
# blpop returns a tuple, and this is where the timeout takes effect
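
# A minimal sketch of the connection, key formatting, and sadd-based dedup described
# above, assuming a local Redis on the default port; the 'renjian' name and the
# fingerprint string are placeholders for illustration only.
import redis

# With REDIS_URL configured, get_redis() calls redis_cls.from_url(url);
# otherwise it instantiates redis_cls(**REDIS_PARAMS) directly.
server = redis.StrictRedis.from_url('redis://localhost:6379/0')

# The scheduler %-formats the spider name into SCHEDULER_QUEUE_KEY /
# SCHEDULER_DUPEFILTER_KEY to build per-spider keys.
key = '%(spider)s:dupefilter' % {'spider': 'renjian'}

# request_seen() relies on the sadd return value:
print(server.sadd(key, 'fingerprint-abc'))  # 1 -> not seen before, added just now
print(server.sadd(key, 'fingerprint-abc'))  # 0 -> already in the set, duplicate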
# SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # key under which the dedup records are stored in redis
# SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
# The visit records are stored as a set
# /usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py
# def request_seen ---> self.server.sadd
"""
renjian:dupefilter: {}
jianren:dupefilter: {}
"""
# /usr/local/lib/python3.6/site-packages/scrapy_redis/dupefilter.py --> def request_seen ---> fp = self.request_fingerprint(request)
# /usr/local/lib/python3.6/site-packages/scrapy/utils/request.py --> def request_fingerprint(request, include_headers=None):
from scrapy.utils.request import request_fingerprint
from scrapy.http import Request

r1 = Request(url='http://www.baidu.com?id=1&page=2', headers={'k1': 'v1'})
r1_str = request_fingerprint(r1, include_headers=['k1'])  # turn the request object into a fingerprint string
print(r1_str)  # 75d6587d87b3f4f3aa574b33dbd69ceeb9eafe7b
r2 = Request(url='http://www.baidu.com?page=2&id=1', headers={'k1': 'v2'})
r2_str = request_fingerprint(r2, include_headers=['k1'])
print(r2_str)
# By default the headers are not part of the fingerprint and the order of the query parameters does not matter,
# but adding or removing a parameter does change the value -- unless include_headers is passed

# Important
# Use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use scrapy_redis for the dedup records
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Item persistence
# from scrapy_redis.pipelines import RedisPipeline
# ITEM_PIPELINES = {
#     'dabo.pipelines.DaboPipeline': 300,
#     'dabo.pipelines.XiaoboPipeline': 400,
# }
from scrapy_redis.pipelines import RedisPipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
REDIS_ITEMS_KEY = '%(spider)s:items'
REDIS_ITEMS_SERIALIZER = 'json.dumps'

# Start URLs
REDIS_START_URLS_AS_SET = False  # True means a set, False means a list
REDIS_START_URLS_KEY = '%(name)s:start_urls'
# /usr/local/lib/python3.6/site-packages/scrapy_redis/spiders.py fetches the start URLs, so they must be added to redis first
# def start_requests --> self.next_requests
#     use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
#     fetch_one = self.server.spop if use_set else self.server.lpop
# conn.lpush('renjian:start_urls', 'http://www.chouti.com')
# /usr/local/lib/python3.6/site-packages/scrapy_redis/pipelines.py ---> class RedisPipeline --> def process_item --> deferToThread(self._process_item, item, spider)
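
# A minimal sketch, assuming a local Redis on the default port, of seeding the start
# URLs that the RedisSpider pops via lpop (or spop when REDIS_START_URLS_AS_SET is True).
# The 'renjian' spider name is the example name used throughout these notes.
import redis

conn = redis.StrictRedis(host='localhost', port=6379)

# REDIS_START_URLS_KEY = '%(name)s:start_urls', so a spider named 'renjian'
# reads from the 'renjian:start_urls' list.
conn.lpush('renjian:start_urls', 'http://www.chouti.com')

# With RedisPipeline enabled, items are serialized (json.dumps here) and stored
# under REDIS_ITEMS_KEY ('%(spider)s:items'); they can be drained from that list too.
print(conn.lpop('renjian:items'))  # None until the spider has scraped something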
# Source excerpts: queue.py (Base, LifoQueue) and connection.py (get_redis)

# class Base(object):
#     """Per-spider base queue class"""
#
#     def __init__(self, server, spider, key, serializer=None):
#         """Initialize per-spider redis queue.
#
#         Parameters
#         ----------
#         server : StrictRedis
#             Redis client instance.
#         spider : Spider
#             Scrapy spider instance.
#         key: str
#             Redis key where to put and get messages.
#         serializer : object
#             Serializer object with ``loads`` and ``dumps`` methods.
#
#         """
#         if serializer is None:
#             # Backward compatibility.
#             # TODO: deprecate pickle.
#             serializer = picklecompat
#         if not hasattr(serializer, 'loads'):
#             raise TypeError("serializer does not implement 'loads' function: %r"
#                             % serializer)
#         if not hasattr(serializer, 'dumps'):
#             raise TypeError("serializer '%s' does not implement 'dumps' function: %r"
#                             % serializer)
#
#         self.server = server
#         self.spider = spider
#         self.key = key % {'spider': spider.name}
#         self.serializer = serializer
#
#     def _encode_request(self, request):
#         """Encode a request object"""
#         obj = request_to_dict(request, self.spider)
#         return self.serializer.dumps(obj)

# class LifoQueue(Base):
#     """Per-spider LIFO queue."""
#
#     def __len__(self):
#         """Return the length of the stack"""
#         return self.server.llen(self.key)
#
#     def push(self, request):
#         """Push a request"""
#         self.server.lpush(self.key, self._encode_request(request))
#
#     def pop(self, timeout=0):
#         """Pop a request"""
#         if timeout > 0:
#             data = self.server.blpop(self.key, timeout)
#             if isinstance(data, tuple):
#                 data = data[1]
#         else:
#             data = self.server.lpop(self.key)
#
#         if data:
#             return self._decode_request(data)

# def get_redis(**kwargs):
#     """Returns a redis client instance.
#
#     Parameters
#     ----------
#     redis_cls : class, optional
#         Defaults to ``redis.StrictRedis``.
#     url : str, optional
#         If given, ``redis_cls.from_url`` is used to instantiate the class.
#     **kwargs
#         Extra parameters to be passed to the ``redis_cls`` class.
#
#     Returns
#     -------
#     server
#         Redis client instance.
#
#     """
#     redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
#     url = kwargs.pop('url', None)
#     if url:
#         return redis_cls.from_url(url, **kwargs)
#     else:
#         return redis_cls(**kwargs)
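
# A minimal usage sketch of the LifoQueue quoted above, assuming scrapy, scrapy_redis
# and a local Redis on the default port are available. DummySpider and the URLs are
# placeholders for illustration, not part of scrapy_redis.
import redis
from scrapy import Spider, Request
from scrapy_redis.queue import LifoQueue


class DummySpider(Spider):
    name = 'renjian'


server = redis.StrictRedis(host='localhost', port=6379)
# Base.__init__ formats the spider name into the key: 'renjian:requests'
queue = LifoQueue(server, DummySpider(), key='%(spider)s:requests')

queue.push(Request(url='http://www.baidu.com'))   # lpush of the pickled request dict
queue.push(Request(url='http://www.chouti.com'))
print(len(queue))        # 2 -> llen('renjian:requests')
print(queue.pop().url)   # LIFO: the chouti request comes back first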