'''
主要起到链接redis数据库的功能
connection.py
class RFPDupeFilter(BaseDupeFilter)
常量值
defaults.py
主要用来对request请求去重的一个类
dupefilter.py
def request_seen(self, request):
# Returns True if request was already seen.
# Parameters
# _________
# request : scrapy.http.Request
# Returns
# ___________
# bool
# 核心方法, 获取指纹
fp = self.request_fingerprint(request)
# This returns the number of values added, zero if already exists.
added = self.server.sadd(self.key,fp)
return added == 0
存在该指纹,就证明这个请求已经在redis队列中了,我们就丢弃它
sadd: 向redis集合中添加元素, 返回新增元素的个数; 返回0说明该指纹已存在
提供类似json的序列化方法(loads/dumps), 基于pickle实现
picklecompat.py
把结果集中存到redis中
pipelines.py
def process_item(self,item,spider):
return deferToThread(self._process_item,item,spider)
def _process_item(self,item,spider):
key = self.item_key(item,spider)
data = self.serialize(item)
self.server.rpush(key,data)
return item
一共维护了3个队列
queue.py
SpiderQueue = FifoQueue (先进先出)
SpiderStack = LifoQueue (后进先出)
SpiderPriorityQueue = PriorityQueue (优先级队列)
比较核心的功能
scheduler.py
'''