05| scrapy redis
scrapy-redis
分布式爬虫的组件。
- 基于scrapy-redis的去重规则
- redis的集合
- 完全自定义
from scrapy.dupefilter import BaseDupeFilter
import redis
from scrapy.utils.request import request_fingerprint
import scrapy_redis
class DupFilter(BaseDupeFilter):
    """Fully custom duplicate filter that records request fingerprints in a Redis set."""

    def __init__(self):
        # One Redis connection, created up front and reused by request_seen().
        self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def request_seen(self, request):
        """
        Check whether the given request has already been visited.

        :param request: the request whose fingerprint is looked up
        :return: True if it was already visited; False if it was not
        """
        fingerprint = request_fingerprint(request)
        # The fingerprint is added unconditionally; a result of 1 is treated
        # as "newly added", i.e. the request has not been seen before.
        newly_added = self.conn.sadd('visited_urls', fingerprint)
        return newly_added != 1
settings.py
# Redis connection and dedup settings that select scrapy-redis' built-in RFPDupeFilter.
# NOTE(review): host and password are hard-coded here; presumably tutorial values — confirm before reuse.
REDIS_HOST = '140.143.227.206' # host name
REDIS_PORT = 8888 # port
REDIS_PARAMS = {'password':'beta'} # Redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
REDIS_ENCODING = "utf-8" # Redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' # key template with a %(timestamp)s placeholder
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' # dotted path of the dedup filter class
继承scrapy-redis 实现自定制
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults
class RedisDupeFilter(RFPDupeFilter):
    """RFPDupeFilter customisation that stores fingerprints under a fixed Redis key."""

    @classmethod
    def from_settings(cls, settings):
        """
        Build the filter from the project settings.

        :param settings: settings object; must provide ``getbool`` and the
            Redis connection settings read by ``get_redis_from_settings``
        :return: an instance of this class bound to the Redis connection
        """
        # `server` was referenced but never defined, which raised NameError;
        # create the Redis client from the settings before using it.
        server = get_redis_from_settings(settings)
        # Substitute a fixed label for the %(timestamp)s placeholder so the
        # key is the same on every run.
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
settings.py
# Redis connection and dedup settings that select the custom RedisDupeFilter subclass.
# NOTE(review): host and password are hard-coded here; presumably tutorial values — confirm before reuse.
REDIS_HOST = '140.143.227.206' # host name
REDIS_PORT = 8888 # port
REDIS_PARAMS = {'password':'beta'} # Redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
REDIS_ENCODING = "utf-8" # Redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' # key template with a %(timestamp)s placeholder
DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter' # dotted path of the custom filter; 'dbd.xxx' is presumably a placeholder module path — adjust to the real project
案例
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import scrapy_redis
from scrapy_redis.spiders import RedisSpider
class ChoutiSpider(RedisSpider):
    """
    Example spider for chouti.com whose start URLs come from Redis.

    The class defines no ``start_urls``; its seeds are pushed into the Redis
    key ``chouti:start_urls``. It therefore derives from ``RedisSpider``
    (previously ``scrapy.Spider``, which never reads that key and so had
    nothing to crawl).
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']

    def parse(self, response):
        """
        Handle a downloaded page.

        :param response: the response object for the fetched URL
        """
        # Demo only: print the response object itself.
        print(response)
使用scrapy-redis的配置
# ############### scrapy-redis connection ####################
# NOTE(review): host and password are hard-coded; presumably tutorial values — confirm before reuse.
REDIS_HOST = '140.143.227.206'  # host name
REDIS_PORT = 8888  # port
# Redis connection parameters; default:
# REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_PARAMS = {'password': 'beta'}
REDIS_ENCODING = "utf-8"  # Redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above)

# ################ deduplication ######################
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# ###################### scheduler ######################
# Reference only (SCHEDULER below is given as a dotted string path, so no
# import is required in the settings module):
# from scrapy_redis.scheduler import Scheduler
# The scrapy_redis scheduler is responsible for dispatching requests:
#   enqueue_request: add a task to the scheduler
#   next_request: fetch one task from the scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Order in which tasks are stored.
# Priority
DEPTH_PRIORITY = 1  # breadth-first
# DEPTH_PRIORITY = -1  # depth-first
# Priority queue is the default; alternatives: PriorityQueue (sorted set),
# FifoQueue (list), LifoQueue (list).
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# Depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Illustration of how pending requests are laid out in Redis:
#
# redis = {
#     chouti:requests:[
#         pickle.dumps(Request(url='Http://wwwww',callback=self.parse)),
#         pickle.dumps(Request(url='Http://wwwww',callback=self.parse)),
#         pickle.dumps(Request(url='Http://wwwww',callback=self.parse)),
#     ],
#     cnblogs:requests:[
#     ]
# }

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler stores requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to Redis; pickle by default
SCHEDULER_PERSIST = False  # keep the scheduler queue and dedup records on close? True = keep, False = clear
SCHEDULER_FLUSH_ON_START = True  # clear the scheduler queue and dedup records before starting? True = clear, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10  # maximum time to wait when the scheduler queue is empty and nothing is fetched
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key under which the dedup records are stored
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class implementing the dedup rule

# ###################### start URLs ######################
# NOTE(review): scrapy-redis appears to read this value from the setting named
# REDIS_START_URLS_KEY (START_URLS_KEY is the name of the library default) —
# confirm against the installed scrapy-redis version.
START_URLS_KEY = '%(name)s:start_urls'
REDIS_START_URLS_AS_SET = False  # False: start URLs are kept in a Redis list rather than a set
往redis中放起始url
# by luffycity.com
import redis
# Open a connection to the Redis server.
conn = redis.Redis(
    host='140.143.227.206',
    port=8888,
    password='beta',
)
# Push the first page URL onto the 'chouti:start_urls' list.
conn.lpush(
    'chouti:start_urls',
    'https://dig.chouti.com/r/pic/hot/1',
)

浙公网安备 33010602011771号