
05| scrapy redis

scrapy-redis
  A component for building distributed crawlers.
  - Deduplication rules based on scrapy-redis
  - backed by a Redis set (see the sketch below)
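
The core idea: a Redis set only keeps unique members, so SADD doubles as a "have I seen this?" check that every crawler process shares. A minimal sketch of just that idea, assuming a local Redis instance (adjust host/port/password to your server):

import redis

conn = redis.Redis(host='localhost', port=6379)

# SADD returns 1 if the member was newly added, 0 if it was already in the set.
print(conn.sadd('visited_urls', 'some-fingerprint'))  # 1 -> not seen before
print(conn.sadd('visited_urls', 'some-fingerprint'))  # 0 -> duplicate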

 

- Fully custom (write your own dupefilter against Redis directly)

from scrapy.dupefilters import BaseDupeFilter  # scrapy.dupefilter in very old Scrapy versions
import redis
from scrapy.utils.request import request_fingerprint


class DupFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def request_seen(self, request):
        """
        Check whether the current request has been visited before.
        :param request:
        :return: True if already visited; False if not yet visited
        """
        fid = request_fingerprint(request)
        # SADD returns 1 if the fingerprint is new, 0 if it already exists.
        result = self.conn.sadd('visited_urls', fid)
        if result == 1:
            return False
        return True
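
For context, request_fingerprint hashes the canonicalized request (method, URL, body), so superficially different URLs can collapse to one fingerprint. A quick illustration, assuming a stock Scrapy install:

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request(url='https://dig.chouti.com/')
r2 = Request(url='https://dig.chouti.com/?')  # the empty query string is canonicalized away

# Identical SHA1 fingerprints, so the dupefilter above treats these as one visit.
print(request_fingerprint(r1) == request_fingerprint(r2))  # True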

  

settings.py

REDIS_HOST = '140.143.227.206'                      # Redis host
REDIS_PORT = 8888                                   # Redis port
REDIS_PARAMS = {'password': 'beta'}                 # connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_ENCODING = "utf-8"                            # encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # for the fully custom filter, point this at your own class instead, e.g. 'dbd.xxx.DupFilter'
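
Note that the %(timestamp)s placeholder in DUPEFILTER_KEY is filled in when the filter is created, so each run writes its fingerprints under a fresh key. If you want fingerprints to survive across runs, pin the key by subclassing, as shown next.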

Customizing by subclassing scrapy-redis

from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class RedisDupeFilter(RFPDupeFilter):
    @classmethod
    def from_settings(cls, settings):
        # Build the Redis connection from the REDIS_* settings.
        server = get_redis_from_settings(settings)
        # Replace the timestamp placeholder with a fixed value so the key never changes.
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
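
With this override the dedup key is fixed instead of timestamped, so fingerprints accumulate across runs. What the override computes, assuming the stock scrapy_redis defaults:

from scrapy_redis import defaults

# defaults.DUPEFILTER_KEY is 'dupefilter:%(timestamp)s'
print(defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'})  # dupefilter:xiaodongbei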

 settings.py 

REDIS_HOST = '140.143.227.206'                      # Redis host
REDIS_PORT = 8888                                   # Redis port
REDIS_PARAMS = {'password': 'beta'}                 # connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_ENCODING = "utf-8"                            # encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'        # dotted path to the subclass above

Example

# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider


class ChoutiSpider(RedisSpider):
    # RedisSpider takes its start URLs from the Redis key '%(name)s:start_urls',
    # here 'chouti:start_urls', instead of a local start_urls list.
    name = 'chouti'
    allowed_domains = ['chouti.com']

    def parse(self, response):
        print(response)
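
Started with scrapy crawl chouti, a RedisSpider has no local start_urls; it waits on the 'chouti:start_urls' key in Redis until a URL is pushed there (see the last snippet below). That is what makes the spider distributable: every worker runs the same code and pulls work from the shared queue.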

Configuration for using scrapy-redis

# ############### scrapy-redis connection ####################

REDIS_HOST = '140.143.227.206'                      # Redis host
REDIS_PORT = 8888                                   # Redis port
REDIS_PARAMS = {'password': 'beta'}                 # connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_ENCODING = "utf-8"                            # encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)

# ################ deduplication ######################
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# ###################### scheduler ######################
# Scheduling is delegated to the scrapy_redis scheduler (scrapy_redis.scheduler.Scheduler):
#   enqueue_request: push a request into the scheduler
#   next_request: pop the next request from the scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Order in which queued requests are handed out
# priority
DEPTH_PRIORITY = 1   # breadth-first
# DEPTH_PRIORITY = -1  # depth-first
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # default. Options: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)

# breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

"""
Conceptually, the data stored in Redis looks like this:
redis = {
    chouti:requests: [
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
        pickle.dumps(Request(url='Http://wwwww', callback=self.parse)),
    ],
    cnblogs:requests: [

    ]
}
"""
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # Redis key under which the scheduler keeps pending requests

SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data written to Redis; pickle by default

SCHEDULER_PERSIST = False  # keep the queue and dedup records when the spider closes? True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True  # flush the queue and dedup records on start? True = flush, False = keep
# SCHEDULER_IDLE_BEFORE_CLOSE = 10  # max seconds to block when the queue is empty before giving up


SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key under which dedup fingerprints are stored
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class implementing the dedup rule


START_URLS_KEY = '%(name)s:start_urls'
REDIS_START_URLS_AS_SET = False
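
To see what these settings produce, you can peek at the keys scrapy-redis creates while a crawl is running. A rough sketch, reusing the connection details above:

import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

# PriorityQueue keeps pending requests in a sorted set under '%(spider)s:requests'.
print(conn.zcard('chouti:requests'))

# RFPDupeFilter keeps fingerprints in a set under '%(spider)s:dupefilter'.
print(conn.scard('chouti:dupefilter'))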

Pushing the start URL into Redis

# by luffycity.com
import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

# RedisSpider will pop this URL from 'chouti:start_urls' and begin crawling.
conn.lpush('chouti:start_urls', 'https://dig.chouti.com/r/pic/hot/1')
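
A quick sanity check before starting the spider, assuming the same connection: with REDIS_START_URLS_AS_SET = False the key is a plain list, which is why lpush is used above.

import redis

conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

print(conn.type('chouti:start_urls'))           # b'list'
print(conn.lrange('chouti:start_urls', 0, -1))  # the queued start URLs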
