Using Proxies in Scrapy
1. Using the scrapy_proxies random IP proxy plugin:
Installation:
pip install scrapy_proxies
Configure settings.py:
# Retry many times since proxies often fail
RETRY_TIMES = 10
# Retry on most error codes since proxies fail for different reasons
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

# The proxy list file looks like:
# http://host1:port
# http://username:password@host2:port
# http://host3:port
# Path to the file holding the proxy IP list
PROXY_LIST = '/path/to/proxy/list.txt'

# Proxy mode
# 0 = Every request gets a different, randomly chosen proxy
# 1 = Take one proxy from the list and assign it to every request
# 2 = Use the custom proxy defined in the settings
PROXY_MODE = 0

# If using mode 2, uncomment the line below:
#CUSTOM_PROXY = "http://host1:port"
Usage:
- Store the proxy IP list you previously crawled with Python at the location that PROXY_LIST points to;
- Of the PROXY_MODE options, 0 is probably the most commonly used; if one of your IPs is particularly stable, use 2 instead (you can confirm which proxy each request went through with the snippet after this list).
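A quick way to confirm the rotation is working is to log the proxy each response actually came through. A minimal sketch — the callback name and log wording are just illustrative — relying on the fact that the chosen proxy travels in the request meta:

    def parse(self, response):
        # The proxy selected by the middleware is carried on the request's meta
        self.logger.info('Fetched %s via proxy %s',
                         response.url, response.meta.get('proxy'))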
Either way, the crucial part is maintaining a highly available pool of proxy IPs.
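In practice, maintaining the pool means periodically testing each proxy and dropping the dead ones. A minimal sketch of such a check, assuming the requests library and http://httpbin.org/ip as the test URL (both are illustrative choices, not part of the original setup):

    import requests

    def is_alive(proxy, timeout=5):
        """Return True if the proxy can fetch a test page within the timeout."""
        try:
            resp = requests.get('http://httpbin.org/ip',
                                proxies={'http': proxy, 'https': proxy},
                                timeout=timeout)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    # Keep only the proxies that still respond before writing them to PROXY_LIST
    proxy_list = ["http://host1:port", "http://host2:port"]
    alive = [p for p in proxy_list if is_alive(p)]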
2. Round-robin rotation:
Using a proxy in the downloader middleware
# -*- coding: utf-8 -*-
from scrapy import signals
from Application import redis_conn  # shared Redis connection defined elsewhere in the project


class ProxyMiddleware(object):
    # Downloader middleware that assigns a proxy to every outgoing request

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        '''
        Rotate the proxy IP for this request
        :param request:
        :param spider:
        :return:
        '''
        # BRPOPLPUSH pops a proxy from the tail of the 'proxies' list and pushes it
        # back onto the head of the same list, cycling the proxies in round-robin order
        proxy_ip = redis_conn.brpoplpush(src='proxies', dst='proxies', timeout=3)
        request.meta['proxy'] = proxy_ip
        return None

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
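For the middleware to take effect it still has to be registered in settings.py. A minimal sketch, where the module path and priority are placeholders that depend on your project layout:

    DOWNLOADER_MIDDLEWARES = {
        # Module path is a placeholder; point it at wherever ProxyMiddleware lives
        'myproject.middlewares.ProxyMiddleware': 543,
    }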
Adding IPs to the Redis pool
import redis

r = redis.Redis(connection_pool=redis.ConnectionPool(
    host="localhost",
    port=6379,
    # password="",
    decode_responses=True,  # return str instead of bytes
    db=1
))

proxy_list = ["ip1", "ip2"]
count = 0
for proxy in proxy_list:
    # The set 'proxies_set' is only used for de-duplication;
    # the list 'proxies' is what the middleware rotates through
    if not r.sismember('proxies_set', proxy):
        r.sadd('proxies_set', proxy)
        r.lpush('proxies', proxy)
        count += 1
print('Proxies added to the queue, total added: ' + str(count))
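The reverse operation, removing a proxy that keeps failing, has to touch both structures: the list the middleware rotates through and the set used for de-duplication. A rough sketch reusing the connection r from above (assuming the redis-py 3+ argument order for lrem):

    def remove_proxy(r, proxy):
        """Drop a dead proxy from both the rotation list and the dedup set."""
        r.lrem('proxies', 0, proxy)   # count=0 removes every occurrence from the list
        r.srem('proxies_set', proxy)  # allows the proxy to be re-added later if it recovers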