scrapy-redis ip代理池

中间件

import random
import redis
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectError, TimeoutError


class RandomProxyMiddleWare(object):
    """Scrapy downloader middleware that assigns random proxies from Redis.

    Proxy URLs are stored in a Redis list whose name comes from the
    ``PROXY_REDIS_KEY`` setting.  Failure counters are kept per proxy in a
    Redis hash named ``<PROXY_REDIS_KEY>_stats``.  A proxy is removed from
    the pool when it collects ``max_failed`` 401/403 responses or when a
    connection/timeout error occurs while using it.
    """

    def __init__(self, settings):
        """Read configuration and create the Redis client.

        Args:
            settings: object with a dict-like ``get(name, default)`` method
                (the crawler settings passed by ``from_crawler``).

        Raises:
            NotConfigured: if ``PROXY_REDIS_KEY`` is not set.
        """
        # 2. Initialise configuration and related state.
        self.proxy_key = settings.get('PROXY_REDIS_KEY')
        if self.proxy_key is None:
            # Without a key there is no pool to read; disable the middleware
            # instead of failing on the string concatenation below.
            raise NotConfigured('PROXY_REDIS_KEY is not set')
        # Host defaults to the previously hard-coded local instance.
        self.r = redis.Redis(host=settings.get('PROXY_REDIS_HOST', '127.0.0.1'))
        self.proxy_stats_key = self.proxy_key + '_stats'
        # Number of 401/403 responses after which a proxy is dropped.
        self.max_failed = int(settings.get('PROXY_MAX_FAILED', 3))

    @property
    def proxies(self):
        """Return the current proxy list from Redis as ``str`` values.

        Every access performs an ``LRANGE`` round-trip; the values are
        decoded from bytes as UTF-8.
        """
        return [i.decode('utf-8') for i in self.r.lrange(self.proxy_key, 0, -1)]

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware, or raise NotConfigured when proxies are off."""
        # 1. Create the middleware object.
        # Proxy support is enabled by default.
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        """Attach a random proxy to requests that do not already have one.

        Requests whose URL is listed in ``spider.start_urls`` are sent
        without a proxy.  Always returns ``None`` so processing continues.
        """
        # 3. Assign a random proxy to each request.
        if request.meta.get('proxy') or request.url in spider.start_urls:
            return None
        # Read the pool once: every access to ``self.proxies`` hits Redis.
        candidates = self.proxies
        if candidates:
            # ``random.choice`` yields one URL string; ``random.choices``
            # would yield a one-element list.
            request.meta['proxy'] = random.choice(candidates)
        return None

    def process_response(self, request, response, spider):
        """Count 401/403 responses per proxy and drop proxies that are banned.

        Returns:
            ``response`` unchanged in the normal case, or ``request`` (with
            the proxy removed) to have it re-scheduled once the proxy has
            reached ``max_failed`` failures.
        """
        # 4.0 The request completed.
        cur_proxy = request.meta.get('proxy')
        # Only proxied requests that look banned are of interest.
        if not cur_proxy or response.status not in (401, 403):
            return response
        # HINCRBY returns the updated counter, so no extra HGET is needed.
        failed_times = self.r.hincrby(self.proxy_stats_key, cur_proxy, 1)
        if int(failed_times) < self.max_failed:
            return response
        print('got wrong http code (%s) when use %s' % (response.status, cur_proxy))
        # The proxy is considered banned: remove it from the pool.
        self.remove_proxy(cur_proxy)
        request.meta.pop('proxy', None)
        # The URL was already seen by the scheduler; bypass the dupefilter
        # so the retry is actually downloaded.
        request.dont_filter = True
        # Returning the request sends it back to the scheduler.
        return request

    def process_exception(self, request, exception, spider):
        """Drop the proxy and retry when a proxied request fails to connect.

        Returns:
            ``request`` (without proxy) for ConnectError/TimeoutError raised
            while a proxy was in use, otherwise ``None``.
        """
        # 4.1 The request failed.
        cur_proxy = request.meta.get('proxy')
        # A network error while using a proxy marks that proxy as bad.
        if cur_proxy and isinstance(exception, (ConnectError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            request.meta.pop('proxy', None)
            # Bypass the dupefilter so the retry is not discarded.
            request.dont_filter = True
            return request
        return None

    def remove_proxy(self, proxy):
        """Remove ``proxy`` from the pool list and delete its failure counter."""
        # redis-py 3.x signature is lrem(name, count, value); count=0 removes
        # every occurrence.  Both commands are no-ops for unknown proxies.
        self.r.lrem(self.proxy_key, 0, proxy)
        self.r.hdel(self.proxy_stats_key, proxy)

settings.py 配置文件

# Name of the Redis list that holds the proxy URLs; the middleware reads it
# via settings.get('PROXY_REDIS_KEY'). Fill in the actual key name here.
PROXY_REDIS_KEY= ""

把ip放进去,单独定义一个py文件

import redis


# Proxy URLs to seed into the pool.
PROXY = [
    'http://192.169.1.1:8000',
]

r = redis.Redis(host='127.0.0.1')

# LPUSH needs at least one value; skip the call when the list is empty so the
# script does not fail with a Redis "wrong number of arguments" error.
# Replace the key below with the value configured as PROXY_REDIS_KEY.
if PROXY:
    r.lpush('PROXY_REDIS_KEY的名称', *PROXY)

 

 

posted @ 2019-11-16 00:21  市丸银  阅读(512)  评论(0)  编辑  收藏  举报