Scrapy IP Proxy Pool

  If a crawler scrapes too fast, the site's anti-crawling measures can easily ban its IP. Most networks, company networks included, assign IPs dynamically; the usual countermeasures are to switch to a different network or to set up an IP proxy. Beyond that, you should also keep the request load you place on the target site in mind.

The main effective approaches:
  1. Set up an IP proxy pool.

  2. ADSL scheduled redialing (hang up and redial periodically so the ISP assigns a new IP); a rough sketch follows this list.
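
A minimal sketch of the redial approach, assuming a Linux host dialing through the pppoe/adsl scripts; the adsl-stop / adsl-start commands and the interval are assumptions and need to match your own dial-up setup:

import time
import subprocess

def redial(interval=600):
    '''Hang up and redial periodically so the ISP assigns a fresh IP.
    adsl-stop / adsl-start are assumptions for a pppoe-style setup.'''
    while True:
        subprocess.call('adsl-stop', shell=True)    # drop the current connection
        time.sleep(5)                               # give the line a moment to release
        subprocess.call('adsl-start', shell=True)   # redial; usually yields a new IP
        time.sleep(interval)                        # keep this IP for a while before redialing

if __name__ == '__main__':
    redial()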
Setting an IP proxy for the spider:

middlewares.py

from fake_useragent import UserAgent
class ArticlespiderMiddleware(object):
    def __init__(self,crawler):
        super(ArticlespiderMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE','random')

    @classmethod
    def from_crawler(cls,crawler):
        '''Called when Scrapy creates the middleware from the crawler'''
        return cls(crawler)

    def process_request(self,request,spider):

        def get_ua():
            return getattr(self.ua,self.ua_type)

        request.headers.setdefault('User-Agent',get_ua())
        request.meta["proxy"] = "http://ip:Port"  # placeholder; fill in a real proxy ip:port
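
For either of these settings to take effect, the middleware has to be enabled in settings.py. A minimal sketch, assuming the project module is ArticleSpider.middlewares (the priority numbers are likewise assumptions):

# settings.py
RANDOM_UA_TYPE = 'random'   # any fake_useragent.UserAgent attribute, e.g. 'chrome', 'firefox'

DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.ArticlespiderMiddleware': 543,
    # disable the built-in UserAgentMiddleware so the random UA is not overwritten
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}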

The middleware above only wires in a single hard-coded proxy, but one proxy is not enough, so we can crawl a free proxy site (xicidaili) to build our own IP proxy pool:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'Fade Zhao'

import requests
from scrapy.selector import Selector
from fake_useragent import UserAgent
import MySQLdb



class GetHost(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan', db='fiction', charset='utf8')
        self.cursor = self.conn.cursor()
        self.headers = {}
    def insert_IP(self):
        for i in range(1, 11):
            url = 'http://www.xicidaili.com/nn/{0}'.format(i)
            agent = UserAgent().random
            self.headers = {
                'User-Agent': agent
            }
            session = requests.session()
            response = session.get(url=url, headers=self.headers)
            selector = Selector(text=response.text)

            all_trs = selector.css('#ip_list tr')
            ip_list = []
            for tr in all_trs[1:]:
                speed_str = tr.css(".bar::attr(title)").extract()[0]
                speed = 1
                if speed_str:
                    # the title attribute looks like "0.219秒"; strip the unit before converting
                    speed = float(speed_str.split('秒')[0])
                    if speed > 0.3:
                        continue
                td_text = tr.css("td::text").extract()
                ip = td_text[0]
                port = td_text[1]
                proxy_type = td_text[5]
                ip_list.append((ip, port, proxy_type, speed))
            print(ip_list)
            sql_str = '''INSERT INTO proxy_ip(ip,port,proxy_type,speed) 
                         VALUES(%s,%s,%s,%s) 
                         on duplicate 
                         key update port=values(port)'''
            try:
                self.cursor.executemany(sql_str, ip_list)
                self.conn.commit()
                print(i, 'page inserted')
            except Exception as e:
                print('insert error', e)
                self.conn.rollback()
    def judge_ip(self,ip,port):
        '''Check whether a proxy IP is still usable'''
        http_url = 'https://www.baidu.com'
        proxy_url = 'http://{0}:{1}'.format(ip,port)
        try:
            proxy_dict = {
                "http":proxy_url,
                "https":proxy_url,  # the test URL is https, so map https as well
            }
            response = requests.get(http_url,proxies=proxy_dict,headers=self.headers,timeout=5)  # timeout so dead proxies fail fast
        except Exception as e:
            print('invalid ip and port =',e)
            self.delete_ip(ip)
            return False
        else:
            code= response.status_code
            if code >= 200 and code < 300:
                print('proxy works')
                return True
            else:
                return False

    def delete_ip(self,ip):
        '''Delete a dead proxy from the table'''
        sql_str ='''delete from proxy_ip WHERE ip="{0}"'''.format(ip)
        self.cursor.execute(sql_str)
        self.conn.commit()

    def get_IP(self):
        sql_str = '''select ip,port from proxy_ip ORDER BY RAND() limit 1'''

        self.cursor.execute(sql_str)
        host = self.cursor.fetchone()
        ip = host[0]
        port = host[1]
        print(host)
        judge = self.judge_ip(ip,port)
        if judge:
            return "http://{0}:{0}".format(ip,port)
        else:
            return self.get_IP()  # try again until a working proxy is found
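
The script assumes a proxy_ip table already exists and that ip is a primary or unique key; otherwise the ON DUPLICATE KEY UPDATE clause has nothing to key on. A one-off helper along these lines could create it (the column types are assumptions):

# create_proxy_table.py - one-off helper; column types and the key on `ip` are assumptions
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan', db='fiction', charset='utf8')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS proxy_ip (
        ip          VARCHAR(20)  NOT NULL,
        port        VARCHAR(10)  NOT NULL,
        proxy_type  VARCHAR(10)  NOT NULL,
        speed       FLOAT,
        PRIMARY KEY (ip)   -- lets ON DUPLICATE KEY UPDATE refresh existing rows
    )
''')
conn.commit()
conn.close()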

middlewares.py

from ArticleSpider.utls.crawl_xiciIP import GetHost
class RandomProxyMiddleware(object):
    '''Pick a random proxy from the pool for each request'''
    def process_request(self,request,spider):
        host = GetHost()
        request.meta['proxy'] = host.get_IP()
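
Like the user-agent middleware, this one only runs once it is registered in settings.py; the entry below is a sketch with an assumed module path and priorities:

DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.ArticlespiderMiddleware': 543,
    'ArticleSpider.middlewares.RandomProxyMiddleware': 544,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

One caveat: as written, process_request creates a new GetHost (and therefore a new MySQL connection) on every request; moving that construction into __init__ or from_crawler so a single instance is reused for the whole crawl is the usual refinement.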

That covers building and wiring up an IP proxy pool. That said, GitHub already has a more fully featured Scrapy proxy plugin; as the saying goes, one generation plants the trees and the next enjoys the shade:

https://github.com/aivarsk/scrapy-proxies
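
For reference, the plugin is configured entirely through settings.py. The sketch below follows the project's README at the time of writing; verify the middleware path and options against the repository before relying on them:

# settings.py (based on the scrapy-proxies README; verify against the repo)
RETRY_TIMES = 10                                    # proxies fail often, so retry generously
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

PROXY_LIST = '/path/to/proxy/list.txt'              # one proxy per line
PROXY_MODE = 0                                      # 0 = pick a random proxy for each request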

 
