Scrapy IP Proxy Pool
If a spider crawls too fast, the site's anti-crawling measures will quickly ban its IP. On most corporate networks the public IP is dynamically assigned, so one remedy is simply to force a new address; the more general remedy is to route requests through IP proxies. In either case, also keep in mind the access pressure you put on the target site.
The two main workable approaches:
1. Set up an IP proxy pool.
2. Scheduled ADSL redialing (a minimal sketch follows this list).
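Here is a minimal sketch of option 2, assuming a Linux host with the Roaring Penguin PPPoE scripts installed; the adsl-stop/adsl-start command names and the redial interval are assumptions and vary by distribution:

import time
import subprocess

def redial_forever(interval=600):
    '''Periodically drop and re-establish the PPPoE session to get a new public IP.'''
    while True:
        subprocess.call('adsl-stop', shell=True)   # assumed command: tear down the session
        subprocess.call('adsl-start', shell=True)  # assumed command: redial for a fresh IP
        time.sleep(interval)                       # wait before the next redial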
Setting an IP proxy for the spider:
middlewares.py
from fake_useragent import UserAgent

class ArticlespiderMiddleware(object):
    def __init__(self, crawler):
        super(ArticlespiderMiddleware, self).__init__()
        self.ua = UserAgent()
        # Which fake_useragent attribute to use ('random', 'chrome', ...)
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        '''Called by Scrapy when the middleware is initialized'''
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
        request.meta["proxy"] = "http://ip:Port"  # placeholder: a single hard-coded proxy
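For Scrapy to actually run this middleware, it has to be registered in settings.py. A minimal example follows; the priority value 543 is just the Scrapy project-template default, and RANDOM_UA_TYPE is the setting the middleware reads:

settings.py

DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.ArticlespiderMiddleware': 543,
}
RANDOM_UA_TYPE = 'random'  # which fake_useragent attribute the middleware uses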
The middleware above hard-codes a single proxy, but one proxy is not enough. We can build our own proxy pool from a free proxy listing site instead:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Fade Zhao'
import requests
from scrapy.selector import Selector
from fake_useragent import UserAgent
import MySQLdb

class GetHost(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan',
                                    db='fiction', charset='utf8')
        self.cursor = self.conn.cursor()
        self.headers = {}

    def insert_IP(self):
        '''Crawl the free proxy listings and store usable entries in MySQL'''
        for i in range(1, 11):
            url = 'http://www.xicidaili.com/nn/{0}'.format(i)
            agent = UserAgent().random
            self.headers = {'User-Agent': agent}
            session = requests.session()
            response = session.get(url=url, headers=self.headers)
            selector = Selector(text=response.text)
            all_trs = selector.css('#ip_list tr')
            ip_list = []
            for tr in all_trs[1:]:
                speed_str = tr.css(".bar::attr(title)").extract()[0]
                speed = 1
                if speed_str:
                    speed = float(speed_str.split('秒')[0])  # title looks like "0.123秒" (seconds)
                if speed > 0.3:
                    continue  # keep only proxies that respond in under 0.3s
                td_text = tr.css("td::text").extract()
                ip = td_text[0]
                port = td_text[1]
                proxy_type = td_text[5]
                ip_list.append((ip, port, proxy_type, speed))
            print(ip_list)
            sql_str = '''INSERT INTO proxy_ip(ip,port,proxy_type,speed)
                         VALUES(%s,%s,%s,%s)
                         ON DUPLICATE KEY UPDATE port=VALUES(port)'''
            try:
                self.cursor.executemany(sql_str, ip_list)
                self.conn.commit()
                print(i, 'page inserted')
            except Exception as e:
                print('insert error', e)
                self.conn.rollback()

    def judge_ip(self, ip, port):
        '''Check whether a proxy is still usable'''
        http_url = 'https://www.baidu.com'
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        try:
            # cover both schemes so the proxy is actually used for the HTTPS test URL
            proxy_dict = {
                'http': proxy_url,
                'https': proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict, headers=self.headers)
        except Exception as e:
            print('invalid ip and port =', e)
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print('proxy OK')
                return True
            else:
                return False

    def delete_ip(self, ip):
        '''Remove a dead proxy from the pool'''
        sql_str = '''DELETE FROM proxy_ip WHERE ip="{0}"'''.format(ip)
        self.cursor.execute(sql_str)
        self.conn.commit()

    def get_IP(self):
        '''Pick a random proxy and validate it before returning it'''
        sql_str = '''SELECT ip,port FROM proxy_ip ORDER BY RAND() LIMIT 1'''
        self.cursor.execute(sql_str)
        host = self.cursor.fetchone()
        ip = host[0]
        port = host[1]
        print(host)
        judge = self.judge_ip(ip, port)
        if judge:
            return 'http://{0}:{1}'.format(ip, port)
        else:
            return self.get_IP()  # try another random proxy until a live one is found
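Note that GetHost assumes a proxy_ip table already exists in the fiction database. The original post does not show the schema, so the following one-off setup is an assumption inferred from the INSERT statement (ON DUPLICATE KEY UPDATE also implies a unique key on ip):

import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan',
                       db='fiction', charset='utf8')
cursor = conn.cursor()
# Assumed schema: columns match the INSERT in insert_IP(); making ip the
# primary key is what lets ON DUPLICATE KEY UPDATE deduplicate rows.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS proxy_ip (
        ip VARCHAR(20) NOT NULL PRIMARY KEY,
        port VARCHAR(10) NOT NULL,
        proxy_type VARCHAR(10),
        speed FLOAT
    )
''')
conn.commit()

With the table in place, calling GetHost().insert_IP() populates the pool.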
middlewares.py
from ArticleSpider.utls.crawl_xiciIP import GetHost

class RandomProxyMiddleware(object):
    '''Assign a random proxy from the pool to each outgoing request'''
    def process_request(self, request, spider):
        host = GetHost()
        request.meta['proxy'] = host.get_IP()
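This middleware also needs to be registered in settings.py; the priority values below are assumptions. Note, too, that building a GetHost per request opens a new MySQL connection each time; caching a single instance (for example, in a from_crawler classmethod) would be cheaper.

settings.py

DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.ArticlespiderMiddleware': 543,
    'ArticleSpider.middlewares.RandomProxyMiddleware': 544,
}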
That covers setting up and wiring in an IP proxy pool. That said, more fully featured proxy-pool plugins already exist on GitHub; as the saying goes, one generation plants the trees and the next enjoys the shade:
https://github.com/aivarsk/scrapy-proxies
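Its configuration lives entirely in settings.py, roughly as follows per the project's README at the time of writing (check the repository for the current options):

# Retry aggressively, since free proxies fail often
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

PROXY_LIST = '/path/to/proxy/list.txt'  # one proxy per line
PROXY_MODE = 0  # 0 = random proxy per request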