干货分享!用户级爬虫,怎敢封IP
一般来说,我们在爬取其他网站的数据的时候,会遇到ip被限制的情况,这时候就需要代理ip池进行处理了
1、获取ip代理的方法
def getProxyIp():
    """Build a pool of working HTTPS proxies and return it.

    Downloads the fate0/proxylist dump (one JSON object per line), keeps
    entries whose ``type`` is ``'https'`` as ``"host:port"`` strings, then
    probes each candidate with a short timed request and keeps only the
    ones that respond.

    Returns:
        list[str]: usable proxies as ``"host:port"`` strings (possibly empty).
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    proxy = []
    # Probe target used to check that a candidate proxy actually forwards
    # traffic.  BUG FIX: the original referenced an undefined ``url2`` here,
    # which raised NameError on the first validation attempt.
    url2 = 'https://www.baidu.com'
    try:
        # The dump is newline-separated JSON objects; stitch them into one
        # JSON array so a single json.loads() call parses everything.
        url1 = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'
        resq = requests.get(url1, headers=header)
        # BUG FIX: ``resq`` is a Response object with no ``.replace`` —
        # the payload text must be used.
        res = '[' + resq.text.replace('}\n{', '},{') + ']'
        _s = json.loads(res)
        ips = []
        for i in _s:
            if i['type'] == 'https':
                ip = str(i['host']) + ':' + str(i['port'])
                ips.append(ip)
        # Keep only the candidates that answer within the timeout.
        for x in ips:
            try:
                # BUG FIX: the candidates were filtered as https proxies but
                # validated through an 'http' mapping against an https URL,
                # so the proxy was never actually exercised.
                requests.get(url2, proxies={'https': 'https://' + x}, timeout=2)
                proxy.append(x)
            except Exception as e:
                print(str(e))
    except Exception as e:
        print(str(e))
    return proxy
2、在爬虫中使用代理时,先检查代理ip的可用性:若可用,则用该ip代理爬取网页;若不可用,则将其从代理池中剔除,并调用获取代理池的方法加入新的代理ip。
# Use the previously built proxy pool: keep proxies that work, evict the ones
# that fail, and top the pool up with freshly validated proxies.
def getHrefInfo(url, a):
    """Fetch ``url`` through a proxy from the pool, rotating out dead proxies.

    Args:
        url: Page to request.
        a:   Index into the freshly built proxy pool for the first attempt.

    Retries with a new randomly chosen proxy until one yields HTTP 200.
    Each failed proxy is removed from the pool and one newly validated
    proxy (not already pooled) is appended in its place.
    """
    url_h = url
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    proxy_list = getProxyIp()
    proxy_ip = proxy_list[a]
    # Keep trying proxies until one returns HTTP 200.
    while True:
        try:
            resq = requests.get(url_h, headers=headers,
                                proxies={'https': 'https://' + proxy_ip})
            if resq.status_code == 200:
                break
            # Non-200 response: treat this proxy as dead.
            reason = 'status ' + str(resq.status_code)
        except Exception as e:
            reason = str(e)
        # BUG FIX: the original duplicated this evict-and-refill code in both
        # the else and except branches, and its final print referenced ``e``
        # outside the except clause (NameError on the non-200 path) while
        # reporting the *new* proxy as the one removed.
        if proxy_ip in proxy_list:  # guard: remove() raises if already evicted
            proxy_list.remove(proxy_ip)
        for ip in getProxyIp():
            if ip not in proxy_list:
                proxy_list.append(ip)
                break
        dead_ip = proxy_ip
        proxy_ip = random.choice(proxy_list)
        print(dead_ip + "已从代理池中移除" + '已添加新ip进入代理池' + reason)
        time.sleep(3)
3、最后进行检测模块
import requests
import time
import traceback
from requests.exceptions import ProxyError, ConnectionError
from db.mongo_db import MongoDB
from multiprocessing.pool import ThreadPool


def valid_many(proxy_list, method):
    """Validate every proxy in *proxy_list* concurrently on 16 worker threads."""
    worker_pool = ThreadPool(16)
    for candidate in proxy_list:
        worker_pool.apply_async(valid_one, args=(candidate, method))
    worker_pool.close()
    worker_pool.join()


def valid_one(proxy, method, url='https://www.baidu.com'):
    """Probe a single proxy against *url* and persist the outcome.

    ``method == 'insert'``: store a working proxy (with measured delay).
    ``method == 'check'``:  refresh the delay of a working proxy, or delete
    it from storage when it is dead or unreachable.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    endpoint = 'http://' + proxy['proxy']
    proxies = {'http': endpoint, 'https': endpoint}
    try:
        started = time.time()
        resp = requests.get(url, headers=headers, proxies=proxies,
                            timeout=5, verify=False)
        # Round-trip time in seconds, two decimal places.
        delay = round(time.time() - started, 2)
        if resp.status_code == 200:
            proxy['delay'] = delay
            if method == 'insert':
                MongoDB().insert(proxy)
            elif method == 'check':
                MongoDB().update({'proxy': proxy['proxy']},
                                 {'delay': proxy['delay']})
        elif method == 'check':
            # Dead proxy: drop it from storage during a check pass.
            MongoDB().delete({'proxy': proxy['proxy']})
    except (ProxyError, ConnectionError):
        if method == 'check':
            MongoDB().delete({'proxy': proxy['proxy']})
    except Exception:
        traceback.print_exc()