Building a Crawler Proxy IP Pool
Building the proxy IP pool with a single thread
#!/usr/bin/python3.5
# -*- coding:utf-8 -*-
import time
import tempfile
from lxml import etree
from urllib import request

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'

def get_content(url):                          # fetch the page content
    global user_agent
    headers = {'User-Agent': user_agent}
    req = request.Request(url=url, headers=headers)
    res = request.urlopen(req)
    return res.read().decode('utf-8')

def get_info(tmp, content):                    # extract ip / port pairs from the page
    ip_list = etree.HTML(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()')
    port_list = etree.HTML(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()')
    for i in range(0, len(ip_list)):
        out = ip_list[i] + u":" + port_list[i]
        tmp.write((out + u"\n").encode('utf-8'))   # write every ip:port to the temp file

def verify_ip(ip, port, test_url):             # verify that ip:port actually works (urllib version)
    global user_agent
    headers = {'User-Agent': user_agent, 'Host': 'www.12306.cn', 'Referer': 'http://www.12306.cn/'}
    proxy = {'http': 'http://%s:%s' % (ip, port)}
    print(proxy)
    proxy_handler = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
    req = request.Request(url=test_url, headers=headers)
    time.sleep(1)
    try:
        res = request.urlopen(req)
        time.sleep(2)
        content = res.read()
        if content:
            print('{0}:{1} is ok'.format(ip, port))
            with open("proxy_info.txt", "a") as fd:    # save working ip:port to proxy_info.txt
                fd.write(ip + u":" + port + "\n")
        else:
            print('{0}:{1} is unavailable'.format(ip, port))
    except request.URLError as e:
        print(e.reason)

def verify_ip2(ip, port, test_url):            # verify that ip:port actually works (requests version)
    import requests
    try:
        response = requests.get(test_url, proxies={'http': 'http://{0}:{1}'.format(ip, port)}, timeout=2)
        # print(response.status_code)
    except Exception as e:
        print("{0}:{1} failed".format(ip, port), e)
    else:
        print("{0}:{1} is ok".format(ip, port))
        with open("proxy_info.txt", "a") as fd:        # save working ip:port to proxy_info.txt
            fd.write(ip + u":" + port + "\n")

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    test_url = "http://httpbin.org/"
    url_list = [url + str(i) for i in range(1, 2)]
    tmp = tempfile.TemporaryFile()
    for url in url_list:
        content = get_content(url)
        time.sleep(2)
        get_info(tmp, content)
    tmp.seek(0)
    for item in tmp.readlines():
        item = item.decode('utf-8')
        # verify_ip(item.split(u":")[0], item.split(u":")[1].strip(), test_url)
        verify_ip2(item.split(u":")[0], item.split(u":")[1].strip(), test_url)
    tmp.close()
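After the script finishes, proxy_info.txt contains one working ip:port per line. As a rough sketch of how that file might be consumed afterwards (the load_pool and get_with_proxy helpers and the random rotation are illustrative assumptions, not part of the original script):

# Minimal sketch: pick a random proxy from proxy_info.txt for each request.
# Assumes proxy_info.txt was produced by the script above (one ip:port per line).
import random
import requests

def load_pool(path='proxy_info.txt'):
    with open(path) as fd:
        return [line.strip() for line in fd if line.strip()]

def get_with_proxy(url, pool):
    ipport = random.choice(pool)                       # rotate proxies at random
    proxies = {'http': 'http://{}'.format(ipport)}
    return requests.get(url, proxies=proxies, timeout=3)

if __name__ == '__main__':
    pool = load_pool()
    resp = get_with_proxy('http://httpbin.org/ip', pool)
    print(resp.status_code, resp.text)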
Using a thread pool to speed up proxy verification
concurrent.futures.ThreadPoolExecutor
#!/usr/bin/python3.5
# -*- coding:utf-8 -*-
import time
import tempfile
from lxml import etree
from urllib import request
from concurrent.futures import ThreadPoolExecutor

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'
ip2port = []

def get_content(url):                          # fetch the page content
    global user_agent
    headers = {'User-Agent': user_agent}
    req = request.Request(url=url, headers=headers)
    res = request.urlopen(req)
    return res.read().decode('utf-8')

def get_info(tmp, content):                    # extract ip / port pairs from the page
    ip_list = etree.HTML(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()')
    port_list = etree.HTML(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()')
    for i in range(0, len(ip_list)):
        out = ip_list[i] + u":" + port_list[i]
        tmp.write((out + u"\n").encode('utf-8'))   # write every ip:port to the temp file

def verify_ip(ip, port, url):                  # verify one ip:port; return a small result dict
    ret = {'code': -1, 'ipport': None}
    import requests
    try:
        response = requests.get(url, proxies={'http': 'http://{0}:{1}'.format(ip, port)}, timeout=3)
        print('{}:{} --> {}'.format(ip, port, response.status_code))
    except Exception as e:
        # print("{0}:{1} failed".format(ip, port), e)
        pass
    else:
        # print("{0}:{1} is ok".format(ip, port))
        if 200 == response.status_code:
            ret['code'] = 0
            ret['ipport'] = '{0}:{1}'.format(ip, port)
    finally:
        return ret

def callback(future):                          # collect working proxies as futures complete
    global ip2port
    ret = future.result()
    if 0 == ret['code']:
        ip2port.append(ret['ipport'])

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    verify_url = "http://httpbin.org/"
    url_list = [url + str(i) for i in range(1, 2)]
    tmp = tempfile.TemporaryFile()
    for url in url_list:
        content = get_content(url)
        time.sleep(2)
        get_info(tmp, content)
    print('Raw data downloaded, building the proxy pool...')
    tmp.seek(0)
    ipports = [item.decode('utf-8').strip().split(':') for item in tmp.readlines()]
    tmp.close()
    pool = ThreadPoolExecutor(20)
    for ipport in ipports:
        ip, port = ipport
        v = pool.submit(verify_ip, ip, port, verify_url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
    print('Proxy pool built, {} working proxies in total'.format(len(ip2port)))
    print(ip2port)
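The submit / add_done_callback pattern above collects results by appending to the global ip2port list inside a callback. If you prefer to avoid the global and the callback, the same collection can be done with concurrent.futures.as_completed; a minimal sketch that assumes the verify_ip function, the ipports list and verify_url from the script above (build_pool is an illustrative name):

# Sketch: collect results with as_completed instead of add_done_callback.
# Assumes verify_ip, ipports and verify_url are defined as in the script above.
from concurrent.futures import ThreadPoolExecutor, as_completed

def build_pool(ipports, verify_url, workers=20):
    ip2port = []
    with ThreadPoolExecutor(workers) as pool:
        futures = [pool.submit(verify_ip, ip, port, verify_url) for ip, port in ipports]
        for future in as_completed(futures):
            ret = future.result()
            if ret['code'] == 0:
                ip2port.append(ret['ipport'])
    return ip2port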
multiprocessing.dummy.Pool
import time
import requests
from lxml import etree
from requests.exceptions import RequestException
from multiprocessing.dummy import Pool as ThreadPool

available_proxies = []

def get_one_page(url):                         # download one listing page
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def get_one_parse(url):                        # parse ip:port pairs out of one page
    print('url: {}'.format(url))
    ipports = []
    html = get_one_page(url)
    if html:
        html = etree.HTML(html)
        ips = html.xpath('.//*[@id="list"]/table/tbody//td[1]/text()')
        ports = html.xpath('.//*[@id="list"]/table/tbody//td[2]/text()')
        for (ip, port) in zip(ips, ports):
            ipports.append('{}:{}'.format(ip, port))
        ipports = list(set(ipports))
        print('res: {}'.format(ipports))
        return ipports
    return None

def fetch(all_proxies):                        # crawl the first 60 listing pages
    url = 'https://www.kuaidaili.com/free/intr/{}/'
    for i in range(1, 61):
        ret = get_one_parse(url.format(i))
        if ret:
            all_proxies.extend(ret)
        time.sleep(1)
    all_proxies[:] = list(set(all_proxies))    # dedupe in place so the caller sees the result
    print('Crawled the first 60 pages, {} unique proxies after deduplication'.format(len(all_proxies)))

def save():                                    # persist the working proxies
    with open('ip2port.txt', 'a+') as wf:
        for item in available_proxies:
            wf.write(item + '\n')
    print('Saved {} working proxies'.format(len(available_proxies)))

def sub_verify(item):                          # check a single ip:port through httpbin
    proxy = {'http': 'http://{0}'.format(item)}
    try:
        response = requests.get("http://httpbin.org/", proxies=proxy, timeout=3)
        if response.status_code == 200:
            print("{} is ok".format(item))
            available_proxies.append(item)
    except Exception:
        print("{} failed".format(item))

def verify(ipports):                           # verify all proxies with a thread pool
    print('Verifying proxies...')
    pool = ThreadPool(20)
    pool.map(sub_verify, ipports)
    print('Verification finished, {} working proxies in total'.format(len(available_proxies)))
    save()

if __name__ == "__main__":
    all_proxies = []
    fetch(all_proxies)
    print(all_proxies, len(all_proxies))
    ipports = list(map(lambda x: x.strip(), all_proxies))
    verify(ipports)
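sub_verify above records working proxies by appending to the module-level available_proxies list from worker threads. Since Pool.map already returns one result per input, the shared list can be avoided by returning the proxy (or None) from the worker and filtering afterwards; a minimal sketch under the same imports (check_one and verify_all are illustrative names, not from the original):

# Sketch: let Pool.map return results instead of appending to a global list.
# Assumes the same imports as the script above (requests, multiprocessing.dummy.Pool as ThreadPool).
def check_one(item):
    proxy = {'http': 'http://{0}'.format(item)}
    try:
        response = requests.get("http://httpbin.org/", proxies=proxy, timeout=3)
        return item if response.status_code == 200 else None
    except Exception:
        return None

def verify_all(ipports, workers=20):
    pool = ThreadPool(workers)
    results = pool.map(check_one, ipports)     # one result per input, in order
    pool.close()
    pool.join()
    return [item for item in results if item]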
Author: Standby, with a lifelong love of great mountains and rivers, grasslands and deserts, and our dear little Guo!
Source: http://www.cnblogs.com/standby/
The copyright of this article is shared by the author and cnblogs. Reposting is welcome, but without the author's consent this statement must be kept and a clearly visible link to the original article must be provided on the page; otherwise the author reserves the right to pursue legal liability.