获取免费ip_存入excel_用了线程池_封装清晰

from lxml import etree
import time
import requests
from multiprocessing.pool import ThreadPool #线程池
import xlwt

class Visit():  # Scrapes proxy listings, checks each proxy, records the working ones
    """Scrape proxy addresses from listing pages and save working ones to Excel.

    Each proxy found by ``parse`` is checked with ``Test_ip``; proxies that
    answer are appended to column 0 of ``sheet1`` and the workbook is saved
    after every append so partial results survive an interrupted run.

    The row counter and workbook are updated without any locking, so
    ``parse``/``Test_ip``/``excel_write`` should be called from one thread
    at a time.
    """

    def __init__(self, output_path=r'e:\get_ip.xls', timeout=10):
        """Create the workbook and request settings.

        Args:
            output_path: File the workbook is saved to after every append.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.i = 0  # index of the next free row in sheet1
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        self.output_path = output_path
        self.timeout = timeout
        self.f = xlwt.Workbook(encoding='utf-8')
        self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)

    def get_html(self, url, retries=3):
        """Download *url* and return its text, or ``None`` on failure.

        A non-200 response is retried after a short pause, up to *retries*
        attempts in total (the original retried forever via recursion). A
        request error is reported and ends the attempt immediately.

        Args:
            url: Page to download.
            retries: Maximum number of attempts for non-200 responses.

        Returns:
            The decoded page text, or ``None`` if no attempt succeeded.
        """
        for _ in range(retries):
            try:
                res = requests.get(url, headers=self.headers, timeout=self.timeout)
            except requests.RequestException as e:
                print("问题是", e)
                return None
            if res.status_code == 200:
                # Let requests guess the charset from the body before decoding.
                res.encoding = res.apparent_encoding
                return res.text
            time.sleep(0.1)
        print("问题是", "no 200 response after {} attempts: {}".format(retries, url))
        return None

    def parse(self, response):
        """Extract ``ip:port`` pairs from a listing page and test each one.

        Args:
            response: HTML text of a listing page. ``None`` or blank text
                (e.g. a failed download) is ignored.
        """
        if not response or not response.strip():
            return
        content = etree.HTML(response)
        if content is None:
            return
        for tr in content.xpath('//table[@class="layui-table"]/tbody//tr'):
            ip_cells = tr.xpath("./td[1]/text()")
            port_cells = tr.xpath("./td[2]/text()")
            # Rows without text in the first two cells carry no proxy.
            if not ip_cells or not port_cells:
                continue
            # Cell text is padded with tabs/newlines; drop all whitespace.
            li_ip = "".join(str(ip_cells[0]).split())
            li_port = "".join(str(port_cells[0]).split())
            if not li_ip or not li_port:
                continue
            self.Test_ip("http://{}:{}".format(li_ip, li_port))

    def Test_ip(self, text_ip):
        """Request a fixed test URL through *text_ip* and record it if it works.

        Any response counts as success; only a request error (including a
        timeout) counts as failure.

        Args:
            text_ip: Proxy URL of the form ``http://host:port``.
        """
        try:
            requests.get(
                'http://wenshu.court.gov.cn/',
                proxies={"http": text_ip},
                timeout=self.timeout,  # a dead proxy must not hang the run
            )
        except requests.RequestException:
            print('代理链接失败connect failed', text_ip)
        else:
            print('代理成功success', text_ip)
            self.excel_write(text_ip)

    def excel_write(self, text_ip):
        """Append *text_ip* to the next free row and save the workbook.

        Args:
            text_ip: Proxy URL to store in column 0.
        """
        self.sheet1.write(self.i, 0, text_ip)
        self.i += 1
        self.f.save(self.output_path)  # save after every row
if __name__ == '__main__':
    # Page 1 is the site root; pages 2-10 follow the index_N.html pattern.
    urls = ["http://www.89ip.cn/"]
    urls.extend("http://www.89ip.cn/index_{}.html".format(i) for i in range(2, 11))

    a = Visit()
    pool = ThreadPool(16)  # thread pool; the argument is the number of worker threads
    try:
        # Pass the callable and its arguments to the pool. The original wrote
        # pool.apply_async(a.parse(...)), which ran parse in the main thread
        # and submitted its None result, so the pool never did any work.
        # Only the downloads run concurrently: get_html just reads shared
        # settings.
        pages = pool.map(a.get_html, urls)
    finally:
        pool.close()  # stop accepting new tasks
        pool.join()   # wait for the submitted tasks to finish

    # Parse one page at a time, and each page only once: parse() ends in
    # excel_write(), which updates a shared row counter and workbook with
    # no locking.
    for html in pages:
        if html:  # skip pages whose download failed
            a.parse(html)

posted on 2018-08-25 13:28  袁佳佳  阅读(167)  评论(0)  编辑  收藏  举报

导航