代理池-豆瓣电影
代理池
实现了一个简单的代理池：获取免费代理 IP，筛选出可用的 IP，并通过它爬取豆瓣电影 Top250。
import requests
import re
from lxml import etree
# Listing page the free proxy candidates are read from.
url = 'https://www.xicidaili.com/nn/'

# Headers sent with the proxy-list and movie-page requests; the User-Agent is
# a desktop Chrome string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}
def get(proxies):
    """Scrape the Douban Top 250 listing through the given proxy.

    The listing is requested in pages of 25 (``start`` = 0, 25, ..., 225).
    For every ``div.item`` node the title text, link, rating text and
    rating-count text are extracted and appended to the module-level lists
    ``title``, ``movie_url``, ``fen`` and ``ping``. Those lists must already
    exist when this function is called.

    Fields are read per item rather than with page-wide queries, so index
    ``k`` of all four lists always refers to the same item. A page-wide query
    can return a different number of matches per field (for example when an
    item contains more than one link), which shifts rows once the lists are
    zipped together. A field that is missing from an item is stored as ``""``.

    Results are buffered locally and published only after every page has been
    fetched, so a failure part-way leaves the module-level lists untouched and
    a retry with another proxy does not produce duplicates.

    Args:
        proxies: Proxy mapping passed straight to ``requests.get``.

    Raises:
        requests.exceptions.RequestException: On a network failure, a timeout
            or an HTTP error status.
    """
    url2 = 'https://movie.douban.com/top250'

    def _first(nodes):
        # XPath returns a list; keep the first match or a placeholder so
        # every item contributes exactly one value per field.
        return nodes[0] if nodes else ""

    rows = []
    for i in range(0, 250, 25):
        response = requests.get(
            url=url2,
            headers=headers,
            proxies=proxies,
            params={'start': i},
            timeout=10,  # without a timeout a dead proxy blocks forever
        )
        # Do not parse an error page as if it were a result page.
        response.raise_for_status()
        tree = etree.HTML(response.text)
        for item in tree.xpath("//div[@class='item']"):
            rows.append((
                _first(item.xpath(".//a/span[1]/text()")),
                _first(item.xpath(".//a/@href")),
                _first(item.xpath(".//div[@class='star']//span[2]/text()")),
                _first(item.xpath(".//div[@class='star']//span[4]/text()")),
            ))

    # Publish only after all pages succeeded.
    for name, link, score, votes in rows:
        title.append(name)
        movie_url.append(link)
        fen.append(score)
        ping.append(votes)
# Build the proxy pool
def ip_run():
    """Find a working free proxy and run the scraper through it.

    Downloads the proxy listing at the module-level ``url``, extracts the
    IPv4 addresses and port numbers from its table cells, and tries each
    candidate in turn: a probe request to ``http://www.baidu.com`` with a
    2 second timeout, followed by ``get(proxies)``. The loop stops after the
    first candidate for which both steps complete without raising; any
    failure is reported and the next candidate is tried.

    Raises:
        requests.exceptions.RequestException: If the proxy listing itself
            cannot be downloaded.
    """
    # A timeout keeps an unreachable listing site from hanging the script.
    ip_response = requests.get(url=url, headers=headers, timeout=10).text

    # Raw strings: "\d" and "\." in a normal string literal are invalid
    # escape sequences.
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response, re.S)
    ports = re.findall(r"<td>(\d+)</td>", ip_response, re.S)

    # NOTE(review): addresses and ports are paired purely by order of
    # appearance; this assumes every row has exactly one all-digit cell.
    # Confirm against the page layout.
    for ip, port in zip(ips, ports):
        proxy_url = "http://" + ip + ":" + port
        proxies = {
            "http": proxy_url,
            "https": proxy_url,
        }
        try:
            requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
            print("ip能使用")
            get(proxies)
        except Exception as e:
            # Best effort: free proxies fail in many ways, so any error just
            # moves on to the next candidate, but the reason is reported.
            print("ip不能使用", e)
        else:
            break
if __name__ == '__main__':
    # Module-level result lists; get() appends to them by name, so they must
    # exist before ip_run() is called.
    title, movie_url, fen, ping = [], [], [], []

    ip_run()

    # Print one (title, url, rating, rating-count) tuple per line.
    for row in zip(title, movie_url, fen, ping):
        print(row)