Scraping a proxy pool (high-anonymity HTTP)
Target websites
66ip
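Every snippet below assumes the same imports and a `headers` dict carrying a browser User-Agent have already been defined. A minimal setup sketch (the User-Agent string here is just an example):

```python
import json

import requests
from lxml import etree

## shared request headers; any common browser User-Agent works here
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

temp = []   # ip:port strings from all three sites are collected here
```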
```python
## single-page scraping function
def parse_page(url):
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        html = etree.HTML(r.text)
        trs = html.xpath('//div[@align="center"]/table//tr')
        for tr in trs[1:]:      # skip the table header row
            ip = tr.xpath('.//td[1]/text()')[0]
            port = tr.xpath('.//td[2]/text()')[0]
            ip_port = ip + ":" + port       # appended to temp in the full script

def main():
    ## page through the 33 area index pages
    for i in range(1, 34):
        url = "http://www.66ip.cn/areaindex_%d/1.html" % i
        parse_page(url)

if __name__ == '__main__':
    main()
```
proxylist
The site is http://proxylist.fatezero.org/.
Viewing the page source reveals no IP addresses at all, which means the page loads its data via Ajax. Inspecting it in the browser's Network tab makes the API endpoint easy to find.
Opening that endpoint is very convenient: it returns one JSON-wrapped proxy per line, with lines separated by the newline character \n, so all we need to request is the endpoint itself, http://proxylist.fatezero.org/proxy.list.
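For illustration, each line of the response is a standalone JSON object. A hypothetical line (the values are made up, and real lines carry more fields than the four keys used below) parses like this:

```python
import json

## a made-up example of one line from proxy.list
line = '{"host": "1.2.3.4", "port": 8080, "type": "http", "anonymity": "high_anonymous"}'
li = json.loads(line, strict=False)
print(li['host'], li['port'])   # -> 1.2.3.4 8080
```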
Here is the code:
```python
url = "http://proxylist.fatezero.org/proxy.list"
r = requests.get(url, headers=headers)
if r.status_code == 200:
    ## split turns each proxy line into one list element
    lists = r.text.split('\n')
    for i in lists:
        try:
            li = json.loads(i, strict=False)
            ## keep only high-anonymity http proxies
            if str(li['anonymity']) == 'high_anonymous' and str(li['type']) == 'http':
                ip_port = str(li['host']) + ":" + str(li['port'])
        except:
            continue
```
Kuaidaili
Kuaidaili is fairly simple to scrape; the code is as follows:
```python
## single-page scraping function
def parse_page(url):
    r = requests.get(url, headers=headers)
    html = etree.HTML(r.text)
    trs = html.xpath('//tbody//tr')
    for tr in trs:
        ip = tr.xpath('./td[1]/text()')[0]
        port = tr.xpath('./td[2]/text()')[0]
        ip_port = ip + ":" + port       # appended to temp in the full script

def main():
    ## page through the free high-anonymity list
    for i in range(1, 30):
        url = "https://www.kuaidaili.com/free/inha/%d" % i
        parse_page(url)

if __name__ == '__main__':
    main()
```
Validating that the IPs work
The ip:port strings scraped from the three sites above are collected in a list called temp, which is then filtered like this:
```python
def test_proxy():
    ## iterate over a copy so that removing items does not skip entries
    for ip_port in temp[:]:
        proxy = {
            'http': 'http://' + ip_port     # requests expects the scheme in proxy URLs
        }
        try:
            r = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=5)
            print(r.status_code)
            if r.status_code != 200:
                temp.remove(ip_port)
        except:
            temp.remove(ip_port)
            print("failed:{}".format(ip_port))
```
The try/except throws out proxies that cannot reach www.baidu.com at all, and the status-code check throws out proxies that respond with anything other than 200. Note that the loop walks over a copy of temp (temp[:]), because removing items from a list while iterating over it directly would skip entries.
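A reachability check alone does not prove a proxy is really high-anonymity. As an optional extra step, not part of the original script, here is a sketch that uses the public echo endpoint http://httpbin.org/ip to compare the address the target sees through the proxy with your own:

```python
import requests

def looks_anonymous(ip_port, timeout=5):
    """Return True if the IP seen through the proxy differs from our real one."""
    proxy = {'http': 'http://' + ip_port}
    try:
        real_ip = requests.get('http://httpbin.org/ip', timeout=timeout).json()['origin']
        seen_ip = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout).json()['origin']
        ## a transparent proxy may still leak our real IP inside the origin field
        return real_ip not in seen_ip
    except Exception:
        return False
```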
Complete code
```python
from lxml import etree
import requests
import json

## shared request headers; any common browser User-Agent works here
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

temp = []   # ip:port strings from all three sites are collected here

def get_66ip():
    def parse_page(url):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            html = etree.HTML(r.text)
            trs = html.xpath('//div[@align="center"]/table//tr')
            for tr in trs[1:]:      # skip the table header row
                ip = tr.xpath('.//td[1]/text()')[0]
                port = tr.xpath('.//td[2]/text()')[0]
                ip_port = ip + ":" + port
                temp.append(ip_port)

    ## page through the 33 area index pages
    for i in range(1, 34):
        url = "http://www.66ip.cn/areaindex_%d/1.html" % i
        parse_page(url)

def pro():
    url = "http://proxylist.fatezero.org/proxy.list"
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        ## each line of the response is one JSON-encoded proxy
        lists = r.text.split('\n')
        for i in lists:
            try:
                li = json.loads(i, strict=False)
                ## keep only high-anonymity http proxies
                if str(li['anonymity']) == 'high_anonymous' and str(li['type']) == 'http':
                    ip_port = str(li['host']) + ":" + str(li['port'])
                    temp.append(ip_port)
            except:
                continue

def kuai():
    def parse_page(url):
        r = requests.get(url, headers=headers)
        html = etree.HTML(r.text)
        trs = html.xpath('//tbody//tr')
        for tr in trs:
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            ip_port = ip + ":" + port
            temp.append(ip_port)

    ## page through the free high-anonymity list
    for i in range(1, 30):
        url = "https://www.kuaidaili.com/free/inha/%d" % i
        parse_page(url)

def test_proxy():
    ## iterate over a copy so that removing items does not skip entries
    for ip_port in temp[:]:
        proxy = {
            'http': 'http://' + ip_port     # requests expects the scheme in proxy URLs
        }
        try:
            r = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=5)
            print(r.status_code)
            if r.status_code != 200:
                temp.remove(ip_port)
        except:
            temp.remove(ip_port)
            print("failed:{}".format(ip_port))

if __name__ == '__main__':
    get_66ip()
    kuai()
    pro()
    test_proxy()
    ## append the validated proxies to a text file, one per line
    with open('2222.txt', 'a', encoding='utf-8') as f:
        for i in temp:
            print(i)
            f.write(i)
            f.write('\n')
```
Wrapping up
That is all for scraping proxy IPs. You could also deploy this on a server and expose it as an API; if you are interested, explore that yourself. A minimal sketch of such an API follows.
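This is only a sketch, assuming Flask is installed and that the validated proxies have been written to 2222.txt as above (the /get route name is arbitrary); it serves one random proxy per request:

```python
import random

from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/get')
def get_proxy():
    ## read the validated proxies written by the scraper
    with open('2222.txt', encoding='utf-8') as f:
        proxies = [line.strip() for line in f if line.strip()]
    if not proxies:
        return jsonify({'error': 'no proxies available'}), 500
    return jsonify({'proxy': random.choice(proxies)})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```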