ip代理池学习
代理的作用
网上有许多售卖代理的网站,也有免费的,不过其有效性会受影响。通过代理,我们可以向访问的目标服务器隐藏自己的真实 IP,避免 IP 地址因访问频率过高等原因被封。
步骤
1.搜集一个免费的代理
2.通过 urllib.request 的 ProxyHandler 构造一个代理处理器,参数为字典形式,键名是协议
# Example proxy endpoint in host:port form.
proxy = '95.45.235.178:40056'
# ProxyHandler takes a dict mapping URL scheme -> full proxy URL.
proxy_handler = ProxyHandler({
'http': 'http://'+proxy,
'https': 'https://'+proxy
})
3.通过urllib.request
的build_opener
构造一个请求方法
# build_opener returns an OpenerDirector whose requests go through the proxy.
opener = build_opener(proxy_handler)
4.发起请求
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

# Proxy endpoint in host:port form.
proxy = '95.45.235.178:40056'
# If the proxy requires authentication, use:
# proxy = 'username:password@95.45.235.178:40056'
handler = ProxyHandler({scheme: scheme + '://' + proxy
                        for scheme in ('http', 'https')})
opener = build_opener(handler)
try:
    response = opener.open('http://httpbin.org/get')
except URLError as e:
    # Dead/unreachable proxy: report the underlying reason instead of crashing.
    print(e.reason)
else:
    print(response.read().decode("utf-8"))
完整代码
from time import sleep

import requests
from lxml import etree
from requests.exceptions import ConnectTimeout, ProxyError, ReadTimeout, RequestException
# Browser-like default headers for outgoing requests (helps avoid naive
# anti-bot blocking on the crawled site).
headers = {
'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'),
}
# Proxy pool buckets: key 1 = proxies currently believed usable,
# key 0 = proxies that failed their last check. ("ip_poor" is likely a
# typo for "ip_pool"; kept unchanged because the other functions reference it.)
ip_poor = {
0: [], 1: []
}
# 爬取免费 ip
def crawl_ip():
    """Scrape pages 1-10 of kuaidaili's free proxy listing.

    Returns:
        A list of per-page lists, each containing 'ip:port' strings.
        (Callers flatten the nesting themselves — the shape is kept
        for backward compatibility with main().)
    """
    ip_list = []
    for page in range(1, 11):
        response = requests.get(
            "https://www.kuaidaili.com/free/inha/{}/".format(page),
            headers=headers,  # reuse the module-level browser-like UA
            timeout=10,       # don't hang forever on a slow page
        )
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        ip = html.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        port = html.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        # Pair each address with its port column; zip stops at the shorter list.
        ip_list.append([addr + ':' + p for addr, p in zip(ip, port)])
        sleep(2)  # be polite: throttle requests to the free-proxy site
    return ip_list
# 测试 ip 的可用性
def test_ip(ip_addr):
    """Return True if ``ip_addr`` ('host:port') works as an HTTP(S) proxy.

    Any request failure is treated as "proxy unusable" rather than being
    propagated: the original code caught only ProxyError/ConnectTimeout/
    ReadTimeout, but a dead proxy frequently raises ConnectionError (or
    SSLError), which would have crashed the caller. RequestException is
    the common base of all of these.
    """
    proxies = {
        'http': 'http://' + ip_addr,
        'https': 'https://' + ip_addr,
    }
    try:
        requests.get('http://httpbin.org/get', proxies=proxies,
                     headers=headers, timeout=5)
        return True
    except RequestException:
        return False
# 测试入口
def test():
    """Re-validate every proxy in the pool and rebalance the buckets.

    Proxies in the usable bucket (key 1) that fail are demoted to bucket 0;
    proxies in bucket 0 that fail again are discarded, while ones that
    recover are promoted back to bucket 1.

    Bug fix: the original called ``ip_poor.pop(ip_poor[0].index(ip))``,
    which pops from the *dict* by a list index — deleting bucket 0 outright
    when the index was 0, or raising KeyError otherwise. The element must
    be removed from the ``ip_poor[0]`` list instead.

    Returns:
        The (mutated) module-level ip_poor dict.
    """
    # Iterate over snapshots so the underlying lists can be mutated safely.
    for ip in list(ip_poor[1]):
        if test_ip(ip) is False:
            # Demote: proxy no longer answers.
            ip_poor[1].remove(ip)
            ip_poor[0].append(ip)
    # Note: proxies demoted above are immediately re-tested here,
    # matching the original control flow.
    for ip in list(ip_poor[0]):
        if test_ip(ip) is False:
            # Failed again: drop it from the pool entirely.
            ip_poor[0].remove(ip)
        else:
            # Recovered: promote back to the usable bucket.
            ip_poor[0].remove(ip)
            ip_poor[1].append(ip)
    return ip_poor
def main():
    """Seed the usable bucket with freshly crawled proxies, then validate them."""
    for page_ips in crawl_ip():
        # crawl_ip returns one list per page; flatten into bucket 1.
        ip_poor[1].extend(page_ips)
    test()


if __name__ == '__main__':
    main()