Python Web Scraping (Proxy IPs) -- lizaza.cn
When building web crawlers, you will often find that after too many requests the target site stops serving you. Proxy IPs solve this, but the proxy lists available online are either paid or offer no API. In the spirit of spending nothing, let's build our own proxy IP pool.
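For context, routing a request through a proxy with requests comes down to passing a proxies mapping to requests.get. A minimal sketch (the proxy address below is a placeholder, not a working proxy):

import requests

# Placeholder proxy address -- substitute a live one from your own pool
proxies = {
    "http": "http://1.2.3.4:8080",
    "https": "http://1.2.3.4:8080",
}
# requests routes traffic through whichever entry matches the URL scheme
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
print(resp.text)  # the responding IP should be the proxy's, not yours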
Enough talk; here is the code:
import requests
from bs4 import BeautifulSoup


# Send a request (through a seed proxy) and return the page HTML
def GetInfo(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Seed proxy for fetching the list pages; replace it with one that works for you.
    # Note: with only an 'http' key, https pages are fetched directly; add an
    # 'https' entry if you want them proxied as well.
    proxies = {"http": "https://119.180.173.81:8060"}
    response = requests.get(url=url, proxies=proxies, headers=headers)
    response.encoding = "utf8"
    return response.text


# Scrape the first 100 list pages and append the working proxies to a file
def WriteData():
    for i in range(100):
        url = "https://www.xicidaili.com/nn/" + str(i + 1)
        data = GetData(url)
        with open('Proxies.txt', 'a+') as file:
            # Newline so each page's list sits on its own line
            file.write(str(data) + "\n")


# Check whether a proxy works by fetching a test page through it
def verify(proxies):
    try:
        req = requests.get("https://www.baidu.com", proxies=proxies, timeout=5)
        return req.status_code
    except requests.RequestException:
        # Dead or unreachable proxy: report failure instead of crashing
        return None


# Parse one list page and return the proxies that pass verification
def GetData(url):
    data = list()
    html = GetInfo(url)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find_all("table", id="ip_list")
    soup = BeautifulSoup(str(table[0]), "lxml")
    trs = soup.find_all("tr")
    del trs[0]  # drop the header row
    for tr in trs:
        ip = tr.select("td")[1].get_text()
        port = tr.select("td")[2].get_text()
        protocol = tr.select("td")[5].get_text()
        address = protocol.lower() + "://" + ip + ":" + port
        # Cover both schemes so the https test request actually uses the proxy
        proxies = {'http': address, 'https': address}
        if verify(proxies) == 200:
            data.append(address)
    return data


if __name__ == '__main__':
    WriteData()
Returned data:
['http://111.222.141.127:8118', 'https://117.88.177.101:3000', 'http://183.166.136.144:8888', 'http://27.208.231.100:8060', 'http://123.169.99.177:9999', 'http://119.84.84.185:12345', 'http://101.132.190.101:80', 'https://114.99.54.65:8118', 'https://119.4.13.26:1133', 'http://58.253.158.177:9999', 'http://114.223.208.165:8118', 'http://112.84.73.53:9999']
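To actually use the pool, one option (a sketch, assuming WriteData appends each page's list on its own line, as in the version above) is to parse the list literals back and pick a proxy at random:

import ast
import random
import requests

# Rebuild one flat pool from the list literals written by WriteData
pool = []
with open('Proxies.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            pool.extend(ast.literal_eval(line))

# Route a request through a randomly chosen proxy
address = random.choice(pool)
proxies = {'http': address, 'https': address}
resp = requests.get('https://www.baidu.com', proxies=proxies, timeout=5)
print(resp.status_code)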