《爬虫》爬取可用的免费IP
import telnetlib  # NOTE(review): deprecated since 3.11, removed in 3.13 (PEP 594)
import urllib.request

from bs4 import BeautifulSoup

# Version 1: scrape pages 1-2 of the free-proxy listing, keep only proxies
# whose advertised speed is under 0.2 s AND that accept a TCP connection,
# and append the survivors to proxy_list.txt.
for page in range(1, 3):  # pages 1 and 2
    scrape_url = 'http://www.xicidaili.com/nn/%d/' % page
    req = urllib.request.Request(scrape_url)
    req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    response = urllib.request.urlopen(req)
    html = response.read()
    bs_obj = BeautifulSoup(html, "html.parser")
    # Hoist the DOM query: the original re-ran select('td') for every single
    # cell access, re-scanning the whole document each time.
    cells = bs_obj.select('td')
    # Each table row spans 10 <td> cells; iterate over however many rows the
    # page actually has instead of a hard-coded 100 (avoids IndexError on
    # short pages).
    for i in range(len(cells) // 10):
        # Cell 6 of the row holds the speed widget; its title is e.g. "0.15秒".
        speed = float(cells[6 + i * 10].div.get('title').replace('秒', ''))
        if speed < 0.2:  # keep only proxies answering within 0.2 seconds
            ip = cells[1 + i * 10].get_text()
            port = cells[2 + i * 10].get_text()
            ip_address = 'http://' + ip + ':' + port
            try:
                # Verify the proxy really accepts connections; telnetlib
                # raises OSError subclasses (incl. socket.timeout) on failure.
                telnetlib.Telnet(ip, port=port, timeout=2)
            except OSError:
                print('fail')
            else:
                print('success:' + ip_address)  # fixed typo: was 'sucess:'
                # 'with' guarantees the file is closed even if write fails.
                with open('proxy_list.txt', 'a') as f:
                    f.write(ip_address + '\n')
版本二:
import json
import telnetlib  # NOTE(review): deprecated since 3.11, removed in 3.13 (PEP 594)
import threading
import time


class TestProxy(object):
    """Version 2: validate proxies listed in today's harvest file.

    The source file (``YYYYMMDD.txt``) holds one JSON object per line with
    keys ``protocol``, ``ip`` and ``port``.  Each proxy that accepts a TCP
    connection is written to ``alive.txt``.  All work happens in
    ``__init__`` via ``run()``.
    """

    def __init__(self):
        today = time.strftime('%Y%m%d', time.localtime())
        self.filename = today + '.txt'
        self.sFile = self.filename      # source: one JSON proxy per line
        self.dFile = r'alive.txt'       # destination for live proxies
        self.URL = r'http://www.baidu.com'
        self.threads = 10               # max concurrent probes
        self.timeout = 3                # telnet connect timeout, seconds
        self.aliveList = []
        # list.append from many threads is guarded explicitly rather than
        # relying on CPython implementation details.
        self.lock = threading.Lock()
        self.run()

    def run(self):
        """Probe every proxy concurrently, then persist the live ones.

        Fixes over the original: lines are no longer pre-parsed here (the
        original called json.loads both here and in linkWithProxy, passing
        an already-parsed dict to a second json.loads -> TypeError), the
        same line is no longer probed by several threads, and every batch
        is joined so alive.txt is written only after all probes finish.
        """
        with open(self.sFile, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]
        # Process in batches of self.threads to bound concurrency.
        for start in range(0, len(lines), self.threads):
            batch = [
                threading.Thread(target=self.linkWithProxy, args=(line,))
                for line in lines[start:start + self.threads]
            ]
            for t in batch:
                t.start()
            for t in batch:
                t.join()  # results must be complete before writing them out
        with open(self.dFile, 'w') as f:
            for server in self.aliveList:
                f.write(server + '\n')

    def linkWithProxy(self, line):
        """Telnet-probe one proxy; record it in aliveList when reachable.

        line: raw JSON string with keys 'protocol', 'ip', 'port'.
        """
        entry = json.loads(line)  # parsed exactly once, here
        ip = entry['ip']
        port = entry['port']
        server = entry['protocol'].lower() + '://' + ip + ':' + port
        print(server)
        try:
            # Connection success is the whole test; the Telnet object itself
            # is not used further.
            telnetlib.Telnet(ip, port=port, timeout=self.timeout)
        except OSError:
            print('%s 链接失败' % server)
        else:
            print('%s 链接成功!' % server)
            with self.lock:
                self.aliveList.append(server)
            print(self.aliveList)


if __name__ == '__main__':
    TP = TestProxy()