《爬虫》爬取可用的免费IP

import telnetlib
import urllib.request
from bs4 import BeautifulSoup

import socket  # plain TCP probe; telnetlib (used before) was removed in Python 3.13

# Scrape listing pages 1 and 2, keep every proxy whose advertised speed is
# under 0.2 seconds, verify that its port accepts a TCP connection, and append
# the working ones to proxy_list.txt as "http://ip:port" (one per line).
for d in range(1, 3):  # pages 1 and 2
	scrapeUrl = 'http://www.xicidaili.com/nn/%d/' % d
	req = urllib.request.Request(scrapeUrl)
	req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
	# Close the HTTP response deterministically once the body has been read.
	with urllib.request.urlopen(req) as response:
		html = response.read()

	bsObj = BeautifulSoup(html, "html.parser")

	# Collect the <td> cells once instead of re-querying the document for
	# every field of every row.
	cells = bsObj.select('td')

	# The index arithmetic treats every 10 consecutive cells as one table row:
	# offset 1 is the IP, offset 2 the port, offset 6 the speed cell.
	# Iterating over the cells that actually exist avoids an IndexError when a
	# page holds fewer than 100 rows.
	for row_start in range(0, len(cells) - 9, 10):
		speed_div = cells[row_start + 6].div
		if speed_div is None:
			continue  # unexpected row layout: no speed indicator
		try:
			speed = float((speed_div.get('title') or '').replace('秒', ''))
		except ValueError:
			continue  # speed text is not a number
		if speed >= 0.2:  # only keep proxies faster than 0.2 seconds
			continue

		ip = cells[row_start + 1].get_text().strip()
		port = cells[row_start + 2].get_text().strip()
		ip_address = 'http://' + ip + ':' + port
		try:
			# Open and immediately close a TCP connection to check reachability.
			socket.create_connection((ip, int(port)), timeout=2).close()
		except (OSError, ValueError, OverflowError):
			# OSError covers refused/timeout/DNS errors; the others cover a
			# malformed or out-of-range port.
			print('fail')
		else:
			print('sucess：' + ip_address)
			with open('proxy_list.txt', 'a') as f:
				f.write(ip_address + '\n')

版本二：

import threading
import time
import json

import telnetlib


class TestProxy(object):
	"""Check which proxies listed in today's file accept a TCP connection.

	The source file (``YYYYMMDD.txt``, named after the local date) is read as
	one JSON object per line with the keys ``protocol``, ``ip`` and ``port``.
	Every proxy that accepts a connection is written to ``alive.txt`` as
	``protocol://ip:port``, one per line.

	Instantiating the class runs the whole check immediately.
	"""

	def __init__(self):
		today = time.strftime('%Y%m%d', time.localtime())
		self.filename = today + '.txt'
		self.sFile = self.filename  # source file with one JSON record per line
		self.dFile = r'alive.txt'  # destination file for reachable proxies
		self.URL = r'http://www.baidu.com'
		self.threads = 10  # maximum number of probes running at the same time
		self.timeout = 3  # per-connection timeout in seconds
		self.aliveList = []

		self.run()

	def run(self):
		"""Probe every proxy in ``self.sFile`` and write the live ones to ``self.dFile``.

		Proxies are probed in batches of at most ``self.threads`` concurrent
		threads. Each batch is joined before the next one starts, so the
		result file is only written after every probe has finished.

		Raises:
			OSError: if the source file cannot be read or the destination
				file cannot be written.
		"""
		with open(self.sFile, 'r', encoding='utf-8') as f:
			# Drop blank lines; an empty file simply yields no work.
			lines = [raw.strip() for raw in f if raw.strip()]

		batch_size = max(1, self.threads)
		for start in range(0, len(lines), batch_size):
			workers = [
				threading.Thread(target=self.linkWithProxy, args=(line,))
				for line in lines[start:start + batch_size]
			]
			for worker in workers:
				worker.start()
			# Wait for the batch so aliveList is complete before it is saved
			# and so no more than batch_size probes run at once.
			for worker in workers:
				worker.join()

		with open(self.dFile, 'w') as f:
			for server in self.aliveList:
				f.write(server + '\n')

	def linkWithProxy(self, line):
		"""Probe a single proxy and record it in ``self.aliveList`` if reachable.

		Args:
			line: one record from the source file, either as its JSON text or
				as an already-decoded dict with the keys ``protocol``, ``ip``
				and ``port``.

		Returns:
			None. Malformed records and unreachable proxies are reported on
			stdout and skipped.
		"""
		import socket  # local import: this block needs nothing beyond the file's own imports

		try:
			record = line if isinstance(line, dict) else json.loads(line)
			protocol = record['protocol'].lower()
			ip = record['ip']
			port = record['port']
			# str() lets the port be stored as either a JSON string or number.
			server = protocol + '://' + ip + ':' + str(port)
		except (ValueError, KeyError, TypeError, AttributeError):
			# Invalid JSON, a missing key, or a field of the wrong type.
			print('%r 解析失败' % (line,))
			return

		print(server)
		try:
			# Open and immediately close a TCP connection to check reachability.
			socket.create_connection((ip, int(port)), timeout=self.timeout).close()
		except (OSError, ValueError, TypeError, OverflowError):
			# OSError covers refused/timeout/DNS errors; the others cover a
			# malformed or out-of-range port.
			print('%s 链接失败' % server)
			return

		print('%s 链接成功！' % server)
		self.aliveList.append(server)
		print(self.aliveList)


# Script entry point: constructing TestProxy runs the whole check, because
# its __init__ calls run().
if __name__ == '__main__':
	TP = TestProxy()

posted @ 2020-04-08 15:09 水墨黑阅读(447) 评论(0) 编辑收藏举报

刷新页面返回顶部

水墨黑

《爬虫》爬取可用的免费IP

公告