一开始是打算去抓取一些数据,但是总是访问次数多了之后被封IP,所以专门做了一个工具用来抓取在西刺和快站的高匿IP。
运行环境的话是在python3.5下运行的,需要requests库
在制作的过程中也参考了一下网上其他人的做法,但是发现很大一部分都不是多线程去抓取,有点浪费时间;又或者网上已经有人做好了轮子,但是以现在的技术还有点看不懂,所以就做了这个只在一个py文件上运行的代理池。
对于旧IP的处理,我这下面的代码是没有让他运行的,如果要运行可以开启,不过必须要在同一个文件夹里创建一个名为“old_ip.txt”的文档,本来可以做一个自动判定,没有就可以生成的,到最后又没弄了。
可能有部分人需要透明IP或者https代理ip,所以里面也有控制抓取哪几个网站的控制器,如把下面的:target3=False 改成 target3=True 就可以抓取西刺的透明代理了。
有一点是要重点注意一下的就是:不建议一下子抓取超过10页,因为我没有设定request访问限定,西刺或者快站都会因为你访问速度太快而判定你为爬虫封24小时的IP......我已经尝试过。
以下是代码:
# -*- coding: UTF-8 -*-
"""Multithreaded proxy-IP scraper for xicidaili.com and kuaidaili.com.

Workflow:
1. Scrape proxy IPs from xici / kuaidaili listing pages.
2. (Optional, commented out in ``get_ip``) re-validate previously saved IPs.
3. An IP is considered usable when a GET through it to the configured ``url``
   returns HTTP 200 within 5 seconds.
4. Freshly validated IPs go to ``path``; the accumulated pool goes to
   ``path_old``.
"""
import datetime
import queue
import random
import threading

import requests
from bs4 import BeautifulSoup


# ------------------------------ proxy validation worker thread --------------------------
class Mythread(threading.Thread):
    """Validate one proxy IP against the target URL in its own thread.

    type == "new_ip": a freshly scraped IP -> on success it is appended to
    ``path`` and added to the global ``all_IP`` set.
    Any other value:  a previously saved IP -> on success it is only added
    to ``all_IP`` (the pool file is rewritten in bulk later).

    Relies on the module-level ``semaphoer`` (thread-count limiter) and
    ``all_IP`` set created in ``__main__``.
    """

    def __init__(self, ip, path, url, type="new_ip"):
        super(Mythread, self).__init__()
        self.ip = ip      # "host:port" proxy candidate
        self.path = path  # file that receives validated new IPs
        self.url = url    # URL used to test the proxy
        self.type = type  # "new_ip" or anything else for re-checked old IPs

    def run(self):
        # BUG FIX: the original acquire()/release() pair leaked the semaphore
        # permit if check_ip()/write() raised; ``with`` always releases it.
        with semaphoer:
            if check_ip(self.ip, self.url):
                all_IP.add(self.ip)
                if self.type == "new_ip":
                    write(self.ip, self.path)
                    print("这个ip可以使用", self.ip)
                else:
                    print("这个旧IP可使用", self.ip)


# ------------------------------------ elapsed-time helper -----------------------------------------
def cost(start, end):
    """Return the elapsed time between two ``datetime`` values as "H:M:S"."""
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    cost_time = ("%s:%s:%s" % (h, m, s))
    return cost_time


# ----------------- random request-header picker --------------------
def getheaders():
    """Return a headers dict with a randomly chosen User-Agent.

    BUG FIX: the original list was missing a comma after the first entry,
    so Python's implicit string concatenation silently merged the first two
    User-Agent strings into one bogus value; the comma is restored here.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {"User-Agent": UserAgent}
    return headers


# -------------------- proxy validation -------------------------------
def check_ip(ip, url):
    """Return True iff ``url`` answers HTTP 200 within 5s through proxy ``ip``."""
    header = getheaders()
    # The same plain-HTTP proxy endpoint is used for both schemes.
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    print("开始测试这个ip", ip)  # 实时反映测试IP,不希望被IP刷屏 可关闭。
    try:
        # 5-second timeout: any slower proxy is treated as unusable.
        response = requests.get(url=url, proxies=proxies, headers=header,
                                timeout=5).status_code
        return response == 200
    except requests.RequestException:
        # Narrowed from a bare ``except:`` — any connection/timeout/proxy
        # error simply marks the IP as unusable.
        return False


# --------------------- truncate a file -----------------------------
def clearing_txt(path):
    """Empty the file at ``path`` (opening in 'w' mode truncates it)."""
    with open(path, 'w', encoding="utf-8") as f:
        f.truncate()


# -------------------- read a file into a list -------------------
def read_txt(path):
    """Return the lines of ``path`` as a list, stripped of whitespace."""
    with open(path, "r", encoding="utf-8") as h:
        return [line.strip() for line in h.readlines()]
# -------------------- append one IP to a file ----------------------------
def write(ip, path):
    """Append ``ip`` followed by a newline to the file at ``path``."""
    with open(path, "a", encoding="utf8") as f:
        f.writelines(ip)
        f.write("\n")


# -------------------------- start a batch of threads and wait -------------------------
def join(list):
    """Start every thread in ``list``, then block until all have finished."""
    for i in list:
        i.start()
    for i in list:
        i.join()


# ------------------------- scrape one proxy-listing page ---------------------------------
def call_net(num, pagenum):
    """Scrape page ``pagenum`` of site ``num`` and queue every "ip:port" found.

    Site numbers: 1 xici high-anonymity, 2 kuaidaili, 3 xici transparent,
    4 xici HTTPS, 5 xici HTTP. Results are pushed onto the global queue ``q``.
    """
    scrapy_url = {
        2: 'https://www.kuaidaili.com/free/inha/',  # kuaidaili
        1: 'http://www.xicidaili.com/nn/',          # xici high-anonymity
        3: "http://www.xicidaili.com/nt/",          # xici transparent
        4: "http://www.xicidaili.com/wn/",          # xici HTTPS
        5: "http://www.xicidaili.com/wt/",          # xici HTTP
    }
    get_url = scrapy_url[num] + str(pagenum)       # base URL + page number
    header = getheaders()
    html = requests.get(url=get_url, headers=header, timeout=6).text
    soup = BeautifulSoup(html, 'lxml')
    if num == 2:
        # kuaidaili tags IP and port with different data-title attributes,
        # so they must be collected separately and zipped back together.
        ip_tags = soup.find_all(attrs={"data-title": "IP"})
        port_tags = soup.find_all(attrs={"data-title": "PORT"})
        for ip_tag, port_tag in zip(ip_tags, port_tags):
            q.put(ip_tag.text + ":" + port_tag.text)
    else:
        # xici alternates table rows between class="odd" and class="",
        # so both row kinds are scanned (odd rows first, as before).
        rows = soup.find_all("tr", class_="odd") + soup.find_all("tr", class_="")
        for row in rows:
            cells = row.find_all('td')
            if not cells:  # header/filler rows have no <td>
                continue
            q.put(cells[1].text + ':' + cells[2].text)


# --------------------------- main scrape-and-validate driver ------------------------------
def get_ip(url, path, path_old, page):
    """Scrape, validate and persist proxy IPs.

    url:      URL used to validate each proxy.
    path:     file receiving freshly validated IPs (truncated first).
    path_old: file receiving the whole accumulated pool.
    page:     listing pages to scrape per site (keep small to avoid bans).
    """
    clearing_txt(path)                     # start with an empty output file
    threads = []                           # scraper threads
    threads_ip = []                        # new-IP validation threads
    threads_old_ip = []                    # old-IP validation threads (optional)
    start_time = datetime.datetime.now()   # for the final elapsed-time report

    # Sites 1 (xici high-anon) and 2 (kuaidaili) are always scraped.
    for num in range(2):
        for page_num in range(page):
            threads.append(threading.Thread(target=call_net, args=(num + 1, page_num + 1)))
    print("开始抓取西刺、快站的高匿代理")

    # Optional extra sources.
    # BUG FIX: the original set num = 3/4/5 and then passed ``num + 1`` to
    # call_net, so target3 actually scraped site 4 (HTTPS), target4 scraped
    # site 5 (HTTP), and target5 crashed with KeyError on nonexistent site 6.
    # The site number is now passed directly.
    optional_sites = [
        (3, target3, "开始抓取西刺的透明代理"),
        (4, target4, "开始抓取西刺的HTTPS代理"),
        (5, target5, "开始抓取西刺的HTTP代理"),
    ]
    for site, enabled, banner in optional_sites:
        if enabled:
            for page_num in range(page):
                threads.append(threading.Thread(target=call_net, args=(site, page_num + 1)))
            print(banner)

    join(threads)  # run all scrapers; IPs accumulate in queue ``q``

    # Optional re-validation of previously saved IPs (requires old_IP_list
    # to be loaded in __main__ and the pool file to exist):
    # print("测试旧存储IP")
    # for i in old_IP_list:
    #     threads_old_ip.append(Mythread(i, path, url, type="old"))
    # join(threads_old_ip)
    # print("一共%s个旧IP可使用" % len(all_IP))

    # Drain the queue and spawn one validation thread per scraped IP.
    while not q.empty():
        threads_ip.append(Mythread(q.get(), path, url))
    join(threads_ip)
    print("成功爬取")

    end_time = datetime.datetime.now()
    cost_time = cost(start_time, end_time)
    clearing_txt(path_old)
    for pooled_ip in all_IP:   # rewrite the whole accumulated pool
        write(pooled_ip, path_old)
    new_ip = read_txt(path)
    old_ip = read_txt(path_old)
    print("耗时:%s 一共抓取:%s个新IP 以保存:%s个IP" % (cost_time, len(new_ip), len(old_ip)))


# -------------------------- main configuration ------------------------------------
if __name__ == '__main__':
    semaphoer = threading.Semaphore(15)  # max concurrent validation threads (default 15)
    q = queue.Queue()                    # scraped-IP hand-off queue
    page = 2                             # pages per site — keep small or the sites ban you
    path = "ip.txt"                      # freshly validated IPs
    path_old = "ip_old.txt"              # accumulated IP pool
    url = "https://www.baidu.com/"       # URL used to validate proxies
    # old_IP_list = read_txt(path_old)   # load saved IPs for re-validation (optional)
    all_IP = set()                       # every IP that validated this run
    target3 = False                      # scrape xici transparent proxies
    target4 = False                      # scrape xici HTTPS proxies
    target5 = False                      # scrape xici HTTP proxies
    get_ip(url, path, path_old, page)