一开始是打算去抓取一些数据,但是总是访问次数多了之后被封IP,所以专门做了一个工具用来抓取在西刺和快站的高匿IP。
运行环境的话是在python3.5下运行的,需要requests库
在制作的过程中也参考了一下网上其他人的做法,但是发现很大一部分都不是多线程去抓取,有点浪费时间;又或者网上已经有人做好了轮子,但是以现在的技术还有点看不懂,所以就做了这个只在一个py文件上运行的代理池。
对于旧IP的处理,我这下面的代码是没有让他运行的,如果要运行可以开启,不过必须要在同一个文件夹里创建一个名为“old_ip.txt”的文档,本来可以做一个自动判定,没有就可以生成的,到最后又没弄了。
可能有部分人需要透明IP或者https代理ip,所以里面也有控制抓取哪几个网站的控制器,如把下面的:target3=False 改成 target3=True 就可以抓取西刺的透明代理了。
有一点是要重点注意一下的就是:不建议一下子抓取超过10页,因为我没有设定request访问限定,西刺或者快站都会因为你访问速度太快而判定你为爬虫封24小时的IP......我已经尝试过。
以下是代码:
# -*- coding: UTF-8 -*-
"""Multithreaded proxy-IP scraper for xicidaili.com and kuaidaili.com.

Workflow:
1. Scrape proxy IPs from xici / kuaidaili listing pages.
2. (Optional, commented out in ``get_ip``) re-validate previously saved IPs.
3. An IP is considered usable when a GET through it to the configured ``url``
   returns HTTP 200 within 5 seconds.
4. Freshly validated IPs go to ``path``; the accumulated pool goes to
   ``path_old``.
"""
import datetime
import queue
import random
import threading

import requests
from bs4 import BeautifulSoup


# ------------------------------ proxy validation worker thread --------------------------
class Mythread(threading.Thread):
    """Validate one proxy IP against the target URL in its own thread.

    type == "new_ip": a freshly scraped IP -> on success it is appended to
    ``path`` and added to the global ``all_IP`` set.
    Any other value:  a previously saved IP -> on success it is only added
    to ``all_IP`` (the pool file is rewritten in bulk later).

    Relies on the module-level ``semaphoer`` (thread-count limiter) and
    ``all_IP`` set created in ``__main__``.
    """

    def __init__(self, ip, path, url, type="new_ip"):
        super(Mythread, self).__init__()
        self.ip = ip      # "host:port" proxy candidate
        self.path = path  # file that receives validated new IPs
        self.url = url    # URL used to test the proxy
        self.type = type  # "new_ip" or anything else for re-checked old IPs

    def run(self):
        # BUG FIX: the original acquire()/release() pair leaked the semaphore
        # permit if check_ip()/write() raised; ``with`` always releases it.
        with semaphoer:
            if check_ip(self.ip, self.url):
                all_IP.add(self.ip)
                if self.type == "new_ip":
                    write(self.ip, self.path)
                    print("这个ip可以使用", self.ip)
                else:
                    print("这个旧IP可使用", self.ip)


# ------------------------------------ elapsed-time helper -----------------------------------------
def cost(start, end):
    """Return the elapsed time between two ``datetime`` values as "H:M:S"."""
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    cost_time = ("%s:%s:%s" % (h, m, s))
    return cost_time


# ----------------- random request-header picker --------------------
def getheaders():
    """Return a headers dict with a randomly chosen User-Agent.

    BUG FIX: the original list was missing a comma after the first entry,
    so Python's implicit string concatenation silently merged the first two
    User-Agent strings into one bogus value; the comma is restored here.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {"User-Agent": UserAgent}
    return headers


# -------------------- proxy validation -------------------------------
def check_ip(ip, url):
    """Return True iff ``url`` answers HTTP 200 within 5s through proxy ``ip``."""
    header = getheaders()
    # The same plain-HTTP proxy endpoint is used for both schemes.
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    print("开始测试这个ip", ip)  # 实时反映测试IP,不希望被IP刷屏 可关闭。
    try:
        # 5-second timeout: any slower proxy is treated as unusable.
        response = requests.get(url=url, proxies=proxies, headers=header,
                                timeout=5).status_code
        return response == 200
    except requests.RequestException:
        # Narrowed from a bare ``except:`` — any connection/timeout/proxy
        # error simply marks the IP as unusable.
        return False


# --------------------- truncate a file -----------------------------
def clearing_txt(path):
    """Empty the file at ``path`` (opening in 'w' mode truncates it)."""
    with open(path, 'w', encoding="utf-8") as f:
        f.truncate()


# -------------------- read a file into a list -------------------
def read_txt(path):
    """Return the lines of ``path`` as a list, stripped of whitespace."""
    with open(path, "r", encoding="utf-8") as h:
        return [line.strip() for line in h.readlines()]
# -------------------- append one IP to a file ----------------------------
def write(ip, path):
    """Append ``ip`` followed by a newline to the file at ``path``."""
    with open(path, "a", encoding="utf8") as f:
        f.writelines(ip)
        f.write("\n")


# -------------------------- start a batch of threads and wait -------------------------
def join(list):
    """Start every thread in ``list``, then block until all have finished."""
    for i in list:
        i.start()
    for i in list:
        i.join()


# ------------------------- scrape one proxy-listing page ---------------------------------
def call_net(num, pagenum):
    """Scrape page ``pagenum`` of site ``num`` and queue every "ip:port" found.

    Site numbers: 1 xici high-anonymity, 2 kuaidaili, 3 xici transparent,
    4 xici HTTPS, 5 xici HTTP. Results are pushed onto the global queue ``q``.
    """
    scrapy_url = {
        2: 'https://www.kuaidaili.com/free/inha/',  # kuaidaili
        1: 'http://www.xicidaili.com/nn/',          # xici high-anonymity
        3: "http://www.xicidaili.com/nt/",          # xici transparent
        4: "http://www.xicidaili.com/wn/",          # xici HTTPS
        5: "http://www.xicidaili.com/wt/",          # xici HTTP
    }
    get_url = scrapy_url[num] + str(pagenum)       # base URL + page number
    header = getheaders()
    html = requests.get(url=get_url, headers=header, timeout=6).text
    soup = BeautifulSoup(html, 'lxml')
    if num == 2:
        # kuaidaili tags IP and port with different data-title attributes,
        # so they must be collected separately and zipped back together.
        ip_tags = soup.find_all(attrs={"data-title": "IP"})
        port_tags = soup.find_all(attrs={"data-title": "PORT"})
        for ip_tag, port_tag in zip(ip_tags, port_tags):
            q.put(ip_tag.text + ":" + port_tag.text)
    else:
        # xici alternates table rows between class="odd" and class="",
        # so both row kinds are scanned (odd rows first, as before).
        rows = soup.find_all("tr", class_="odd") + soup.find_all("tr", class_="")
        for row in rows:
            cells = row.find_all('td')
            if not cells:  # header/filler rows have no <td>
                continue
            q.put(cells[1].text + ':' + cells[2].text)


# --------------------------- main scrape-and-validate driver ------------------------------
def get_ip(url, path, path_old, page):
    """Scrape, validate and persist proxy IPs.

    url:      URL used to validate each proxy.
    path:     file receiving freshly validated IPs (truncated first).
    path_old: file receiving the whole accumulated pool.
    page:     listing pages to scrape per site (keep small to avoid bans).
    """
    clearing_txt(path)                     # start with an empty output file
    threads = []                           # scraper threads
    threads_ip = []                        # new-IP validation threads
    threads_old_ip = []                    # old-IP validation threads (optional)
    start_time = datetime.datetime.now()   # for the final elapsed-time report

    # Sites 1 (xici high-anon) and 2 (kuaidaili) are always scraped.
    for num in range(2):
        for page_num in range(page):
            threads.append(threading.Thread(target=call_net, args=(num + 1, page_num + 1)))
    print("开始抓取西刺、快站的高匿代理")

    # Optional extra sources.
    # BUG FIX: the original set num = 3/4/5 and then passed ``num + 1`` to
    # call_net, so target3 actually scraped site 4 (HTTPS), target4 scraped
    # site 5 (HTTP), and target5 crashed with KeyError on nonexistent site 6.
    # The site number is now passed directly.
    optional_sites = [
        (3, target3, "开始抓取西刺的透明代理"),
        (4, target4, "开始抓取西刺的HTTPS代理"),
        (5, target5, "开始抓取西刺的HTTP代理"),
    ]
    for site, enabled, banner in optional_sites:
        if enabled:
            for page_num in range(page):
                threads.append(threading.Thread(target=call_net, args=(site, page_num + 1)))
            print(banner)

    join(threads)  # run all scrapers; IPs accumulate in queue ``q``

    # Optional re-validation of previously saved IPs (requires old_IP_list
    # to be loaded in __main__ and the pool file to exist):
    # print("测试旧存储IP")
    # for i in old_IP_list:
    #     threads_old_ip.append(Mythread(i, path, url, type="old"))
    # join(threads_old_ip)
    # print("一共%s个旧IP可使用" % len(all_IP))

    # Drain the queue and spawn one validation thread per scraped IP.
    while not q.empty():
        threads_ip.append(Mythread(q.get(), path, url))
    join(threads_ip)
    print("成功爬取")

    end_time = datetime.datetime.now()
    cost_time = cost(start_time, end_time)
    clearing_txt(path_old)
    for pooled_ip in all_IP:   # rewrite the whole accumulated pool
        write(pooled_ip, path_old)
    new_ip = read_txt(path)
    old_ip = read_txt(path_old)
    print("耗时:%s 一共抓取:%s个新IP 以保存:%s个IP" % (cost_time, len(new_ip), len(old_ip)))


# -------------------------- main configuration ------------------------------------
if __name__ == '__main__':
    semaphoer = threading.Semaphore(15)  # max concurrent validation threads (default 15)
    q = queue.Queue()                    # scraped-IP hand-off queue
    page = 2                             # pages per site — keep small or the sites ban you
    path = "ip.txt"                      # freshly validated IPs
    path_old = "ip_old.txt"              # accumulated IP pool
    url = "https://www.baidu.com/"       # URL used to validate proxies
    # old_IP_list = read_txt(path_old)   # load saved IPs for re-validation (optional)
    all_IP = set()                       # every IP that validated this run
    target3 = False                      # scrape xici transparent proxies
    target4 = False                      # scrape xici HTTPS proxies
    target5 = False                      # scrape xici HTTP proxies
    get_ip(url, path, path_old, page)