python 代理池

python+requests+pyquery+threading 多线程测试代理可用不可用

import requests
from requests.exceptions import RequestException #请求出现异常的模块
from pyquery import PyQuery as pq #解析模块
import threading #多线程模块
from queue import Queue #队列模块
class ProXiesAD(threading.Thread): #创建一个线程类
    def __init__(self,set_queue):
        super(ProXiesAD, self).__init__()
        self.set_queue = set_queue
    def run(self):
        global KY_HOST
        while not PARSE_EXIT: #循环取队列一直将队列取空
            try:
                host = self.set_queue.get()
                proxies = {'https':host,'http':host}
                get_code = requests.get(url='https://www.baidu.com/',proxies=proxies,timeout=5,headers=headers).status_code #返回请求百度的状态码
            except RequestException:
                print("不可用")
            else:
                if get_code == 200:
                    KY_HOST.append(host)
                    print('可用'+host)
                else:
                    print('请求超时')
        print(KY_HOST)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'} #
HOST = []
KY_HOST = []
PARSE_EXIT = False
def url_text(url):
    try:
        texts = requests.get(url,headers=headers)
    except RequestException:
        print('请求错误')
    else:
        if texts.status_code==200: #状态码为200说明请求成功
            return texts.text
def re_response(text):
    global HOST#全局变量声明
    doc = pq(text)
    for i in doc('tr').items():
        ip = i('tr td:lt(3)').text().replace(' ',"",1).replace(' ',':')#解析
        if ip.find("."):
            HOST.append(ip)#存到列表里面
    del HOST[0]
def main(ints):
    url = 'http://www.xicidaili.com/nn/'+ints
    text = url_text(url)
    re_response(text)
    pagequeue = Queue()#创建队列
    threadkeep=[]
    for i in HOST:
        pagequeue.put(i)#将ip放到队列里面去
    keepList = ["保存线程一号","保存线程二号","保存线程三号"]
    for threadName in keepList: #创建三个线程
        thread = ProXiesAD(pagequeue)
        thread.start()
        threadkeep.append(thread)

    while not pagequeue.empty(): #等待队列为空
        pass
    global PARSE_EXIT
    PARSE_EXIT = True #队列为空循环结束
    for thread in threadkeep:#主线程等待子线程结束在结束
        thread.join()
if __name__ == '__main__':
    main("1")

 

python+requests+pyquery+threading 多线程测试代理可用不可用python+requests+pyquery+threading 多线程测试代理可用不可用python+requests+pyquery+threading 多线程测试代理可用不可用

posted @ 2018-07-16 13:58  锋逸  阅读(779)  评论(0编辑  收藏  举报