python3 爬取代理池

import _thread
import re
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup as bs
# Browser-like request headers sent with every scraping request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/62.0.3202.94 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Upgrade-Insecure-Requests": "1",
}
def _parse_66ip(html):
    """Return an ``ip:port`` string for every proxy row found in *html*."""
    # Match the address together with the port in the cell right after it.
    # Scanning for IPs and ports separately and zipping the two lists lets a
    # single stray IP-looking string elsewhere on the page shift every pair.
    rows = re.findall(
        r"<td>\s*((?:[0-9]{1,3}\.){3}[0-9]{1,3})\s*</td>"
        r"\s*<td>\s*(\d{1,5})\s*</td>",
        html,
    )
    return [ip + ":" + port for ip, port in rows]


def iplist(pages=range(1, 30), timeout=10):
    """Collect proxy addresses from the paginated list on www.66ip.cn.

    Args:
        pages: Iterable of page numbers to download. The default covers
            pages 1 through 29, as before.
        timeout: Seconds to wait for each page before giving up on it.

    Returns:
        A list of unique ``ip:port`` strings in the order first seen.
        A page that cannot be downloaded is reported with a ``Failed:``
        line and skipped, so one bad page does not abort the crawl.
    """
    found = {}  # dict keys give an ordered, de-duplicated collection
    for page in pages:
        url = r'http://www.66ip.cn/{}.html'.format(page)
        try:
            resp = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException:
            print('Failed:' + url)
            continue
        # Decode the raw bytes as GBK directly. Round-tripping resp.text
        # through latin-1 only works when requests happened to guess latin-1
        # and raises otherwise; errors='replace' keeps a damaged byte from
        # discarding the whole page.
        html = resp.content.decode('gbk', errors='replace')
        for entry in _parse_66ip(html):
            found[entry] = None
    return list(found)

def iplist2(num=100, timeout=10):  # https://www.89ip.cn/
    """Fetch a batch of proxy addresses from the 89ip.cn extraction API.

    Args:
        num: Value for the ``num`` query parameter; defaults to 100.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        A list of ``ip:port`` strings found in the response body, or an
        empty list (after printing a ``Failed:`` line) when the request
        fails, so the caller can still try its other proxy source.
    """
    url = 'https://www.89ip.cn/tqdl.html?api=1&num={}'.format(num)
    try:
        body = requests.get(url=url, headers=headers, timeout=timeout).text
    except requests.RequestException:
        print('Failed:' + url)
        return []
    # Require the literal ':' between address and port. The results are used
    # verbatim as "host:port" proxy addresses, so a looser separator could
    # only ever capture unusable text between an IP and some later digits.
    return re.findall(r'((?:[0-9]{1,3}\.){3}[0-9]{1,3}:\d{1,5})', body)

def _check_proxy(ip, timeout):
    """Return True when http://www.baidu.com answers 200 through proxy *ip*."""
    proxies={
        'http':'http://{}'.format(ip),
        'https':'https://{}'.format(ip)
    }
    url='http://www.baidu.com'
    try:
        res = requests.get(url=url, proxies=proxies, timeout=timeout)
    except Exception:
        # Best-effort check: whatever goes wrong (refused connection,
        # timeout, malformed address), the proxy is simply not usable.
        return False
    return res.status_code == 200


def ver(ips, timeout=5, workers=32):
    """Check each proxy by fetching http://www.baidu.com through it.

    Working proxies are printed as ``Success:<ip>`` and appended to
    ``Success.txt``; every other proxy is printed as ``Failed:<ip>``.
    The checks run concurrently in a thread pool, while printing and file
    writes happen in the calling thread, in input order.

    Args:
        ips: Iterable of ``ip:port`` strings to test.
        timeout: Seconds to wait for each proxy. It must be positive: the
            former value of 0 made every request fail immediately.
        workers: Upper bound on the number of concurrent checks.

    Returns:
        None.
    """
    ips = list(ips)
    if not ips:
        return
    pool_size = max(1, min(workers, len(ips)))
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        # map() yields results in input order, so each verdict lines up
        # with the address it belongs to.
        verdicts = pool.map(lambda ip: _check_proxy(ip, timeout), ips)
        for ip, ok in zip(ips, verdicts):
            if ok:
                print('Success:'+ip)
                with open('Success.txt','a+') as f:
                    f.write(ip+'\n')
            else:
                print ('Failed:'+ip)


def result():
    """Scrape each proxy source in turn and verify what it returned.

    The 89ip.cn source (``iplist2``) is processed first, then the 66ip.cn
    source (``iplist``); each batch is passed straight to ``ver``.
    """
    for fetch in (iplist2, iplist):
        ver(fetch())


if __name__=='__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start=time.perf_counter()
    result()
    end=time.perf_counter()
    print ('time: %s'%(end-start))
posted @ 2021-07-26 15:31  G0mini  阅读(110)  评论(0)  编辑  收藏  举报