Crawl proxy IPs and verify their availability

Crawling proxy IPs

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 17:30:36 2020

@author: Administrator
"""

# Generate a list of usable proxy IPs (the script below runs on Python 3)
import sys
import time
import random

import re
import requests
from bs4 import BeautifulSoup as bs
from lxml import etree
from fake_useragent import UserAgent

# Location of the cached user-agent pool file used by fake_useragent
# https://pan.baidu.com/s/1_Qv1LGBSjO2bnF4ocMqhwQ  extraction code: 2hpu
import tempfile
print(tempfile.gettempdir() + '\\fake_useragent_0.1.11.json')
# Instantiate the UserAgent class
# If this raises an error, put the JSON above into the temp folder printed above
ua = UserAgent()
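
# If you prefer not to depend on fake_useragent at all, a fixed header also works for the
# requests.get calls below -- a minimal sketch; the UA string is the same Chrome UA
# quoted (commented out) inside inspect_ip further down.
FALLBACK_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36')
}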

# User-Agent strings for specific browsers
#print(ua.ie)
#print(ua.opera)
#print(ua.chrome)
#print(ua.firefox)
#print(ua.safari)

# Return a random User-Agent string (recommended)
print(ua.random)
#reload(sys)
#sys.setdefaultencoding('utf-8')

# Sites whose proxy IPs can be collected with a single regex (a quick check of the regex follows the dict below)
PROXY_SITES_BY_REGX = {
    'urls': [
        'http://ab57.ru/downloads/proxyold.txt',
        'http://www.proxylists.net/http_highanon.txt',
        'http://www.atomintersoft.com/high_anonymity_elite_proxy_list',
        'http://www.atomintersoft.com/transparent_proxy_list',
        'http://www.atomintersoft.com/anonymous_proxy_list',
        'http://www.proxy4free.info/',
        'http://tools.rosinstrument.com/proxy/plab100.xml',
        'https://www.rmccurdy.com/scripts/proxy/good.txt',
        'http://proxy.ipcn.org/proxylist2.html',
        'http://best-proxy.ru/feed',
        'http://www.proxylists.net/?HTTP',
        'http://uks.pl.ua/script/getproxy.php?last'
    ],
    'proxy_regx': r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}"
}
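
# A quick check of the regex on a made-up string (illustrative only):
#   re.findall(PROXY_SITES_BY_REGX['proxy_regx'], 'foo 1.2.3.4:8080 bar 10.0.0.1:80')
#   -> ['1.2.3.4:8080', '10.0.0.1:80']
# Note: the port part \d{2,4} truncates 5-digit ports such as 53281; widen it to \d{2,5}
# if those matter for your sources.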

# Scratch XPath from the original post (not used by the code below):
# //*[@id="services"]/div/div[2]/div/div/div/table/tbody/tr[1]/td[1]

# Sites where the proxy IPs must be located with XPath (a sketch using both XPaths follows this list)
PROXY_SITES_BY_XPATH = [
    {
        'urls': ['http://www.66ip.cn/%s.html' % page for page in ['index'] + list(range(2, 11))],
        'ip_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[1]/text()" ,
        'port_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[2]/text()"
    },
    {
        'urls': ['http://www.mimiip.com/gngao/%s' % page for page in range(2, 10)],
        'ip_xpath': ".//table[@class='list']/tbody/tr/td[1]/text()",
        'port_xpath': ".//table[@class='list']/tbody/tr/td[2]/text()"
    },
    {
        'urls': ['http://www.ip181.com/daili/%s.html' % page for page in range(1, 8)],
        'ip_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[1]/text()" ,
        'port_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[2]/text()"
    }
]
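
# get_proxy() below derives a row-level XPath from ip_xpath and never uses port_xpath.
# For reference, a minimal sketch (not called anywhere; the function name is illustrative)
# of consuming the ip_xpath/port_xpath pair of one entry directly:
def extract_by_xpath_pair(html_text, site):
    selector = etree.HTML(html_text)
    ips = selector.xpath(site['ip_xpath'])
    ports = selector.xpath(site['port_xpath'])
    # pair up the two column lists into "ip:port" strings
    return ['%s:%s' % (ip.strip(), port.strip()) for ip, port in zip(ips, ports)]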


#http://www.goubanjia.com/
#res = [i.xpath('./td/*/text()') for i in selector.xpath('.//*[@class="table table-hover"]/tbody//tr')]
#[[''.join(i[:-7])+':'+i[-7],]+i[-6:] for i in res]  # result is not correct


# Scrape proxy IPs and ports
def get_proxy(inFile):
    headers = {'User-Agent': str(UserAgent().random)}
    # inFile is opened in append mode, so repeated runs accumulate entries
    # (see the de-duplication sketch after this function)
    fp = open(inFile, 'a+')
    # Sites that can be scraped with a single regex
    pattern = re.compile(PROXY_SITES_BY_REGX['proxy_regx'])
    for url in PROXY_SITES_BY_REGX['urls']:
        try:
            response = requests.get(url, headers = headers).text
            proxy_list = pattern.findall(response)
            # write one proxy per line so entries from different sites stay separated
            fp.writelines([ip + '\n' for ip in proxy_list])
            print('+++Success:', url)
        except Exception as e:
            print('---Failure:', url)
            print(e)
    
    # Sites that need XPath to locate the proxy IPs
    print('*'*30)
    for proxy_sites in PROXY_SITES_BY_XPATH:
        # derive the table-row XPath from ip_xpath (port_xpath is not used here);
        # str.strip('/td[1]/text()') would strip a character set, not the suffix, hence replace()
        pattern = proxy_sites['ip_xpath'].replace('/td[1]/text()', '')
        for url in proxy_sites['urls']:
            try:
                response = requests.get(url, headers = headers).text
                selector = etree.HTML(response)
                # the first two <td> cells of each row are the IP and the port
                proxy_list = [
                    ':'.join(i.xpath('./td/text()')[:2])
                    for i in selector.xpath(pattern)
                ]
                fp.writelines([ip + '\n' for ip in proxy_list])
                print('+++Success:', url)
            except Exception as e:
                print('---Failure:', url)
                print(e)
    fp.close()
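
# Because get_proxy() opens inFile in append mode, repeated runs accumulate duplicates.
# A small de-duplication helper -- a sketch, not part of the original script:
def dedup_proxy_file(inFile):
    with open(inFile, 'r') as fp:
        proxies = {line.strip() for line in fp if line.strip()}
    with open(inFile, 'w') as fp:
        fp.write('\n'.join(sorted(proxies)) + '\n')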



# Build a random User-Agent header from a local file of UA strings
# (the fake_useragent package above can serve the same purpose)
def Header_get(agentFile):
    agents = []
    for line in open(agentFile, "r"):
        # each line is assumed to hold one quoted UA string; [1:-1] drops the quotes
        agents.append(line.strip('\n\r')[1:-1])
    fakeheader = {'User-agent': random.choice(agents)}
    return fakeheader
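
# Usage sketch (assumes the file holds one quoted User-Agent string per line,
# matching the [1:-1] slice above):
#   headers = Header_get(r'C:\Users\Administrator\Desktop\user_agents.txt')
#   requests.get('http://www.baidu.com/', headers=headers)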

# Not every site listed above is actually scraped here; you can also copy proxy IPs from
# those pages into the file by hand and then test which ones work on your current network.
# Baidu's home page is used as the test target (an alternative requests-based check is
# sketched after this function).
def inspect_ip(inFile, outFile):
    import http.client
    import threading
    
    # requestHeaders = {
    #     'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    #     }
    requestHeaders= {'User-Agent': str(UserAgent().random)}
    requestUrl = 'http://www.baidu.com/'
    f_in = open(inFile, 'r')
    f_out = open(outFile, 'w')
    # the lock only matters if this loop is later run from several threads;
    # as written, the check runs in a single thread
    lock = threading.Lock()
    
    while True:
        lock.acquire()
        ll = f_in.readline()
        lock.release()
        if not ll: break       # an empty string means end of file
        ll = ll.strip()
        if not ll: continue    # skip blank lines instead of stopping at them
        line = ll.split(':')
        ip = line[0]
        port = line[1]
        try:
            # connect to http://ip:port; for HTTPS targets use http.client.HTTPSConnection
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=requestUrl, headers=requestHeaders)
            res = conn.getresponse()
            lock.acquire()
            print("+++Success:" + ip + ":" + port)
            f_out.write(ll + "\n")
            lock.release()
        except Exception:
            print("---Failure:" + ip + ":" + port)
    
    f_in.close()
    f_out.close()
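
# An alternative check via requests' proxies parameter -- a sketch, not part of the original
# script; it routes the request through each proxy and keeps those that return HTTP 200.
def inspect_ip_with_requests(inFile, outFile, test_url='http://www.baidu.com/'):
    headers = {'User-Agent': str(UserAgent().random)}
    with open(inFile, 'r') as f_in, open(outFile, 'w') as f_out:
        for line in f_in:
            proxy = line.strip()
            if not proxy:
                continue
            proxies = {'http': 'http://' + proxy}
            try:
                r = requests.get(test_url, headers=headers, proxies=proxies, timeout=5)
                if r.status_code == 200:
                    print('+++Success:', proxy)
                    f_out.write(proxy + '\n')
                else:
                    print('---Failure:', proxy, r.status_code)
            except Exception:
                print('---Failure:', proxy)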
    


if __name__ == '__main__':
    inFile = r'C:\Users\Administrator\Desktop\proxy.txt'
    outFile = r'C:\Users\Administrator\Desktop\verified.txt'
    #OUTPUT_FILE = "proxy_list.txt"
    #AgentFile = r'C:\Users\Administrator\Desktop\user_agents.txt'
    #get_proxy(inFile)  # step 1: scrape proxy IPs into inFile
    inspect_ip(inFile, outFile)  # step 2: verify them and write the working ones to outFile

  
