Python Crawler: Scraping 小幻 HTTP Proxy IPs

Most of the posts you can find online scrape proxy IPs from 西刺 (Xici) or 快代理 (Kuaidaili); hardly any cover 小幻 (ihuan.me). In the spirit of learning, this post grabs 小幻's proxy IP list.

Straight to the code:

import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()

headers = {'User-Agent': ua.ff}  # random Firefox UA ('ff' is fake_useragent's firefox alias)

# Entry URL for the China-wide proxy list
init_url = 'https://ip.ihuan.me/address/5Lit5Zu9.html'

page_tag = 'pagination'  # CSS class of the pagination <ul>

init_data = []   # scraped rows: [ip, port, https support, post support, speed]
page_data = []   # hrefs of the pagination links
REQ_TIMEOUT = 5  # seconds
check_url = 'http://httpbin.org/ip'  # echoes the caller's origin IP, which the check below compares

def get_init(base_url, current_page=''):
    response = requests.get(base_url, headers=headers)
    print(response)
    result = BeautifulSoup(response.text, 'lxml')
    table = result.find_all('table')[0]

    # Pull the columns we need out of every row of the proxy table
    for tr in table.tbody.find_all('tr'):
        ip = port = support_https = support_post = speed = None
        for index, td in enumerate(tr.find_all('td')):
            if index == 0:
                ip = td.find('a').text          # IP address
            elif index == 1:
                port = td.get_text()            # port
            elif index == 4:
                support_https = td.get_text()   # HTTPS support ('支持' or not)
            elif index == 5:
                support_post = td.get_text()    # POST support
            elif index == 7:
                speed = td.get_text().strip()   # response speed
        init_data.append([ip, port, support_https, support_post, speed])

    # Queue pagination links we have not visited yet
    pages = result.find_all('ul', class_=page_tag)
    for index, li in enumerate(pages[0].find_all('li')):
        if index == 0:          # skip the "previous page" item
            continue
        a = li.find('a')
        if a is None:           # disabled items carry no link
            continue
        href = a['href']
        if href != current_page and href not in page_data:
            page_data.append(href)
    print(page_data)

def check_content_and_ip(init_data):
    # Our real origin IP, fetched once without any proxy
    original_ip = requests.get(check_url, timeout=REQ_TIMEOUT).content
    print('original_ip', original_ip)

    for ip, port, support_https, support_post, _speed in init_data:
        if support_post != '支持':      # keep only proxies that allow POST
            continue
        print(ip, port)
        # The proxy itself is reached over plain HTTP; the dict keys decide
        # which target schemes get routed through it
        w = 'http://{0}:{1}'.format(ip, port)
        if support_https == '支持':
            proxies = {'http': w, 'https': w}
        else:
            proxies = {'http': w}
        try:
            masked_ip = requests.get(check_url, timeout=REQ_TIMEOUT, proxies=proxies).content
            print('masked_ip', masked_ip)
            if original_ip != masked_ip:  # the proxy really hides our IP
                ip_pool.write(w + '\n')
                ip_pool.flush()
        except requests.RequestException:
            continue

if __name__ == '__main__':
    ip_pool = open('ip_pool.txt', 'w')
    get_init(init_url)

    num = 30  # fetch 29 more pages (adjustable); with the initial fetch above, 30 in total
    for _k in range(1, num):
        if _k >= len(page_data):    # the site exposed fewer pages than requested
            break
        print(_k, page_data[_k])
        url = init_url + page_data[_k]
        try:
            get_init(url, current_page=page_data[_k])
        except Exception as e:
            print(str(e))
        time.sleep(1)  # be polite: at most one page per second

    check_content_and_ip(init_data)
    ip_pool.close()
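Once the script finishes, ip_pool.txt holds one verified proxy URL per line. Below is a minimal sketch of how you might consume that file; the file name and the httpbin check URL come from the script above, while the random rotation is purely illustrative:

import random
import requests

# Load the proxies the crawler verified and wrote out
with open('ip_pool.txt') as f:
    pool = [line.strip() for line in f if line.strip()]

# Pick one at random and route a request through it
proxy = random.choice(pool)
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': proxy, 'https': proxy},
                    timeout=5)
print(proxy, '->', resp.text)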

I got curious about proxy IPs for crawlers, so this was a small warm-up exercise.
If you want to see the results, just copy the code and run it.
Runtime environment: Python 3.7
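The script relies on a few third-party packages; judging from its imports, they can be installed with:

pip install requests beautifulsoup4 lxml fake-useragent

(lxml is needed because BeautifulSoup is asked for the 'lxml' parser.)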
posted @ 2020-06-08 23:26  进击的pythoner