Python Crawler: Scraping Proxy IPs from the 小幻 (ihuan) HTTP Proxy Site
Most posts you find online scrape proxy IPs from 西刺 (Xici) or 快代理 (Kuaidaili); I couldn't find one that covers 小幻, so in the spirit of learning I wrote a scraper for 小幻's proxy IP list.
The code, in full, is below. get_init() scrapes one listing page, collecting both the proxy rows and the pagination links; check_content_and_ip() then verifies each harvested proxy against httpbin.org and writes the working ones to ip_pool.txt:
import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()

headers = {'User-Agent': ua.firefox}  # random Firefox User-Agent string

# Listing URL for proxies in China ('5Lit5Zu9' is the Base64 of '中国')
init_url = 'https://ip.ihuan.me/address/5Lit5Zu9.html'

page_tag = 'pagination'  # CSS class of the pagination <ul>

init_data = []   # harvested rows: [ip, port, support_https, support_post, speed]
page_data = []   # hrefs of the pagination links
REQ_TIMEOUT = 5
check_url = 'http://httpbin.org/headers'


def get_init(base_url, current_page=''):
    """Scrape one listing page: collect proxy rows and pagination links."""
    response = requests.get(base_url, headers=headers)
    print(response)
    result = BeautifulSoup(response.text, 'lxml')
    tab = result.find_all('table')[0]
    for tr in tab.tbody.find_all('tr'):
        for index, td in enumerate(tr.find_all('td')):
            if index == 0:
                ip = td.find('a').text
            elif index == 1:
                port = td.get_text()
            elif index == 4:
                support_https = td.get_text()  # '支持' means "supported"
            elif index == 5:
                support_post = td.get_text()
            elif index == 7:
                speed = td.get_text().replace('秒', '')  # strip the "seconds" unit
        init_data.append([ip, port, support_https, support_post, speed])

    # Collect pagination links we have not visited yet
    pages = result.find_all('ul', class_=page_tag)
    for index, i in enumerate(pages[0].find_all('li')):
        if index == 0:  # skip the "previous page" item
            continue
        href = i.find('a')['href']
        if current_page != href and href not in page_data:
            page_data.append(href)
    print(page_data)


def check_content_and_ip(init_data):
    """Keep only proxies that support POST and actually mask our request."""
    for i in init_data:
        if i[3] == '支持':  # POST supported
            print(i[0], i[1])
            if i[2] == '支持':  # HTTPS supported
                w = 'https://{0}:{1}'.format(i[0], i[1])
                proxies = {'https': w}
            else:
                w = 'http://{0}:{1}'.format(i[0], i[1])
                proxies = {'http': w}

            original = requests.get(check_url, timeout=REQ_TIMEOUT).content
            print('original', original)
            try:
                masked = requests.get(check_url, timeout=REQ_TIMEOUT,
                                      proxies=proxies).content
                print('masked', masked)
                if original != masked:  # the proxy changed what httpbin sees
                    ip_pool.write(w + '\n')
                    ip_pool.flush()
            except requests.RequestException:
                continue


if __name__ == '__main__':
    ip_pool = open('ip_pool.txt', 'w')
    get_init(init_url)

    num = 30  # fetch 29 more pages (adjustable); with the initial page, 30 in total
    for _k in range(1, num):
        if _k >= len(page_data):  # no more pagination links collected
            break
        print(_k, page_data[_k])
        url = init_url + page_data[_k]
        try:
            get_init(url, current_page=page_data[_k])
        except Exception as e:
            print(str(e))
        time.sleep(1)  # be polite to the site

    check_content_and_ip(init_data)
    ip_pool.close()
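The validity test above compares the raw body of http://httpbin.org/headers fetched with and without the proxy; any difference is taken to mean the proxy rewrote the request. A more direct variant is to compare the caller's IP as reported by http://httpbin.org/ip. Here is a minimal sketch of that idea; the helper is_proxy_alive is hypothetical and not part of the script above:

import requests

CHECK_IP_URL = 'http://httpbin.org/ip'  # returns {"origin": "<caller ip>"}
REQ_TIMEOUT = 5


def is_proxy_alive(proxy_url):
    """Return True if the proxy answers and masks our real IP.

    proxy_url looks like 'http://1.2.3.4:8080' or 'https://1.2.3.4:8080'.
    Hypothetical helper, shown only to illustrate the /ip-based check.
    """
    scheme = 'https' if proxy_url.startswith('https') else 'http'
    try:
        real = requests.get(CHECK_IP_URL, timeout=REQ_TIMEOUT).json()['origin']
        seen = requests.get(CHECK_IP_URL, timeout=REQ_TIMEOUT,
                            proxies={scheme: proxy_url}).json()['origin']
        return real != seen
    except (requests.RequestException, ValueError):
        return False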
Proxy IPs for crawlers interest me, so this was a quick first attempt at harvesting them.
To see what it produces, just copy the code and run it.
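Once ip_pool.txt has been written, the harvested proxies can be plugged back into requests. A minimal sketch, assuming the one-proxy-URL-per-line format the script writes:

import random
import requests

# Load the pool written by the scraper: one proxy URL per line,
# e.g. 'http://1.2.3.4:8080' or 'https://1.2.3.4:8080'.
with open('ip_pool.txt') as f:
    pool = [line.strip() for line in f if line.strip()]

proxy = random.choice(pool)
scheme = 'https' if proxy.startswith('https') else 'http'
resp = requests.get('http://httpbin.org/ip',
                    proxies={scheme: proxy}, timeout=5)
print(resp.text)  # should show the proxy's IP, not yours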
Runtime environment: Python 3.7
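The script relies on four third-party packages; on a fresh Python 3.7 environment they can be installed from PyPI with:

pip install requests beautifulsoup4 lxml fake-useragent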