环境:python3.6
主要用到模块:requests,PyQuery
代码比较简单,不做过多解释了
#!usr/bin/python # -*- coding: utf-8 -*- import requests from pyquery import PyQuery as pq class GetProxy(object): def __init__(self): # 代理ip网站 self.url = 'http://www.xicidaili.com/nn/' self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} self.file = r'F:\python\code2\get_proxy\proxies.txt' # 用于检查代理ip是否可用 self.check_url = 'https://www.python.org/' self.title = 'Welcome to Python.org' def get_page(self): response = requests.get(self.url, headers=self.header) # print(response.status_code) return response.text def page_parse(self, response): stores = [] result = pq(response)('#ip_list') for p in result('tr').items(): if p('tr > td').attr('class') == 'country': ip = p('td:eq(1)').text() port = p('td:eq(2)').text() protocol = p('td:eq(5)').text().lower() # if protocol == 'socks4/5': # protocol = 'socks5' proxy = '{}://{}:{}'.format(protocol, ip, port) stores.append(proxy) return stores def start(self): response = self.get_page() proxies = self.page_parse(response) print(len(proxies)) file = open(self.file, 'w') i = 0 for proxy in proxies: try: check = requests.get(self.check_url, headers=self.header, proxies={'http': proxy}, timeout=5) check_char = pq(check.text)('head > title').text() if check_char == self.title: print('%s is useful'%proxy) file.write(proxy + '\n') i += 1 except Exception as e: continue file.close() print('Get %s proxies'%i) if __name__ == '__main__': get = GetProxy() get.start()