# -*- coding: utf-8 -*-
"""urllib practice exercise.

Parse proxy addresses from https://www.kuaidaili.com/free/ and test
whether each one is usable.
"""
import re
import time
import urllib.error
import urllib.request


def downHtml(url, retry=3):
    """Request a URL and return its page source, retrying on failure.

    :param url: URL to request
    :param retry: number of retries remaining (waits 2s between attempts)
    :return: decoded page source as str, or None if every attempt fails
    """
    try:
        request = urllib.request.Request(url)
        # Fetch and decode the page source (default UTF-8 decode).
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('请求异常:', e.reason)
        if retry > 0:
            time.sleep(2)  # wait two seconds, then retry
            # BUG FIX: the original discarded the recursive result, so a
            # successful retry still returned None to the caller.
            return downHtml(url, retry=retry - 1)
        return None
    else:
        return html


def getProxy(html):
    """Extract every proxy (ip, port) pair from the page source.

    :param html: page source to scan
    :return: list of (ip, port) string tuples, possibly empty
    """
    # re.S lets '.*?' span the newlines between the IP and PORT cells.
    proxies = re.findall(
        r'<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>',
        html,
        re.S,
    )
    return proxies


def isAbleToUse(ips):
    """Probe an ip-testing site through the proxy to check it works.

    :param ips: (ip, port) tuple as produced by getProxy()
    :return: None — results are printed
    """
    # Echo service that reports the IP the request arrived from.
    url = "http://httpbin.org/ip"
    addr = '{}:{}'.format(ips[0], ips[1])
    proxy = {'http': addr, 'https': addr}
    proxies = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxies, urllib.request.HTTPHandler)
    # NOTE: install_opener is redundant here since opener.open() is called
    # directly below; kept to preserve the original global side effect.
    urllib.request.install_opener(opener)
    try:
        data = opener.open(url).read().decode()
        print(data)
    except Exception as e:
        # Best-effort probe: any failure just marks the proxy unusable.
        print(e)
    else:
        print(addr, '可用!')


if __name__ == '__main__':
    url = "https://www.kuaidaili.com/free/"
    html = downHtml(url)
    # Guard: downHtml returns None when all retries fail; the original
    # would then crash inside re.findall with a TypeError.
    if html:
        for proxy in getProxy(html):
            isAbleToUse(proxy)