python urllib库详解
Python的内置HTTP请求库
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块
import urllib.request
from urllib.parse import urlparse, urlunparse, urljoin, urlencode
import http.cookiejar
from urllib import error
import socket

# --- urlopen: making requests ---
# data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf-8')
# response = urllib.request.urlopen('http://httpbin.org/post', data=data)  # POST request
# print(response.read())
#
# # Response object and its type
# response = urllib.request.urlopen('https://www.python.org')
# print(type(response))
#
# # Status code and response headers
# response = urllib.request.urlopen('http://www.python.org')
# print(response.status)
# print(response.getheaders())
# print(response.getheader('Server'))
#
# # Read the response body
# response = urllib.request.urlopen('http://www.python.org')
# print(response.read().decode('utf-8'))
# # Request object
# request = urllib.request.Request('http://www.python.org')
# response = urllib.request.urlopen(request)
# print(response.read().decode('utf-8'))

# --- Build a POST request with custom headers ---
# url = 'http://httpbin.org/post'
# headers = {
#     'User-Agent': 'Mozilla/4.0(compatible;MSE 5.5;Windows NT)',
#     'Host': 'httpbin.org'
# }
# form = {
#     'name': 'Germey'
# }
# data = bytes(urllib.parse.urlencode(form), encoding='utf-8')  # form body
# req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
# response = urllib.request.urlopen(req)
# print(response.read().decode('utf-8'))
#
# # --- Handler: proxy, to switch the outgoing IP ---
# proxy_handler = urllib.request.ProxyHandler({
#     'http': 'http://127.0.0.1:9743',
#     'https': 'https://127.0.0.1:9743'
# })
# opener = urllib.request.build_opener(proxy_handler)
# response = opener.open('http://httpbin.org/post')
# print(response.read())

# --- Cookies: record user identity and keep the login session alive ---
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)

# # To keep login state across runs, cookies can be saved to a text file
# filename = 'cookie.txt'
# cookie = http.cookiejar.LWPCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://baidu.com')
# cookie.save(ignore_discard=True, ignore_expires=True)
# # Load the cookies back from the text file
# cookie = http.cookiejar.LWPCookieJar()
# cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))

# --- Exception handling ---
# urllib defines two main exception classes: HTTPError and URLError.
# HTTPError is a subclass of URLError, so it must be caught first.
try:
    response = urllib.request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print("Request Successfully")

# Identify the concrete underlying cause with isinstance
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# --- URL parsing ---
# urllib.parse.urlparse(url, scheme='', allow_fragments=True)
# urlparse splits a URL into scheme, netloc, path, params, query, fragment
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

# scheme= only supplies a default; it does not override a scheme
# already present in the URL
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)

# fragment is the anchor; with allow_fragments=False it is folded into
# the preceding component instead of being split out
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# urlunparse assembles a URL from its six components.
# BUG FIX: the original used a set literal {...}; sets are unordered, so the
# components came out scrambled (e.g. user://www.baidu.com/http;...).
# urlunparse requires an ordered 6-item sequence.
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment

# urljoin: components present in the second URL override those of the base
print(urljoin('http://www.baidu.com', 'FAQ.html'))  # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))  # https://cuiqingcai.com/FAQ.html

# urlencode turns a dict into GET query parameters
params = {
    'name': 'gemmey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)