Web Scraping: the urllib Library in Detail
1. What is urllib?
urllib is Python's built-in HTTP request library; nothing extra needs to be installed. It bundles four modules: urllib.request (sending requests), urllib.error (exception handling), urllib.parse (URL parsing), and urllib.robotparser (robots.txt parsing).
2. Changes compared with Python 2
In Python 2 the functionality was split between urllib and urllib2; Python 3 merges both into the single urllib package, so urllib2.urlopen() becomes urllib.request.urlopen().
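A minimal side-by-side sketch (the Python 2 lines are commented out, since urllib2 does not exist on Python 3):

######### Python 2 #############
# import urllib2
# response = urllib2.urlopen('http://www.baidu.com')

######### Python 3 #############
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')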
3. Usage
(1) urlopen
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)  # the first parameter is the URL to request, the second is optional request body data, the third is a timeout in seconds; the remaining parameters are rarely needed here
######### GET request #############
import urllib.request

response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))
<!DOCTYPE html>
<!--STATUS OK-->
... (page content omitted) ...
<script>
if(navigator.cookieEnabled){
document.cookie="NOJS=;expires=Sat, 01 Jan 2000 00:00:00 GMT";
}
</script>
</body>
</html>
######### POST request #############
import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen("http://httpbin.org/post", data=data)  # http://httpbin.org/post is a site for testing HTTP requests
print(response.read())
b'{\n "args": {}, \n "data": "", \n "files": {}, \n "form": {\n "word": "hello"\n }, \n "headers": {\n "Accept-Encoding": "identity", \n "Connection": "close", \n "Content-Length": "10", \n "Content-Type": "application/x-www-form-urlencoded", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.5"\n }, \n "json": null, \n "origin": "221.208.253.76", \n "url": "http://httpbin.org/post"\n}\n'
############### Setting a timeout ###############
import urllib.request

response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)  # if no response arrives within the given time, an exception is raised
print(response.read())
b'{\n "args": {}, \n "headers": {\n "Accept-Encoding": "identity", \n "Connection": "close", \n "Host": "httpbin.org", \n "User-Agent": "Python-urllib/3.5"\n }, \n "origin": "221.208.253.76", \n "url": "http://httpbin.org/get"\n}\n'
############### Timeout exceeded ###############
import urllib.request
import urllib.error
import socket

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("Time out")
Time out
(2) Response
Response type
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))
<class 'http.client.HTTPResponse'>
Status code and response headers
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)               # status code
print(response.getheaders())         # all response headers, as a list of (name, value) tuples
print(response.getheader('Server'))  # a single header by name
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ...]
nginx
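Since getheaders() returns a list of (name, value) tuples, it converts naturally to a dict when you need repeated lookups; a small sketch:

import urllib.request

response = urllib.request.urlopen('https://www.python.org')
headers = dict(response.getheaders())  # list of (name, value) tuples -> dict (repeated headers collapse to the last value)
print(headers.get('Server'))           # same result as response.getheader('Server')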
(3) Request
import urllib.request

request = urllib.request.Request("https://python.org")  # a Request object can be passed to urlopen in place of a URL
response = urllib.request.urlopen(request)
print(response.read().decode("utf-8"))
<!doctype html>
<!--[if lt IE 7]> <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 lt-ie8 lt-ie9"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr"> <!--<![endif]-->
<head>
... (page content omitted) ...
</body>
</html>
############ POST request with headers ###############
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
params = {'name': 'Germey'}
data = bytes(parse.urlencode(params), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{ "args": {}, "data": "", "files": {}, "form": { "name": "Germey" }, "headers": { "Accept-Encoding": "identity", "Connection": "close", "Content-Length": "11", "Content-Type": "application/x-www-form-urlencoded", "Host": "httpbin.org", "User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)" }, "json": null, "origin": "221.208.253.76", "url": "http://httpbin.org/post" }
from urllib import request,parse url ="http://httpbin.org/post" dict={ 'name':'Germey' } data =bytes(parse.urlencode(dict),encoding='utf8') req = request.Request(url=url,data=data,method="POST") req.add_header('User-Agent','Mozilla/4.0(compatible;MSIE5.5;Windows NT)') response = request.urlopen(req) print(response.read().decode('utf-8'))
{ "args": {}, "data": "", "files": {}, "form": { "name": "Germey" }, "headers": { "Accept-Encoding": "identity", "Connection": "close", "Content-Length": "11", "Content-Type": "application/x-www-form-urlencoded", "Host": "httpbin.org", "User-Agent": "Mozilla/4.0(compatible;MSIE5.5;Windows NT)" }, "json": null, "origin": "221.208.253.76", "url": "http://httpbin.org/post" }
(4) Handler
Proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',    # proxy for http
    'https': 'https://127.0.0.1:9743'   # proxy for https
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("http://www.baidu.com")
print(response.read())
Since no proxy is actually running at that address, the code raises:
urllib.error.URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>
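If you want plain urlopen() calls to go through the proxy as well, urllib.request.install_opener() registers the opener as the global default. A minimal sketch (the proxy address is still a placeholder):

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',    # placeholder proxy address
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)   # from now on, urlopen() uses this opener by default
response = urllib.request.urlopen('http://www.baidu.com')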
Cookie
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()                   # jar that will receive the cookies
handler = urllib.request.HTTPCookieProcessor(cookie)  # wrap the jar in a handler
opener = urllib.request.build_opener(handler)         # build an opener from the handler
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=DDCB4C216AE8EE90C7D95E7AF8FA577F:FG=1
BIDUPSID=DDCB4C216AE8EE90C7D95E7AF8FA577F
H_PS_PSSID=1452_21078_26350_27111
PSTM=1536830732
BDSVRTM=0
BD_HOME=0
delPer=0
########### Saving cookies to a file ##########
import http.cookiejar, urllib.request

filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)   # MozillaCookieJar writes the Netscape cookie file format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)
A cookie.txt file now appears in the project directory; its contents are:

# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.

.baidu.com      TRUE    /       FALSE   3684314677      BAIDUID CB67C520D33E28D7204C570EB7DFA28F:FG=1
.baidu.com      TRUE    /       FALSE   3684314677      BIDUPSID        CB67C520D33E28D7204C570EB7DFA28F
.baidu.com      TRUE    /       FALSE           H_PS_PSSID      1434_21113_26350_20930
.baidu.com      TRUE    /       FALSE   3684314677      PSTM    1536831034
www.baidu.com   FALSE   /       FALSE           BDSVRTM 0
www.baidu.com   FALSE   /       FALSE           BD_HOME 0
www.baidu.com   FALSE   /       FALSE   2482910974      delPer  0
########### Another way to save cookies ##########
import http.cookiejar, urllib.request

filename = "cookies.txt"
cookie = http.cookiejar.LWPCookieJar(filename)   # LWPCookieJar writes the libwww-perl (LWP) format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)
The code runs exactly as before; only the on-disk format of the saved file differs.
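To reuse saved cookies on a later run, load() reads the file back into a jar; a minimal sketch assuming the cookies.txt written above:

########### Loading cookies back from a file ##########
import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
# the jar class must match the format the file was saved in
cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))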
(5) Exception handling
from urllib import request, error

try:
    response = request.urlopen("http://cuiqingcai.com/index.htm")
except error.URLError as e:
    print(e.reason)
Not Found
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    # HTTPError is caught before its parent class URLError
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print("Request succeeded")
Not Found
404
Server: nginx/1.10.3 (Ubuntu)
Date: Thu, 13 Sep 2018 11:08:18 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.000000001)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):  # e.reason can be an exception instance rather than a string
        print("TimeOut")
<class 'socket.timeout'>
TimeOut
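Taken together, these patterns combine into one defensive fetch helper; a minimal sketch (the fetch name is just for illustration):

from urllib import request, error
import socket

def fetch(url, timeout=5):
    # returns the page bytes, or None on any urllib failure
    try:
        return request.urlopen(url, timeout=timeout).read()
    except error.HTTPError as e:      # the server answered with an error status
        print('HTTP error:', e.code, e.reason)
    except error.URLError as e:       # network-level failure (DNS, refused, timeout)
        if isinstance(e.reason, socket.timeout):
            print('Time out')
        else:
            print('URL error:', e.reason)
    return None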
(6) URL parsing
urlparse (split a URL into its six components)
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse result =urlparse("http://www.baidu.com/index.html;user?id=5i#comment") print(type(result),result)
<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
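ParseResult is a named tuple, so the six components can be read by attribute or by index:

from urllib.parse import urlparse

result = urlparse("http://www.baidu.com/index.html;user?id=5#comment")
print(result.scheme, result[0])   # http http
print(result.netloc, result[1])   # www.baidu.com www.baidu.com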
######## URL without a scheme ###########
from urllib.parse import urlparse

result = urlparse("www.baidu.com/index.html;user?id=5#comment", scheme='https')  # scheme supplies a default when the URL has none
print(result)
ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')
######## URL that already has a scheme ###########
from urllib.parse import urlparse

result = urlparse("http://www.baidu.com/index.html;user?id=5#comment", scheme='https')  # the URL's own scheme wins over the default
print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
from urllib.parse import urlparse result =urlparse("http://www.baidu.com/index.html;user?id=5i#comment",allow_fragments=False) print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
from urllib.parse import urlparse result =urlparse("http://www.baidu.com/index.htmlf#comment",allow_fragments=False) print(result)
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
urlunparse (assemble a URL back from its six components)
from urllib.parse import urlunparse

data = ["http", "www.baidu.com", "index.html", "user", 'a=6', 'comment']
print(urlunparse(data))
http://www.baidu.com/index.html;user?a=6#comment
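Because urlunparse is the inverse of urlparse, feeding one into the other round-trips a URL unchanged:

from urllib.parse import urlparse, urlunparse

url = 'http://www.baidu.com/index.html;user?a=6#comment'
print(urlunparse(urlparse(url)) == url)   # True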
urljoin (joins URLs: the first argument is the base, and any field present in the second argument takes precedence)
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2
urlencode (converts a dict into GET request parameters)
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': '22'
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
http://www.baidu.com?name=germey&age=22
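The same urlencode output also serves as a POST body once it is encoded to bytes, which is exactly what the POST examples earlier did:

from urllib import parse, request

params = {'name': 'germey', 'age': '22'}
data = bytes(parse.urlencode(params), encoding='utf-8')   # query string -> bytes
response = request.urlopen('http://httpbin.org/post', data=data)
print(response.read().decode('utf-8'))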