python爬虫之urllib
urllib
提供了一系列用于操作URL的功能。
1.urllib.request
请求模块
urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None):直接请求、指定数据、指定超时时间,超时未响应会抛出异常
urlopen一般常用的有三个参数url,data,timeout
response.read()可以获取到网页的内容。
get
response = urllib.request.urlopen('http://www.baidu.com')
post
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
timeout
在某些网络情况不好或者服务器端异常的情况会出现请求慢的情况,或者请求异常,所以这个时候我们需要给请求设置一个超时时间,而不是让程序一直在等待结果。
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
设置Headers
from urllib import request, parse

url = 'http://httpbin.org/post'
dict = {'name': 'Germey'}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
handler代理
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
cookie保存
cookie中保存着我们常见的登录信息,有时候爬取网站需要携带cookie信息访问,这里用到了http.cookiejar,用于获取cookie以及存储cookie
import http.cookiejar, urllib.request

filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)  # 也可用 http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
cookie读取
同样的如果想要通过获取文件中的cookie获取的话可以通过load方式,当然用哪种方式写入的,就用哪种方式读取。
import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
响应类型、状态码、响应头、响应结果
print(type(response))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
2.urllib.error
异常处理模块
在urllib异常处理这里有两个异常错误:
URLError,HTTPError,HTTPError是URLError的子类
URLError里只有一个属性:reason,即抓异常的时候只能打印错误信息
HTTPError里有三个属性:code,reason,headers,即抓异常的时候可以获得code,reason,headers三个信息
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
3.urllib.parse
url解析模块
url地址拆分
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
>> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
url地址拼接
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=123', 'commit']
print(urlunparse(data))
>>http://www.baidu.com/index.html;user?a=123#commit
urljoin,url地址拼接
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
>>
http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2
urlencode():字典对象改变为get请求对象
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
>>http://www.baidu.com?name=germey&age=22
parse_qs() :将get请求url后的参数转换为字典
from urllib.parse import parse_qs

url = 'http://www.baidu.com?name=germey&age=22'
print(parse_qs(url))
>>{'http://www.baidu.com?name': ['germey'], 'age': ['22']}

注意:parse_qs处理的是查询字符串而不是完整URL,直接传入完整URL时,URL前缀会被并入第一个键(如上输出所示);一般应先拆出query部分再解析。
parse_qsl() :将get请求url后的参数将参数转化为元组组成的列表
from urllib.parse import parse_qsl

url = 'http://www.baidu.com?name=germey&age=22'
print(parse_qsl(url))
>>[('http://www.baidu.com?name', 'germey'), ('age', '22')]

同样,parse_qsl处理的是查询字符串,传入完整URL时URL前缀会被并入第一个元组的键。
urlsplit() :拆分url
from urllib.parse import urlsplit

url = 'http://www.baidu.com?name=germey&age=22'
print(urlsplit(url))
>>SplitResult(scheme='http', netloc='www.baidu.com', path='', query='name=germey&age=22', fragment='')
quote() :将内容转化为URL编码的格式。URL中带有中文参数时,有时可能会导致乱码的问题,此时用这个方法可以将中文字符转化为URL编码.
from urllib.parse import quote

keyword = '壁纸'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)
>>https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8
unquote() :进行URL解码
from urllib.parse import unquote

url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
print(unquote(url))
>>https://www.baidu.com/s?wd=壁纸