Web Scraping (Part 2): The Urllib Library in Detail
What is Urllib?
Python's built-in HTTP request library, consisting of four modules:
urllib.request: the request module
urllib.error: the exception-handling module
urllib.parse: the URL-parsing module
urllib.robotparser: the robots.txt-parsing module
GET requests
POST requests
Timeouts and exception handling
Response objects (status code, headers, ...)
Adding headers to a POST request
Proxies
Saving and loading cookies
---------- In the parse module ----------
urlparse: parse a URL into its parts
urlunparse: assemble a URL from its parts
urlencode: turn a dict into a GET query string (very useful)
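urllib.robotparser is listed above but never comes up again below, so here is a minimal sketch of how it reads a site's robots.txt (the baidu.com URLs are only illustrative):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')  # where the robots.txt lives
rp.read()                                      # download and parse it
print(rp.can_fetch('*', 'http://www.baidu.com/index.html'))  # may user agent '*' fetch this URL?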
The examples below use Python 3.
urlopen
# urlopen parameters:
# urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
# i.e. the URL, optional POST data, and an optional timeout

#############################
import urllib.request

# GET request (no data argument)
response = urllib.request.urlopen('http://www.baidu.com')  # send the request
print(response.read().decode('utf-8'))  # read() returns bytes, so decode to str

#############################
import urllib.parse
import urllib.request

# POST request (with a data argument)
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)  # http://httpbin.org/post is a site for testing HTTP requests
print(response.read())

#############################
import urllib.request

# timeout setting
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
#############################
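Since httpbin.org echoes requests back as JSON, a natural follow-up is to decode the response body with the json module; a small sketch (the 'form' key is part of httpbin's response format):

import json
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
payload = json.loads(response.read().decode('utf-8'))  # parse the JSON body into a dict
print(payload['form'])  # httpbin echoes the posted form back: {'word': 'hello'}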
Response
# Response type
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))  # <class 'http.client.HTTPResponse'>

#############################
# Status code and response headers
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)               # status code
print(response.getheaders())         # all response headers
print(response.getheader('Server'))  # a single header, here the serving software

#############################
import urllib.request

# Response body
response = urllib.request.urlopen('https://www.python.org')
print(response.read().decode('utf-8'))  # read() returns bytes
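In Python 3, HTTPResponse is also a context manager, so a with statement closes the connection automatically; a minimal sketch:

import urllib.request

with urllib.request.urlopen('https://www.python.org') as response:
    print(response.status, response.reason)  # e.g. 200 OK
    # the response is closed automatically when the block exits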
Request
# Send a POST request with custom headers
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
params = {  # form fields (avoid naming this "dict", which shadows the built-in)
    'name': 'Germey'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
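Headers can also be attached after the Request is built, one at a time, with Request.add_header; a brief sketch of the same POST:

from urllib import request, parse

url = 'http://httpbin.org/post'
data = bytes(parse.urlencode({'name': 'Germey'}), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')  # add one header at a time
response = request.urlopen(req)
print(response.read().decode('utf-8'))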
Handler
# Use a proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({  # proxy settings
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9742'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
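If every request should go through the proxy, the opener can be installed globally with urllib.request.install_opener, after which plain urlopen calls use it too; a sketch (same illustrative proxy address as above):

import urllib.request

proxy_handler = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:9743'})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)  # make this opener the default

response = urllib.request.urlopen('http://httpbin.org/get')  # now routed through the proxy
print(response.read())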
Cookie
import http.cookiejar, urllib.request

# Collect the cookies a site sets
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)

#############################
# Save the cookies to a txt file
import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

#############################
# Load the cookie file
import http.cookiejar, urllib.request

# Load cookies with the same CookieJar class that saved them
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
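Besides LWPCookieJar, http.cookiejar also offers MozillaCookieJar, which saves in the browser-compatible Netscape format; as noted above, load with the same class you saved with. A sketch (cookie_mozilla.txt is just an example filename):

import http.cookiejar, urllib.request

filename = 'cookie_mozilla.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # Netscape format instead of LWP
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)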
Exception handling
# Exception handling, take 1
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')  # a page that does not exist
except error.URLError as e:
    print(e.reason)

#############################
# Exception handling, take 2
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:  # HTTPError is a subclass of URLError, so catch it first
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

#############################
# Exception handling, take 3
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))  # a timeout surfaces as a socket.timeout inside e.reason
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
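One more detail worth knowing: an HTTPError is itself a file-like response, so the body the server sent along with the error can be read; a small sketch against the same missing page used above:

from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.read().decode('utf-8'))  # the HTML of the server's error page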
URL parsing
# One argument
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

##########################
# Default scheme: used only if the URL has none; a scheme in the URL itself wins
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)  # scheme stays 'http' because the URL already carries one

##########################
# allow_fragments=False (rarely needed): the fragment is folded into the query,
# or into the path when there is no query (as here, so '#comment' stays in the path)
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
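The ParseResult returned by urlparse is a named tuple, so each piece is available by attribute as well as by index; a quick sketch:

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])  # http http -- attribute and index access are equivalent
print(result.netloc)             # www.baidu.com
print(result.query)              # id=5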
urlunparse
from urllib.parse import urlunparse

# Assemble a URL from its six parts
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment
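urlunparse is the inverse of urlparse, so parsing a URL and reassembling it gives the original back; a one-line check:

from urllib.parse import urlparse, urlunparse

url = 'http://www.baidu.com/index.html;user?a=6#comment'
print(urlunparse(urlparse(url)) == url)  # True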
urljoin
from urllib.parse import urljoin

# Join a base URL with another URL; the second argument takes precedence
print(urljoin('http://www.baidu.com', 'Faq.html'))                   # http://www.baidu.com/Faq.html
print(urljoin('http://www.baidu.com', 'https://www.baidu.com/aaa'))  # the absolute second URL wins
print(urljoin('http://www.baidu.com', '?a=1'))                       # http://www.baidu.com?a=1
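When the base URL ends in a path, a relative second argument is resolved against the base's directory, just like a link on a web page; a couple of illustrative cases:

from urllib.parse import urljoin

print(urljoin('http://www.baidu.com/about/', 'Faq.html'))            # http://www.baidu.com/about/Faq.html
print(urljoin('http://www.baidu.com/about/index.html', 'Faq.html'))  # http://www.baidu.com/about/Faq.html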
urlencode
# Build a GET query string from a dict
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=germey&age=22
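The inverse direction is handled by parse_qs, and quote/unquote percent-encode and decode individual URL components; a short sketch:

from urllib.parse import parse_qs, quote, unquote

print(parse_qs('name=germey&age=22'))  # {'name': ['germey'], 'age': ['22']}
print(quote('你好'))                    # %E4%BD%A0%E5%A5%BD -- safe for use inside a URL
print(unquote('%E4%BD%A0%E5%A5%BD'))   # 你好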