爬虫学习--Urllib库基本使用 Day1

一、Urllib库详解

1、什么是Urllib

Python内置的HTTP请求库

urllib.request 　　　请求模块（模拟实现传入网址访问）

urllib.error 　　异常处理模块（如果出现错误，可以捕捉这个异常，然后进行重试或其他操作，保证程序不会意外中止）

urllib.parse url解析模块（工具模块，提供了许多url处理方法，例如：拆分，合并等）

urllib.robotparser robots.txt解析模块（主要是用来识别网页的robots.txt文件，判断哪些网站是可以爬的，哪些是不可以爬的）

2、相比Python2的变化

Python2

import urllib2

response = urllib2.urlopen('http://www.baidu.com')

Python3

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')

3、基本用法

Urllib

urlopen

urllib.request.urlopen(url,data=None,[timeout,]*,cafile=None,capath=None,cadefault=False,context=None)

方法1

1 import urllib.request
2 
3 response = urllib.request.urlopen('http://www.baidu.com')
4 print(response.read().decode('utf-8'))  # 获取响应体的内容，用decode('utf-8')解码后显示

方法2

import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word':'hello'}),encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post',data=data) # 加了data 是以POST方式传递，不加则是GET方式传递
print(response.read())

方法3

1 import urllib.request
2 
3 response = urllib.request.urlopen('http://httpbin.org/get',timeout=1)
4 print(response.read())

方法4

 1 import socket
 2 import urllib.request
 3 import urllib.error
 4 
 5 
 6 try:
 7     response = urllib.request.urlopen('http://httpbin.org/get',timeout=0.1)
 8 except urllib.error.URLError as e:
 9     if isinstance(e.reason,socket.timeout):
10         print('TIME OUT')

响应

响应类型

1 import urllib.request
2 
3 response = urllib.request.urlopen('http://www.baidu.com')
4 print(type(response))

状态码、响应头

1 import urllib.request
2 
3 response = urllib.request.urlopen('http://www.python.org')
4 print(response.status) # 获取状态码
5 print(response.getheaders())  # 获取响应头
6 print(response.getheader('Server')) # 获取特定的响应头，这里拿 Server举例

Request

url作为对象传给urlopen

1 import urllib.request
2 
3 request = urllib.request.Request('https://python.org') # 把url封装成一个对象
4 response = urllib.request.urlopen(request)  # 把对象传给urlopen一样可以访问
5 print(response.read().decode('utf-8'))

为Request对象添加请求头、请求数据和请求方法

 1 from urllib import request,parse
 2 
 3 url = 'http://httpbin.org/post'
 4 headers={
 5     'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
 6     'Host':'httpbin.org'
 7 }
 8 dict = {
 9     'name':'Germey'
10 }
11 data = bytes(parse.urlencode(dict),encoding='utf-8')
12 req = request.Request(url=url,data=data,headers=headers,method='POST')
13 response = request.urlopen(req)
14 print(response.read().decode('utf-8'))

request.add_header()方法

 1 from urllib import request,parse
 2 
 3 url = 'http://httpbin.org/post'
 4 dict = {
 5     'name':'Germey'
 6 }
 7 data = bytes(parse.urlencode(dict),encoding='utf-8')
 8 req = request.Request(url=url,data=data,method='POST')
 9 req.add_header('User-Agent','Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
10 response = request.urlopen(req)
11 print(response.read().decode('utf-8'))

Handler

代理

 1 import urllib.request
 2 
 3 # 构建了两个代理Handler，一个有代理IP，一个没有代理IP
 4 httpproxy_handler = urllib.request.ProxyHandler({"http" : "127.0.0.1:9743"})
 5 nullproxy_handler = urllib.request.ProxyHandler({})
 6 #定义一个代理开关
 7 proxySwitch = True
 8 # 通过 urllib.request.build_opener()方法使用这些代理Handler对象，创建自定义opener对象
 9 # 根据代理开关是否打开，使用不同的代理模式
10 if proxySwitch:
11     opener = urllib.request.build_opener(httpproxy_handler)
12 else:
13     opener = urllib.request.build_opener(nullproxy_handler)
14 
15 request = urllib.request.Request("http://www.baidu.com/")
16 
17 # 使用opener.open()方法发送请求才使用自定义的代理，而urlopen()则不使用自定义代理。
18 response = opener.open(request)
19 
20 # install_opener()将opener应用到全局，之后不管是用opener.open()还是urlopen()发送请求，都将使用自定义代理。
21 urllib.request.install_opener(opener)
22 # response = urlopen(request)
23 
24 print(response.read())

使用选择的代理构建代理处理器对象

 1 import urllib.request
 2 
 3 # 使用选择的代理构建代理处理器对象
 4 proxy_handler = urllib.request.ProxyHandler({
 5     'http':'http://127.0.0.1:9743',
 6     'https':'https://127.0.0.1:9743'
 7 })
 8 opener = urllib.request.build_opener(proxy_handler)
 9 request = urllib.request.Request("http://www.baidu.com")
10 response = opener.open(request)
11 print(response.read())

Cookie：维持登录状态的一种机制

实现cookie的获取

import http.cookiejar,urllib.request

1 import http.cookiejar,urllib.request
2 
3 cookie = http.cookiejar.CookieJar()
4 handler = urllib.request.HTTPCookieProcessor(cookie)
5 opener = urllib.request.build_opener(handler)
6 response = opener.open('http://www.baidu.com')
7 for item in cookie:
8     print(item.name+"="+item.value)

把cookie保存成一个文本文件

1 import http.cookiejar,urllib.request
2 
3 filename = "cookie.txt"
4 cookie = http.cookiejar.MozillaCookieJar(filename) # CookieJar子类的一个对象 MozillaCookieJar()
5 handler = urllib.request.HTTPCookieProcessor(cookie)
6 opener = urllib.request.build_opener(handler)
7 response = opener.open('http://www.baidu.com')
8 cookie.save(ignore_discard=True,ignore_expires=True) #  MozillaCookieJar()里包含了一个save()方法保存成txt文件

Cookie的另一种保存格式（方法2）

1 import http.cookiejar,urllib.request
2 
3 filename = "cookie.txt"
4 cookie = http.cookiejar.LWPCookieJar(filename) # CookieJar子类的一个对象 LWPCookieJar()
5 handler = urllib.request.HTTPCookieProcessor(cookie)
6 opener = urllib.request.build_opener(handler)
7 response = opener.open('http://www.baidu.com')
8 cookie.save(ignore_discard=True,ignore_expires=True) #  LWPCookieJar()里包含了一个save()方法保存成txt文件

用方法2对应的LWPCookieJar()读取已保存的Cookie

import http.cookiejar,urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt',ignore_discard=True,ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8')) # 用文本文件的方式存储cookie，再读取出来放在request里请求访问网页，请求的结果就是登录时看到的结果

URL解析

 1 # urlparse  urllib.parse.urlparse(urlstring,scheme='',allow_fragments=True)
 2 # 把url分割成许多部分
 3 from urllib.parse import urlparse,urlunparse
 4 
 5 result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
 6 print(type(result),result) # 输出 <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
 7 
 8 # 指定协议类型
 9 result = urlparse('www.baidu.com/index.html;user?id=5#comment',scheme='https')
10 print(result) # 输出 ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')
11 
12 #如果url里已经带有协议，则scheme参数不生效，以url中的协议为准
13 result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',scheme='https')
14 print(result) # 输出 ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
15 
16 #锚点链接 allow_fragments参数
17 result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',allow_fragments=False)
18 print(result) # 将comment拼接到query里 ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
19 
20 #url中没有query时，#comment不再被解析为fragment，而是直接留在path里
21 result = urlparse('http://www.baidu.com/index.html#comment',allow_fragments=False)
22 print(result) # 输出 ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
23 
24 #-----------------------------------------------------------------------------------------------------------------------
25 # urlunparse 将url的各个组成部分拼接成完整的url
26 data = ['http','www.baidu.com','index.html','user','a=6','comment']
27 print(urlunparse(data)) # 输出 http://www.baidu.com/index.html;user?a=6#comment
28 
29 #-----------------------------------------------------------------------------------------------------------------------
30 # urljoin 后面url里的字段会覆盖前面的url
31 from urllib.parse import urljoin
32 print(urljoin('http://www.baidu.com/about.html','https://cuiqincai.com/FAQ.html'))
33 # 输出 https://cuiqincai.com/FAQ.html
34 
35 #-----------------------------------------------------------------------------------------------------------------------
36 from urllib.parse import urlencode
37 
38 params = {
39     'name':'germey',
40     'age':22
41 }
42 base_url = 'http://www.baidu.com?'
43 url = base_url + urlencode(params) # 把字典转换成请求参数
44 print(url) # 输出 http://www.baidu.com?name=germey&age=22

异常处理

 1 # from urllib import request,error # 1，2可用
 2 # 打印出异常处理
 3 # try:
 4 #     response = request.urlopen('http://wyh.com/index.html')
 5 # except error.URLError as e:
 6 #     print(e.reason) # 打印出异常原因，保证程序可以继续正常运行
 7 
 8 # 具体可以捕捉哪些异常
 9 # try:
10 #     response = request.urlopen('http://wyh.com/index.html')
11 # except error.HTTPError as e: # HTTPError是子类异常
12 #     print(e.reason,e.code,e.headers,sep='\n') # e.headers 打印响应头的一些信息
13 # except error.URLError as e:  # URLError是父类异常
14 #     print(e.reason)
15 # else:
16 #     print('Request Successfully!')
17 
18 # 加一个原因判断
19 import socket
20 import urllib.request
21 import urllib.error
22 
23 try:
24     response = urllib.request.urlopen('http://www.baidu.com',timeout=0.01)
25 except urllib.error.URLError as e:
26     print(type(e.reason)) # 它是一个类
27     if isinstance(e.reason,socket.timeout): # isinstance()方法判断是不是匹配的
28         print('TIME OUT!')

posted @ 2019-06-19 21:40 Xiaohu_BigData 阅读(341) 评论(0) 编辑收藏举报

刷新页面返回顶部

Xiaohu_BigData

爬虫学习--Urllib库基本使用 Day1

公告