urllib模块使用

urllib.request

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

# Fetch a small document; read() returns the raw body as bytes.
url = 'http://httpbin.org/ip'
with urllib.request.urlopen(url) as response:  # the with-block closes the connection
    html = response.read()
print(html)

# Fetch a page and turn the bytes body into str with decode().
url = 'http://www.baidu.com'
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)

发送post数据

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'

# Form fields to submit.
data = {
    'name' : "小明",
    'age' : 30
}
# urlopen() rejects a str body ("POST data should be bytes, an iterable of
# bytes, or a file object. It cannot be of type str"), so the urlencoded
# string has to be encoded to bytes first.
data = urllib.parse.urlencode(data).encode('utf-8')
with urllib.request.urlopen(url, data=data) as response:  # supplying data sends a POST
    html = response.read().decode('utf-8')
print(html)

设置timeout

import urllib.request

url = 'http://httpbin.org/get'
# timeout is given in seconds; the with-block closes the connection afterwards.
with urllib.request.urlopen(url, timeout=1) as response:
    html = response.read().decode('utf-8')
print(html)

import socket
import urllib.request
import urllib.error

url = 'http://httpbin.org/get'
try:
    # A deliberately tiny timeout so the request is likely to time out.
    with urllib.request.urlopen(url, timeout=0.1) as response:
        html = response.read().decode('utf-8')
    print(html)
except urllib.error.URLError as e:
    # Failures while connecting/sending are wrapped in URLError; the
    # underlying exception is available as e.reason.
    print("捕获异常....")
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("请求超时")
except socket.timeout:
    # A timeout while waiting for or reading the response is raised directly
    # as socket.timeout rather than wrapped in URLError.
    print("捕获异常....")
    print("请求超时")

响应

响应类型、状态码、响应头、实际获取的url

import urllib.request

url = 'http://www.python.org'
# The with-block closes the connection once the response has been inspected.
with urllib.request.urlopen(url) as response:
    # Response type
    response_type = type(response)
    print(response_type)  # <class 'http.client.HTTPResponse'>
    # Status code
    status_code = response.getcode()
    print(status_code)
    # Reason phrase that accompanies the status code
    status = response.reason
    print(status)    # e.g. OK for 200, Not Found for 404
    # Response headers
    response_headers = response.getheaders()  # returns a list
    print(response_headers)
    server_type = response.getheader('Server')  # getheader() fetches one named header
    print(server_type)
    print(type(response.headers))  # <class 'http.client.HTTPMessage'>
    content_type = response.headers['Content-Type']  # look up Content-Type by name
    print(content_type)
    # URL actually retrieved; compare with the requested one to detect a redirect
    actual_url = response.geturl()
    print(actual_url)

class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

import urllib.request

url = 'http://httpbin.org/get'
request = urllib.request.Request(url)  # build the request object
with urllib.request.urlopen(request) as response:  # send it; closed on exit
    html = response.read().decode('utf-8')
print(html)
# The default User-Agent is "Python-urllib/x.x", where x.x is the Python version.

发送post数据

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
data = {
    'name' : 'peter', 
    'age' : 20
}

# POST data must be bytes (or an iterable of bytes / a file object), never str,
# so the urlencoded form is encoded before it is attached to the request.
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url, data=data, headers=headers)
with urllib.request.urlopen(request) as response:  # closed on exit
    html = response.read().decode('utf-8')
print(html)
# When posting data this way: "Content-Type": "application/x-www-form-urlencoded"

urllib.request.Request 对象方法

import urllib.request


url = 'http://httpbin.org/get'
request = urllib.request.Request(url)
# add_header(key, val) attaches one header to the request.
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
with urllib.request.urlopen(request) as response:  # closed on exit
    html = response.read().decode('utf-8')
print(html)

Handlers

ProxyHandler(代理)

import urllib.request

# Mapping of protocol name -> proxy address ("ip:port").
proxy_dict = {
    'http': '127.0.0.1:6688',
    'https': '127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}


proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
# After install_opener(), plain urllib.request.urlopen() goes through this opener.
urllib.request.install_opener(opener)

# addheaders is a list of (name, value) tuples sent with every request.
opener.addheaders = list(headers.items())

url = 'http://www.whatismyip.com.tw/'  # IP echo page: shows the proxy's IP when the proxy is used
with urllib.request.urlopen(url) as response:  # closed on exit
    print(response.read().decode('utf-8'))

# Common error:
# HTTPError: HTTP Error 403: Forbidden - most likely the proxy server restricts
# access and the current IP is not on its allow list.

代理需要身份认证

# 错误提示: HTTPError: HTTP Error 407: Proxy Authentication Required

# Method 1: put the credentials in the proxy URL: http://user:password@ip:port
import urllib.request

# Mapping of protocol name -> proxy URL including the credentials.
proxy_dict = {
    'http': 'http://name:password@127.0.0.1:6688',
    'https': 'http://name:password@127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}


proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)

# addheaders is a list of (name, value) tuples sent with every request.
opener.addheaders = list(headers.items())

url = 'http://www.whatismyip.com.tw/'  # IP echo page: shows the proxy's IP when the proxy is used
with opener.open(url) as response:  # closed on exit
    print(response.read().decode('utf-8'))

# Method 2: let ProxyBasicAuthHandler answer the proxy's login challenge
# (requires the matching user name and password).
import urllib.request

# Mapping of protocol name -> proxy URL.
proxy_dict = {
    'http': 'http://127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}


proxy_handler = urllib.request.ProxyHandler(proxy_dict)
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# realm=None registers the credentials as the default for this proxy address.
password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password')
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
urllib.request.install_opener(opener)

# addheaders is a list of (name, value) tuples sent with every request.
opener.addheaders = list(headers.items())

url = 'http://www.whatismyip.com.tw/'  # IP echo page: shows the proxy's IP when the proxy is used
with opener.open(url) as response:  # closed on exit
    print(response.read().decode('utf-8'))

HTTPBasicAuthHandler

用于访问web服务器时的身份验证

import urllib.request

url = 'http://127.0.0.1/test/'
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Register the user name and password to use for this URL (realm=None: any realm).
password_mgr.add_password(None, url, 'admin','password')
http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(http_auth_handler)
with opener.open(url) as response:  # closed on exit
    print(response.read().decode('utf-8'))

FTPHandler

import urllib.request


host = 'ftp1.linuxidc.com'
username = 'ftp1.linuxidc.com'
password = 'www.linuxidc.com'

# FTP credentials travel inside the URL: ftp://user:password@host
# NOTE(review): credentials containing reserved characters such as '@' or ':'
# would need percent-encoding (urllib.parse.quote) before being inserted.
ftp_url = 'ftp://%s:%s@%s' % (username, password, host)
ftp_handler = urllib.request.FTPHandler()
opener = urllib.request.build_opener(ftp_handler)
with opener.open(ftp_url) as response:  # closed on exit
    # 'ignore' drops bytes that are not valid UTF-8 instead of raising.
    print(response.read().decode('utf-8', 'ignore'))

HTTPHandler、HTTPSHandler

import urllib.request


# Handlers created with debuglevel=1 turn on the debug log, so the data sent
# and received is printed to the screen, which helps when debugging.
debug_handlers = (
    urllib.request.HTTPHandler(debuglevel=1),
    urllib.request.HTTPSHandler(debuglevel=1),
)
opener = urllib.request.build_opener(*debug_handlers)

url = 'http://www.baidu.com'
response = opener.open(url)

'''
效果:
send: b'GET / HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: www.baidu.com\r\nUser-Agent: Python-urllib/3.6\r\nConnection: close\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Date header: Content-Type header: Transfer-Encoding header: Connection header: Vary header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: P3P header: Cache-Control header: Cxy_all header: Expires header: X-Powered-By header: Server header: X-UA-Compatible header: BDPAGETYPE header: BDQID header: BDUSERID 
'''

CookieJar

import urllib.request
import http.cookiejar


url = 'http://www.baidu.com'
cookie = http.cookiejar.CookieJar()  # in-memory cookie store
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_handler)
with opener.open(url) as response:  # closed on exit
    print(response.getcode())
# Iterating the jar yields http.cookiejar.Cookie objects.
for item in cookie:
    print(item.name, item.value, sep=" : ")

MozillaCookieJar

创建与Mozilla cookies.txt文件兼容的FileCookieJar实例

import urllib.request
import http.cookiejar


url = 'https://www.zhihu.com/settings/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
}

cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_handler)
# addheaders is a list of (name, value) tuples sent with every request.
opener.addheaders = list(headers.items())

try:
    cookie.load()    # load the cookies from the file into memory - essential
except http.cookiejar.LoadError:
    # LoadError is itself an OSError subclass, so it must be handled first.
    print('cookie文件加载失败')
except IOError:
    print("cookie文件不存在")

with opener.open(url) as response:  # closed on exit
    # Compare geturl() with url to tell whether the login worked; on failure
    # the request is redirected to the Zhihu login page.
    print(response.geturl())
    html = response.read().decode('utf-8')
print(html)

# After a successful login, call the MozillaCookieJar's save() method to write
# the in-memory cookies back to the file.

LWPCookieJar

创建与libwww-perl Set-Cookie3文件兼容的FileCookieJar实例

import urllib.request
import http.cookiejar


url = 'http://www.baidu.com'
cookie = http.cookiejar.LWPCookieJar("cookies.txt")
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
with opener.open(url):  # only the cookies are needed; close the response right away
    pass
# save() must be called to write the in-memory cookies to the local file; next
# time, load() brings them back into memory.
cookie.save(ignore_discard=True, ignore_expires=True)

异常处理

URLError

引起URLError的原因通常有: 无网络连接(即本机无法上网)、访问的目标服务器不存在。在这种情况下，异常对象会有reason属性，它可能是一个描述错误的字符串，也可能是另一个异常实例(例如socket.gaierror、socket.timeout)。捕获异常方法如下:

import urllib.error
import urllib.request


try:
    response = urllib.request.urlopen('http://www.hello_world.org')
except urllib.error.URLError as e:  # URLError is defined in urllib.error
    print(type(e.reason))  # <class 'socket.gaierror'>
    print(e.reason)  # [Errno 11001] getaddrinfo failed
else:
    response.close()  # request unexpectedly succeeded: release the connection

HTTPError

HTTPError是URLError的子类。每次调用urlopen方法发出一个请求时，服务器都会返回对应的response，其中包含一个数字"状态码"。
常见的状态码有200(请求成功)、302(重定向)、304(文档的内容(自上次访问以来或者根据请求的条件)并没有改变)。
有些状态码表示服务器无法完成请求；对于这类无法处理的响应，urlopen会抛出HTTPError。
典型的错误包括404(页面没有找到)、403(请求被禁止)、401(当前请求需要用户认证)、407(需要代理验证)、500(服务器内部错误)。

# 方式1
import urllib.request
import urllib.error


url = 'http://www.hello_world.org'
# Alternative URL to try: 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
# HTTPError is a subclass of URLError, so it has to be handled first.
except urllib.error.HTTPError as e:
    print("The server cannot fulfill the request...")
    print("Error code: ", e.code)
    print("Reason: ", e.reason)
except urllib.error.URLError as e:
    print("failed to fetch the server...")
    print("Reason: ", e.reason)
else:
    response.close()  # success: release the connection

# 方式2
import urllib.request
import urllib.error


url = 'http://www.hello_world.org'
# Alternative URL to try: 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
    # A single handler for both cases: the presence of a `code` attribute
    # selects the "server answered with an error status" branch.
    if hasattr(e, 'code'):
        print("The server cannot fulfill the request...")
        print("Error code: ", e.code)
        print("Reason: ", e.reason)
    else:
        print("failed to fetch the server...")
        print("Reason: ", e.reason)
else:
    response.close()  # success: release the connection

urllib.parse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

负责解析URL

from urllib.parse import urlparse

# urlparse(url, scheme='', allow_fragments=True) splits a URL of the form
#   <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# into the 6-tuple (scheme, netloc, path, params, query, fragment).

full_url = 'http://www.baidu.com/index.html;user?id=100#comment'

result = urlparse(full_url)
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)   # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# netloc is only recognised when the URL contains "//".
scheme_relative = '//www.baidu.com/index.html;user?id=100#comment'
result = urlparse(scheme_relative, scheme="https")
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# Without "//" the host name ends up in the path.
no_slashes = 'www.baidu.com/index.html;user?id=100#comment'
result = urlparse(no_slashes, scheme="https")
print(result)  # ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')

# The URL already carries a scheme, so that one wins over the scheme argument.
result = urlparse(full_url, scheme="https")
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# allow_fragments=False leaves "#..." attached to the preceding component.
result = urlparse(full_url, allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')

path_only = "http://www.baidu.com/index.html#comment"
result = urlparse(path_only, allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

urllib.parse.urlunparse(parts)

from urllib.parse import urlunparse


# urlunparse() takes the six components in the order
# (scheme, netloc, path, params, query, fragment) and rebuilds the URL.
scheme, netloc, path = "http", 'www.baidu.com', 'index.html'
params, query, fragment = 'user', 'id=100', 'comment'
data = (scheme, netloc, path, params, query, fragment)
url = urlunparse(data)
print(url)

urllib.parse.urljoin(base, url, allow_fragments=True)

# urljoin() resolves the second argument against the base URL: the base
# supplies whatever the second URL lacks (scheme, netloc), and an absolute
# second URL simply replaces the base.
from urllib.parse import urljoin


# (base, url) pairs to resolve.
cases = [
    ("http://www.baidu.com", "FAQ.html"),
    ("http://www.baidu.com/index.html", "FAQ.html"),
    ("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html"),
    ("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"),
    ("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"),
    ("http://www.baidu.com/", "?category=5#comment"),
    ("http://www.baidu.com/#comment", "?category=5"),
]
joined = [urljoin(base, relative) for base, relative in cases]
for joined_url in joined:
    print(joined_url)

# Output:
# http://www.baidu.com/FAQ.html
# http://www.baidu.com/FAQ.html
# http://www.google.com/FAQ.html
# http://www.google.com/FAQ.html?question=2
# http://www.google.com/FAQ.html
# http://www.baidu.com/?category=5#comment
# http://www.baidu.com/?category=5

urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

 from urllib.parse import urlencode

basic_url = 'http://httpbin.org/get'
data = {
    "key": '天气',
}
data = urlencode(data)
full_url = '%s?%s' % (basic_url, data)
print(full_url) # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94

posted on 2017-05-30 22:47 wanlifeipeng 阅读(337) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

wanlifeipeng

公告

搜索

积分与排名

随笔分类

阅读排行榜

评论排行榜

推荐排行榜

最新评论

urllib模块使用

urllib.request

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

发送post数据

设置timeout

响应

响应类型、状态码、响应头、实际获取的url

class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)¶

发送post数据

urllib.request.Request 对象方法

Handlers

ProxyHandler(代理)

代理需要身份认证

HTTPBasicAuthHandler

用于访问web服务器时的身份验证

FTPHandler

HTTPHandler、HTTPSHandler

CookieJar

MozillaCookieJar

创建与Mozilla cookies.txt文件兼容的FileCookieJar实例

LWPCookieJar

创建与libwww-perl Set-Cookie3文件兼容的FileCookieJar实例

异常处理

URLError

HTTPError

urllib.parse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)¶

负责解析URL

urllib.parse.urlunparse(parts)

urllib.parse.urljoin(base, url, allow_fragments=True)

urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)