Basic usage of urllib
Official documentation: https://docs.python.org/zh-cn/3/library/urllib.html
The Python urllib library works with web URLs and fetches and processes page content. It contains four modules: request, error, parse, and robotparser.
request
request is the most basic HTTP request module; it sends HTTP requests and receives the server's response. Below are some examples of the basic methods in urllib's request module:
import urllib.request

url = 'http://www.baidu.com'
# Send the request
response = urllib.request.urlopen(url)
# Print the type of response: <class 'http.client.HTTPResponse'>
print(type(response))
# Get the page source (read returns bytes) and decode it as utf-8
content = response.read().decode('utf-8')
# Read the first 5 bytes of the page source
response.read(5)
# Read one line of the page source
line = response.readline()
# Read the remaining lines of the page source until the end
# (these all come back empty here: HTTPResponse is a file-like stream,
# and the read() call above already consumed the whole body)
lines = response.readlines()
# Get the status code
code = response.getcode()
# Get the URL that was accessed
url = response.geturl()
# Get the response headers
headers = response.getheaders()
Downloading files
# Download a file (a web page, an image, a video, ...); first argument: the file URL, second argument: the local file name
urllib.request.urlretrieve(url, 'baidu.html')
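As a side note, urlretrieve also returns a (filename, headers) pair, which is handy for checking what was saved; a small sketch reusing the url above (the Content-Type lookup assumes the server sent that header):

filename, headers = urllib.request.urlretrieve(url, 'baidu.html')
print(filename)                 # 'baidu.html', the local path the data was written to
print(headers['Content-Type'])  # headers is an http.client.HTTPMessage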
Sending a request with a constructed Request
Some sites check whether a request carries a UA (User-Agent); if it doesn't, the request is treated as abnormal and no data is returned. So sometimes you need to construct a Request that carries a UA:
# Set the UA
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
# Build the Request. Note: pass the arguments by name (the second positional parameter is data)
request = urllib.request.Request(url='https://www.baidu.com', headers=header)
https_content = urllib.request.urlopen(request).read().decode('utf8')
About UA: User Agent, UA for short, is a special header string that lets the server identify the client's operating system and version, CPU type, browser and version, browser engine, rendering engine, language, plugins, and so on. (Press F12 to open the browser's developer tools; you can see the UA in a request's Request Headers.)
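As a quick illustration (a small sketch of my own, reusing the header dict above): a Request built without headers carries no UA of its own; urllib fills in Python-urllib/3.x when the request is actually sent, which is exactly the signature that UA-checking sites reject.

# Compare the UA of a bare Request with the one carrying our custom header
req_default = urllib.request.Request('https://www.baidu.com')
req_custom = urllib.request.Request('https://www.baidu.com', headers=header)
print(req_default.get_header('User-agent'))  # None: urllib adds 'Python-urllib/3.x' only at send time
print(req_custom.get_header('User-agent'))   # the Chrome UA string from the header dict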
parse
Sometimes a request needs to carry parameters. These must be percent-encoded (as the UTF-8 bytes of each character) before being appended to the URL, otherwise a UnicodeEncodeError is raised.
Encoding a single parameter
import urllib.request
import urllib.parse

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
# Percent-encode the parameter, then append it to the url
url = 'https://www.baidu.com/s?wd=' + urllib.parse.quote('博客园')
request = urllib.request.Request(url=url, headers=header)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
Encoding multiple parameters
import urllib.request
import urllib.parse

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
url = 'https://www.baidu.com/s?'
data = {
    'wd': '小明',
    'sex': '男'
}
# Percent-encode the parameters, then append them to the url
url = url + urllib.parse.urlencode(data)
request = urllib.request.Request(url=url, headers=header)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
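For reference, here is what the two helpers actually produce; the percent escapes are simply the UTF-8 bytes of each character:

print(urllib.parse.quote('博客园'))
# %E5%8D%9A%E5%AE%A2%E5%9B%AD
print(urllib.parse.urlencode({'wd': '小明', 'sex': '男'}))
# wd=%E5%B0%8F%E6%98%8E&sex=%E7%94%B7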
POST requests
import urllib.request
import urllib.parse
import json

url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
# Baidu Translate has anti-scraping checks and requires a Cookie, otherwise an error is returned
# (if all the parameters look correct but the request still fails, check for missing request headers)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Cookie': 'BIDUPSID=FE844A67B2FF56C5ADABC85593863D5C; PSTM=1667189167; BAIDUID=6D23EF3A381CB93BF5C068B9D4FDEA14:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1672211305; BDSFRCVID=avtOJeC627YLYa3jGtkwhhHrzKoEzoJTH6ao5T1ROv0CoasKvifEEG0PXU8g0KFM3JSsogKK3gOTH4DF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF=tbPqVCIbJK03j48kb-o5Mt-HenryqURZ5mAqofcMfnnmqRFCyp6Ny6K7QP5qL4rp3JcnaIQqabb-ShRuDx7J04rLb-RNQt543bRTbRLy5KJvEq6KqtTBhP-UyNbMWh37JgnlMKoaMp78jR093JO4y4Ldj4oxJpOJ5JbMonLafD-KMCP9jT8-en8_bU_XKPQe2CvX0n5b2R3_KROkepnKQM4pbt-qJtr75gTW_bFKWMoqOCQ2QU5-MMC9bxnnBT5ht23x3tTlBbbhh-oj2fRS358kQN3T3qLO5bRiLl6KLtjkDn3oyTbJXp0nj-Oly5jtMgOBBJ0yQ4b4OR5JjxonDh83bG7MJPKtfJPH_Ct2JDPKbn6vb-Qoq4tSeUJBWURZ5mAqotnp2ROlfRFCyPvVy6K7QP5la-od2eQnaIQqaKodhxTzKxT-QMDLb4_O-lv43bRT-hCy5KJvfq6CKU7khP-UyNbMWh37JgnlMKoaMp78jR093JO4y4Ldj4oxJpOJ5JbMonLafD8WhCI9D6tben-W5gTKbPcha5-X3buQbRcO8pcNLTDKetAuLG59afbJ25T3Bpn-Mq6BDn5GXpO1j4_eWlOvBCrBQeOfXq6cbJTtsq5jDh3B25ksD-RlW6JZL57y0hvctb3cShPmQMjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2b6QhDNAHq6kHJbksL-35HJj2jJQv2KTjh6P_5lO9BtQmJJufhlRwKCO8sPbc5P7sybDhyUn4htRZQg-q3RAaQIjC8PLxLPnHKUPBMfj40x-jLNbPVn0MWhQYhpcbKPnJyUnybPnnBT3R3H8HL4nv2JcJbM5m3x6qLTKkQN3T-PKO5bRh_CcJ-J8XhC-mjjQP; __bid_n=184f128997e3755a704207; FEID=v10-08ddd07c15dd88d197f4e7f8bfcaf7b5e9fb30de; __xaf_fpstarttimer__=1672304282882; __xaf_ths__={"data":{"0":1,"1":43200,"2":60},"id":"8e46496e-1a9f-4f9a-a78a-1d07b987c7c0"}; __xaf_thstime__=1672304282925; FPTOKEN=fDAYMZZw8zbLcUKBWuajW+qKbwAEUqx1CeV6NzuPMFFJp7/4sKoA4c+U51hYM1V66Ml3sIP14RBbug7OHtV/iiVK8IWdYZOqUrIhUvOvE+Ors70OxBA2HrZgQRDzfZHA+S9z1qHX2Bw14n/HybBABbuf1xhrKQ+HATw4f2Ysll8M05xMQq+smOVrnPXbQ0JSaC4GXTv5JUVnbj/VKnsoV51ZEcXTzYW3iH/djONTjjY0swy5IiaxI3dHvDViOhXKDOh1E1/vQg1vPsH9mHSML62TSw+bcc97em8PkbRmPA7mYBbPDlOMlbp1b3eL67aUDSTUm358wSSbmY9HUsXYXw5MSUeAJr/vRqzmMQVeUlMk7Qk+/YgePe0VfqhsGEj/MTPeSl6/n3WSuelrfQA1sQ==|yf3XNK7EdJvOYIq7WLxJ/XrzCOpvI8lp/P0XHJkwhgE=|10|e474f348bc2b35ee7ee8b88336293351; __xaf_fptokentimer__=1672304283063; H_PS_PSSID=36558_37647_37689_37625_36920_36802_37930_37900_26350_37959_37881; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1672319616; '
              'ab_sr=1.0.1_NTk0YTRhZjliYzE3MDczYWQ5ZjczNzY0ZDEwYzQxOWExN2ZjNjAyNTViN2I3ODI1M2Y3MzE3ZDNjMjY2MDNjZDQ3YzVjZTZiZDI2NjU1MWY5NmQ1M2Y3OGQwYTM3MjFlMzUzNDBlZjBmMGI0Y2E2M2QzNDY2NGRlNjI4MDM3ZWQyOGUyNTZmMDljNWIyMzUzNDQyMDc3Y2EzZjAzODYzOQ=='
}
data = {
    'from': 'zh',
    'to': 'en',
    'query': '朋友',
    'transtype': 'tanslang',
    'simple_means_flag': 3,
    'sign': 646578.867971,
    'token': '69f8f800722d4702e68dc07ddecc2054',
    'domain': 'common'
}
# Percent-encode the parameters, then encode the result as utf-8 bytes
data_encode = urllib.parse.urlencode(data).encode('utf-8')
# POST parameters must be passed via the data argument of Request
request = urllib.request.Request(url=url, data=data_encode, headers=header)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# Parse the JSON string into a Python object
obj_content = json.loads(content)
# Serialize the Python object back to a JSON string; ensure_ascii=False keeps non-ASCII characters unescaped
json_content = json.dumps(obj_content, ensure_ascii=False)
print(json_content)
If the data argument is set on a Request, urlopen sends a POST by default. If you set data but explicitly pass method='GET', the request goes out as a GET; the server will generally ignore the body of a GET, so the data is effectively not submitted.
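This default is easy to verify with Request.get_method() (a minimal sketch; the byte payload is just a placeholder):

req = urllib.request.Request('http://www.baidu.com', data=b'wd=test')
print(req.get_method())  # 'POST': a data payload switches the default method
req = urllib.request.Request('http://www.baidu.com', data=b'wd=test', method='GET')
print(req.get_method())  # 'GET': an explicit method overrides the default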
Hands-on examples
Scraping a given page range of the Douban movie chart (GET request)
import urllib.request
import urllib.parse

# Build the request for a given page
def get_request(page_no):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }
    url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100:90&action=&'
    param = {
        'start': (page_no - 1) * 20,
        'limit': 20
    }
    url = url + urllib.parse.urlencode(param)
    return urllib.request.Request(url=url, headers=headers)

if __name__ == '__main__':
    page_start = int(input('Enter the start page: '))
    page_end = int(input('Enter the end page: '))
    # range usage: https://blog.csdn.net/TUSTer_/article/details/122280110
    for page_no in range(page_start, page_end + 1):
        request = get_request(page_no)
        # The response body is JSON
        content = urllib.request.urlopen(request).read().decode('utf-8')
        '''
        open returns a file object (open defaults to the platform encoding, e.g. gbk on Chinese
        Windows, so specify utf-8 explicitly to save Chinese text safely)
        Reference: https://blog.csdn.net/a379749/article/details/123994571
        '''
        file = open('movie-douban-' + str(page_no) + '.json', 'w', encoding='utf-8')
        file.write(content)
        file.close()
The file-writing code above can also be written like this:
with open('movie-douban-' + str(page_no) + '.json', 'w', encoding='utf-8') as file:
    file.write(content)
# No explicit close() is needed: the with statement closes the file automatically
Scraping a given page range of KFC restaurant addresses (POST request)
import urllib.request
import urllib.parse

# Build the request for a given page
def get_request(page_no):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    param = {
        'cname': '',
        'pid': '',
        'keyword': '广州',
        'pageIndex': page_no,
        'pageSize': 10
    }
    data = urllib.parse.urlencode(param).encode('utf-8')
    return urllib.request.Request(url=url, data=data, headers=headers)

if __name__ == '__main__':
    page_start = int(input('Enter the start page: '))
    page_end = int(input('Enter the end page: '))
    for page_no in range(page_start, page_end + 1):
        request = get_request(page_no)
        # The response body is JSON
        content = urllib.request.urlopen(request).read().decode('utf-8')
        with open('address-kdj-' + str(page_no) + '.json', 'w', encoding='utf-8') as file:
            file.write(content)
Some pages can only be accessed after logging in; in that case the request headers must carry the login Cookie to get the page data. A note on the Referer header: it tells the server which page the request came from, and is commonly checked to prevent hotlinking of images.
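A minimal sketch of such a request (the URL, cookie value, and referer here are placeholders, not working values):

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    # Copied from the browser after logging in (placeholder value)
    'Cookie': 'sessionid=xxxxxxxx',
    # Some servers check this header to reject requests that did not come from their own pages
    'Referer': 'https://example.com/some-page'
}
request = urllib.request.Request(url='https://example.com/user/profile', headers=header)
content = urllib.request.urlopen(request).read().decode('utf-8')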
Handler
Handlers let you customize requests at a higher level. As business logic grows more complex, a plain Request is no longer enough, for example with dynamic cookies and proxies.
Basic usage of a Handler: fetching the Baidu page source through a handler
import urllib.request

url = 'http://www.baidu.com'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=header)
# Create an HTTPHandler
handler = urllib.request.HTTPHandler()
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
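If you want every subsequent urllib.request.urlopen call to go through the same handler chain, the opener can also be installed globally with install_opener (standard urllib API, shown here as an optional extra step):

# Make this opener the default used by urllib.request.urlopen
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)  # now routed through our HTTPHandler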
Accessing a page through a proxy, e.g.:
import urllib.request

url = 'http://www.baidu.com/s?wd=ip'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
proxies = {'http': '117.141.155.244:53281'}
request = urllib.request.Request(url=url, headers=header)
# Create a ProxyHandler with the proxy mapping
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
Using a pool of proxy IPs, e.g.:
import random

proxies_pool = [
    {'http': '117.141.155.244:53281'},
    {'http': '117.141.155.244:53281'}
]
# Pick a random proxy from the pool
proxies = random.choice(proxies_pool)
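To actually send a request through the randomly chosen proxy, plug it into a ProxyHandler exactly as in the previous example (a sketch that reuses the url, header, and request variables defined above):

handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
content = opener.open(request).read().decode('utf-8')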