urllib基本使用
urllib是Python标准库的一部分，无需额外下载安装
# Fetch the Baidu homepage source with urllib (part of the standard library).
import urllib.request

# 1. The target URL.
url = 'http://www.baidu.com'

# 2. Send the request to the server, like a browser would.
response = urllib.request.urlopen(url)

# 3. read() returns the body as raw bytes; decode to get text.
content = response.read().decode('utf-8')

print(content)

# Notes on the response object (http.client.HTTPResponse):
#   response.read()       -> read the whole body (read(5): first 5 bytes)
#   response.readline()   -> read a single line
#   response.readlines()  -> read line by line until exhausted
#   response.getcode()    -> HTTP status code
#   response.geturl()     -> the URL that was requested
#   response.getheaders() -> the response headers
urllib下载
import urllib.request

# Download a web page to a local file.
# urlretrieve(url, filename): url is the source, filename the local target.
url_page = 'http://www.baidu.com'
urllib.request.urlretrieve(url_page, 'baidu.html')

# Downloading an image or a video works exactly the same way.
定制对象
import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL:
#   protocol   host           port    path  params  anchor(#)
#   http/https www.baidu.com  80/443
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
}

# Wrap URL and headers in a Request object so the server sees a browser UA.
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
编解码
get请求
import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='

# Percent-encode a single value for use in a query string.
name = urllib.parse.quote("周杰伦")

# With several parameters (e.g. ...?wd=周杰伦&sex=男), encode a whole
# dict at once; urlencode produces the joined key=value&key=value form.
data = {
    'wd': '周杰伦',
    'sex': '男',
}
params = urllib.parse.urlencode(data)
print(params)
post请求
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
}
data = {
    'kw': '蜘蛛'
}
# POST parameters must be urlencoded AND converted to bytes.
data = urllib.parse.urlencode(data).encode("utf-8")

# Supplying data= turns the Request into a POST.
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
cookie
import urllib.request
import urllib.parse
import json

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Full browser headers captured from a real session. The Cookie header (plus
# the sign/token fields in the payload) is what lets this anti-spider
# endpoint answer; without it the server rejects the request.
headers = {
    "Accept": "*/*",
    # "Accept-Encoding": "gzip, deflate, br",  # omitted so the body comes back uncompressed
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Connection": "keep-alive",
    "Content-Length": "136",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "BAIDUID=5157A4D583A89C37A7545DADAB81726C:FG=1; BIDUPSID=36874D3B080FE4D3FE00F130757B1DCB; PSTM=1618275809; __yjs_duid=1_5c8c0adae28380c2c87efcd8b022c3551618281414735; MCITY=-289%3A179%3A; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1648619861,1649214103,1649658988,1649730956; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; APPGUIDE_10_0_2=1; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1649730956; ab_sr=1.0.1_MWVlMDU1ZjZmYWEwNzE5N2UxMzJmMjkwZWQ4Y2U5ZTU3NWQ2NzE1M2YxYWYxYzc3M2Y5NWM5MzQ5YzU2YTRkMTZkNTUyYzQzOWViMjJkMzdiNGQxZjAyNTYxYmRlN2Q5MTcxODg4NDFjYWMxM2I0ZThjZGZmM2YxNTNmNGJlYzNkZDczODgxMzAzMTM1NTlhYTk3ZmYxZGY2ODBkZTMzMw==",
    "Host": "fanyi.baidu.com",
    "Origin": "https://fanyi.baidu.com",
    "Referer": "https://fanyi.baidu.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0",
    "X-Requested-With": "XMLHttpRequest",
}

# Form payload; sign/token are session-bound values copied from the browser.
data = {
    "from": "en",
    "to": "zh",
    "query": "spider",
    "transtype": "realtime",
    "simple_means_flag": "3",
    "sign": "63766.268839",
    "token": "c45b7821850766d1e62222dc6115e145",
    "domain": "common",
}

# POST body must be urlencoded bytes.
data = urllib.parse.urlencode(data).encode("utf-8")

request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)

# The endpoint returns JSON; parse it into a dict.
print(json.loads(content))