urllib: basic usage & GET & POST requests

urllib_1: basic usage

"""
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 20:54.
@Author: haifei
"""
import time
import urllib.request

if __name__ == '__main__':
    start = time.time()

    # 1. Define the target URL
    url = 'http://www.baidu.com'
    # 2. Simulate a browser sending a request to the server
    response = urllib.request.urlopen(url)
    # 3. Get the page source from the response
    content = response.read()  # .read() returns the body as raw bytes
    # print(content)
    # 4. Decode: bytes -> str
    content = content.decode('utf-8')
    print(content)

    print('It takes', time.time() - start, "seconds.")
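
As a side note on step 4: the script hardcodes utf-8. Below is a minimal sketch (not part of the original script) that asks the response for the charset declared in Content-Type and only falls back to utf-8 when none is declared:

import urllib.request

url = 'http://www.baidu.com'
with urllib.request.urlopen(url) as response:  # context manager closes the connection
    raw = response.read()  # bytes
    charset = response.headers.get_content_charset() or 'utf-8'  # declared charset, utf-8 as fallback
    html = raw.decode(charset)
print(html[:200])  # only show the first 200 characters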

urllib_2: one type and six methods

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 21:01.
@Author: haifei
"""
import time
from urllib import request

url = "http://irun2u.top"
response = request.urlopen(url)
# content = response.read().decode("utf-8")
# print(content)

# One type
print(type(response))  # <class 'http.client.HTTPResponse'>

# Six methods: .read([n bytes])  .readline()  .readlines()  .getcode()  .geturl()  .getheaders()

# The read methods all return bytes (note: the response stream is consumed as it is read, so the reads below cannot all be run on the same response)
# print(response.read(5))  # read 5 bytes
# print(response.readline())  # read one line
# print(response.readlines())  # read line by line until the end

print(response.getcode())  # status code: 200 -> OK
print(response.geturl())  # the URL that was actually requested: http://irun2u.top
print(response.getheaders())  # the response headers

if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
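
For reference, the six methods also have attribute-style counterparts, and the response object can be read line by line. A minimal sketch against the same URL (assuming the site is reachable):

from urllib import request

with request.urlopen('http://irun2u.top') as resp:
    print(resp.status)   # same value as resp.getcode()
    print(resp.url)      # same value as resp.geturl()
    print(resp.headers)  # same information as resp.getheaders(), as a message object
    for line in resp:    # the response is iterable, yielding one bytes line at a time
        print(line.decode('utf-8', errors='replace'))
        break            # only show the first line here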

urllib_3: downloading files

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 21:16.
@Author: haifei
"""
import time
from urllib import request

# Download a web page
url_page = 'http://irun2u.top'
request.urlretrieve(url_page, './download/irun2utop.html')

# Download an image
url_img = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fsafe-img.xhscdn.com%2Fbw1%2F91239c50-d064-4ec1-b998-1e5f979c9c46%3FimageView2%2F2%2Fw%2F1080%2Fformat%2Fjpg&refer=http%3A%2F%2Fsafe-img.xhscdn.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1682170811&t=53fd80c95575efcc38e04269a4addf3f'
request.urlretrieve(url=url_img, filename='./download/lisa.jpg')

# Download a video
url_video = 'https://vd4.bdstatic.com/mda-kg0pcztgi0rucsza/v1-cae/sc/mda-kg0pcztgi0rucsza.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1679580855-0-0-293c71bb38a72b92a305768a159a1da1&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=2655222603&vid=10392909521055706475&abtest=107353_1&klogid=2655222603'
request.urlretrieve(url_video, './download/lisa.mp4')


if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
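
urlretrieve also accepts a reporthook callback for progress, and the same download can be done with urlopen plus shutil (urlretrieve is documented as a legacy interface). A minimal sketch, assuming the ./download directory already exists as in the script above:

import shutil
from urllib import request

def report(block_num, block_size, total_size):
    # called periodically by urlretrieve; total_size may be -1 when the server does not report it
    if total_size > 0:
        done = min(block_num * block_size, total_size)
        print(f'{done}/{total_size} bytes', end='\r')

request.urlretrieve('http://irun2u.top', './download/irun2utop.html', reporthook=report)

# alternative without urlretrieve: stream the body straight into a file
with request.urlopen('http://irun2u.top') as resp, open('./download/irun2utop2.html', 'wb') as f:
    shutil.copyfileobj(resp, f)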

urllib_4: customizing the request object

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 21:52.
@Author: haifei
"""
import time
from urllib import request

# HTTP: port 80
# HTTPS = HTTP + SSL: port 443

url = 'https://www.baidu.com'
response = request.urlopen(url)
content = response.read().decode('utf-8')
print(content)  # far less content than http://www.baidu.com returns; the cause is anti-scraping


'''
UA-based anti-scraping
About UA: the User Agent (UA for short) is a special header string that lets the server identify the client's
operating system and version, CPU type, browser and version, browser engine, rendering engine, language, plugins, and so on.
UA reference list: https://blog.csdn.net/Uridis/article/details/86558811
'''
# Customize the request object
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
request2 = request.Request(url=url, headers=headers)
response2 = request.urlopen(request2)
content2 = response2.read().decode('utf-8')
print(content2)  # OK, the content now matches what http://www.baidu.com returns


if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
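
Instead of building a Request for every call, a global opener can carry the User-Agent for all subsequent urlopen calls. A minimal sketch with the same UA string (one option, not the only way):

from urllib import request

ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36')
opener = request.build_opener()
opener.addheaders = [('User-Agent', ua)]  # replaces the default Python-urllib UA
request.install_opener(opener)

# plain urlopen now sends the custom UA as well
content3 = request.urlopen('https://www.baidu.com').read().decode('utf-8')
print(len(content3))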

urllib_5: the quote method for GET requests

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 22:13.
@Author: haifei
"""
import time
from urllib import request, parse


# https://www.baidu.com/s?ie=UTF-8&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
# url = 'https://www.baidu.com/s?ie=UTF-8&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6'  # the %xx part is the percent-encoded (URL-encoded) form of the UTF-8 bytes
name = parse.quote('周杰伦')  # percent-encode '周杰伦', same as the encoded URL above
print(name)  # %E5%91%A8%E6%9D%B0%E4%BC%A6
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + name  # equivalent to the URL above
print(url)



headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# Customizing the request object is how we get past UA-based anti-scraping
request2 = request.Request(url=url, headers=headers)
response = request.urlopen(request2)
content = response.read().decode('utf-8')
print(content)



if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
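
quote has a counterpart, unquote, and a safe parameter that controls which reserved characters stay unescaped. A minimal sketch:

from urllib import parse

encoded = parse.quote('周杰伦')
print(encoded)                 # %E5%91%A8%E6%9D%B0%E4%BC%A6
print(parse.unquote(encoded))  # 周杰伦, decoding restores the original text

# '/' is left alone by default; pass safe='' to escape it as well
print(parse.quote('a/b c'))           # a/b%20c
print(parse.quote('a/b c', safe=''))  # a%2Fb%20c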

urllib_6: the urlencode method for GET requests

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/23 22:59.
@Author: haifei
"""
import time
from urllib import parse, request


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# Use case for urlencode: multiple query parameters
# https://www.baidu.com/s?ie=UTF-8&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7
# i.e. https://www.baidu.com/s?ie=UTF-8&wd=周杰伦&sex=男
url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + parse.quote('周杰伦') + '&sex=' + parse.quote('男')
print('url: ' + url)

data = {
    'wd': '周杰伦',
    'sex': '男'
}
url2 = 'https://www.baidu.com/s?ie=UTF-8&' + parse.urlencode(data)
print('url2: ' + url2)  # same as url above


base_url = 'https://www.baidu.com/s?'
base_data = {
    'wd': 'Lisa',
    'sex': '女',
    'location': '南韩'
}
new_data = parse.urlencode(base_data)
new_url = base_url + new_data
print(new_url)
request2 = request.Request(url=new_url, headers=headers)  # customize the request object to get past UA anti-scraping
content = request.urlopen(request2).read().decode('utf-8')
print(content)


if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
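
urlencode can also handle repeated parameters via doseq=True, and parse_qs reverses the encoding. A minimal sketch (the tag parameter here is only an illustration, not a real Baidu parameter):

from urllib import parse

query = parse.urlencode({'wd': 'Lisa', 'tag': ['singer', 'dancer']}, doseq=True)
print(query)                  # wd=Lisa&tag=singer&tag=dancer
print(parse.parse_qs(query))  # {'wd': ['Lisa'], 'tag': ['singer', 'dancer']}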

urllib_7: POST request to Baidu Translate (simple translation)

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# Open Baidu Translate in Chrome and press F12: the Network tab shows the sug endpoint (simple translation)
url = "https://fanyi.baidu.com/sug"

data = {
    'kw': 'spider'
}

# POST parameters must first be URL-encoded with urlencode(), and the result must then be converted to bytes with .encode('utf-8')
data = parse.urlencode(data).encode('utf-8')
print(data)  # b'kw=spider'

# Customize the request object
# POST parameters are not appended to the URL after a ?; they go into the data argument of the Request object below
_request = request.Request(url=url, data=data, headers=headers)
print(_request)  # <urllib.request.Request object at 0x102ac6250>

# Simulate a browser sending a POST request to the server
response = request.urlopen(_request)
print(response)  # <http.client.HTTPResponse object at 0x10121a5b0>

# Get the response data
content = response.read().decode("utf-8")
print(content)
print(type(content))  # <class 'str'>

# Two ways to turn the string into a dict:
# 1. dic = eval(str)  (works here, but eval on untrusted input is unsafe; json.loads below is preferred)
dic_content = eval(content)
print(type(dic_content))  # <class 'dict'>
data = dic_content.get('data')[0]
print(data)  # {'k': 'spider', 'v': 'n. 蜘蛛; 星形轮,十字叉; 带柄三脚平底锅; 三脚架'}
print(type(data))  # <class 'dict'>
print(data.get('v'))  # n. 蜘蛛; 星形轮,十字叉; 带柄三脚平底锅; 三脚架

# 2. Parse the JSON string with json.loads (also yields a dict)
json_content = json.loads(content)
print(json_content)  # same as dic_content
print(type(json_content))  # <class 'dict'>

if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
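
Since the sug response parsed above is a dict whose 'data' key holds a list of {'k': ..., 'v': ...} entries, here is a compact end-to-end sketch that prints every suggestion rather than only the first one (assuming the endpoint keeps this shape):

import json
from urllib import request, parse

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
payload = parse.urlencode({'kw': 'spider'}).encode('utf-8')
req = request.Request('https://fanyi.baidu.com/sug', data=payload, headers=headers)
result = json.loads(request.urlopen(req).read().decode('utf-8'))
for item in result.get('data', []):
    print(item.get('k'), '->', item.get('v'))  # every suggestion, not just the first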

urllib_8: POST request to Baidu Translate (detailed translation)

"""                                                           
                                       .-''-.               
.--.                    _..._        .' .-.  )              
|__|                  .'     '.     / .'  / /               
.--..-,.--.          .   .-.   .   (_/   / /                
|  ||  .-. |         |  '   '  |        / /                 
|  || |  | | _    _  |  |   |  |       / /         _    _   
|  || |  | || '  / | |  |   |  |      . '         | '  / |  
|  || |  '-.' | .' | |  |   |  |     / /    _.-').' | .' |  
|__|| |    /  | /  | |  |   |  |   .' '  _.'.-'' /  | /  |  
    | |   |   `'.  | |  |   |  |  /  /.-'_.'    |   `'.  |  
    |_|   '   .'|  '/|  |   |  | /    _.'       '   .'|  '/ 
           `-'  `--' '--'   '--'( _.-'           `-'  `--'  
Created on 2023/3/24 23:02.
@Author: haifei
"""
import json
import time
from urllib import request, parse

# F12: the Request Headers shown for this endpoint are the request headers to copy
# Note: the Accept-Encoding line must be commented out; urllib does not decompress gzip/br responses, so decoding the compressed body as utf-8 would fail
headers = {
    'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Acs-Token': '1679672667898_1679672667576_SlMSHXMJiE5lO9O3mCbWXoLpxMKuOCuCrmVe6FIg/IKZBgeYHKHsmtqdpt/0wzm4lRYtqHhwdh5bF9qEEols1QlVyi8FUOJsMsWtaiq3LlPe4Bg3rUMLI26ka8WrCqkw4jVHdLC+W6gtaUPft3vRHGatTpVwSwiI1qNsvjl+N7fs0qf1mF//0C3ea6IoZ4/nE1uWLWTzqHkt0TIw/FJlHUt7oNn+5fyrKP1nUBSKU00xpi+awI/Zsv7tlLLNyxrt0+ePrjepVLzrK9kEHr9zNU2Cpqox3Kc88rMb61Vuc8+YJWV4FVvyQZ1+6wQ7aPd+QuAx0RyEXTqU1YoVXFVKbeZviLGgI1POh9075YP89vo=',
    'Connection': 'keep-alive',
    'Content-Length': '116',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=B9B52F7273A4D0A02F4224DF0FE584E9; PSTM=1644560257; ZFY=WJR0yuV2wnPtrVSkigGW9zh6r:BS3wlaNLebcRmDOrT4:C; BAIDUID=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; BAIDUID_BFESS=131FA2C2E20EDCC5307B724B1B8D1609:FG=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1679671060; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_NGRlOTg4M2IyMmZjMDhkNWQzYWQ2N2EzZmIxYzY3YzVhNTE4YTZmNGNjZTZiZTU4NTQ1ZThhYWNlNjU5Y2YyYWZmZDMyZTAwYjUxMzJjMWExMjVkYzQyZmU4MzVhN2JiZDVkNDBhMjEzYzJmNjZkMTJkODg4ZWNmNGY5YjNlMGRlMWM5NGU0NjE4ZDJiOTc2YTQzNDk5ZTBmYmI4NWU0NQ==; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1679672667',
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/',
    'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
# In practice everything in the headers above except Cookie can be commented out; the Cookie is the key to getting past the anti-scraping check

# Open Baidu Translate in Chrome and press F12: the Network tab shows the v2transapi endpoint (detailed translation)
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"

# F12: the Form Data under the Payload tab of this endpoint gives the required parameters
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'girl',
    'simple_means_flag': '3',
    'sign': '780982.985479',
    'token': '9d0251d64cfa1d98e5aab063d19cd487',
    'domain': 'common',
}

# POST parameters must be urlencoded and then encoded to utf-8 bytes
data = parse.urlencode(data).encode('utf-8')

# Customize the request object
_request = request.Request(url=url, data=data, headers=headers)

# Simulate a browser sending a POST request to the server
response = request.urlopen(_request)
content = response.read().decode('utf-8')
print(content)  # str
print(json.loads(content))  # dict


if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
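
The reason Accept-Encoding had to be commented out is that urllib does not decompress response bodies. A minimal sketch of the alternative: keep requesting gzip and decompress manually (assuming the server actually answers with gzip; if it does not, the if branch is simply skipped):

import gzip
from urllib import request

req = request.Request('https://fanyi.baidu.com/',
                      headers={'User-Agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip'})
resp = request.urlopen(req)
raw = resp.read()
if resp.headers.get('Content-Encoding') == 'gzip':
    raw = gzip.decompress(raw)  # undo the compression before decoding
print(raw.decode('utf-8')[:200])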


Reference video: https://www.bilibili.com/video/BV1Db4y1m7Ho
