The Python requests module
1. Introduction
Overview: requests lets you simulate browser requests. Compared with the urllib module used previously, the requests API is far more convenient (under the hood it simply wraps urllib3).
Note: requests only downloads the page content; it does not execute JavaScript. For JS-rendered data you have to analyze the target site yourself and issue additional requests against the underlying endpoints.
Installation: pip3 install requests
2. GET requests
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
}
url = 'https://www.baidu.com'
# Query-string parameters; requests URL-encodes them and appends them to the URL
params = {
    'query': 'xxx'
}
# Cookies can be passed as a plain dict
cookies = {'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc'}
response = requests.get(url=url, params=params, headers=headers, cookies=cookies)
print(response.text)
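Because params is merged into the URL, you can confirm exactly what was sent by inspecting response.url. A minimal sketch (the httpbin.org echo service is used here purely for illustration):
import requests

response = requests.get('https://httpbin.org/get', params={'query': 'xxx', 'page': 2})
# requests URL-encodes the dict and appends it as a query string
print(response.url)  # https://httpbin.org/get?query=xxx&page=2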
3. POST requests
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
}
url = 'https://www.baidu.com'
# Form fields; requests sends them form-encoded in the request body
data = {
    'query': 'xxx'
}
cookies = {'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc'}
response = requests.post(url=url, data=data, headers=headers, cookies=cookies)
print(response.text)
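The data= argument produces an application/x-www-form-urlencoded body. Many APIs expect JSON instead; for those, pass json= and requests serializes the dict and sets the Content-Type header for you. A minimal sketch against the httpbin echo service:
import requests

# json= serializes the dict to a JSON body and sets Content-Type: application/json
response = requests.post('https://httpbin.org/post', json={'query': 'xxx'})
print(response.json()['json'])  # {'query': 'xxx'}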
4. Response attributes
import requests

response = requests.get('http://www.jianshu.com')
# Common response attributes
print(response.text)                # body decoded as text, using response.encoding
print(response.content)             # raw body as bytes
print(response.status_code)         # HTTP status code
print(response.headers)             # response headers
print(response.cookies)             # cookies as a RequestsCookieJar
print(response.cookies.get_dict())  # cookies as a plain dict
print(response.cookies.items())     # cookies as a list of (name, value) pairs
print(response.url)                 # final URL after any redirects
print(response.history)             # list of intermediate redirect responses
print(response.encoding)            # encoding used to decode response.text
print(response.json())              # parse body as JSON (raises an error if the body is not JSON)
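Before trusting the body it is often worth failing fast on HTTP errors: raise_for_status() raises an HTTPError for any 4xx/5xx status code. A minimal sketch:
import requests

response = requests.get('https://httpbin.org/status/404')
try:
    response.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
except requests.HTTPError as e:
    print(f'request failed: {e}')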
5. Handling cookies
Sessions handle cookies automatically:
1. Create an empty Session object.
2. Make a request with the Session object; it captures the cookies set by the server and stores them on the session.
3. Use the cookie-carrying Session object to request the target URL; the stored cookies are sent along automatically.
import requests

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    url = 'https://xueqiu.com/statuses/hot/listV2.json'
    params = {
        'since_id': '-1',
        'max_id': '460965',
        'size': '15',
    }
    # Create an empty Session object
    session = requests.Session()
    # Hit the home page first through the session to pick up the server-set cookies
    session.get(url='https://xueqiu.com/', headers=headers)
    # The session now carries those cookies; use it to request the target API
    response = session.get(url=url, headers=headers, params=params)
    response.encoding = 'utf-8'
    for item in response.json().get('items'):
        user = item['original_status']['user']['screen_name']
        content = item['original_status']['description']  # the post's summary text
        print(f'{user}:{content}')
Note: the Session object must make at least two requests: one to acquire the cookies and one to fetch the target data.
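If you already have a valid cookie (for example, copied from the browser after logging in), you can also seed the session by hand instead of making that first request. A minimal sketch; the cookie name and value are placeholders:
import requests

session = requests.Session()
# Seed the session manually; 'user_session' and its value stand in for a real cookie
session.cookies.set('user_session', 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc')
# Every subsequent request made through this session sends that cookie
response = session.get('https://xueqiu.com/')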
6. Using proxies
import random

import requests
from lxml import etree

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    # API that hands out proxy IPs
    proxy_url = 'http://webapi.http.zhimacangku.com/getip'
    proxy_json = requests.get(url=proxy_url, headers=headers).json()
    # Build a proxy pool in the format {'http'/'https': 'ip:port'}
    proxy_pool = []
    for proxy in proxy_json['data']:
        proxy_ip = proxy['ip']
        proxy_port = proxy['port']
        proxy_dic = {
            'https': f'{proxy_ip}:{proxy_port}'
        }
        proxy_pool.append(proxy_dic)
    while True:
        url = 'https://www.kuaidaili.com/free/inha/1/'
        # Use random.choice to pick a different proxy from the pool on each request
        response = requests.get(url=url, headers=headers, proxies=random.choice(proxy_pool))
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        tr_list = tree.xpath('//*[@id="list"]/table//tr[position()>1]')
        for tr in tr_list:
            address = tr.xpath('./td[5]/text()')[0]
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            print(f'{address} {ip}:{port}')
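Free and rotating proxies fail often, so it pays to catch proxy errors and fall back to another entry in the pool. A minimal sketch, assuming proxy_pool is built as above:
import random

import requests

def get_with_retry(url, headers, proxy_pool, retries=3):
    # Try up to `retries` random proxies before giving up
    for _ in range(retries):
        proxy = random.choice(proxy_pool)
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=5)
        except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
            continue  # this proxy is dead or slow; try another one
    raise RuntimeError('all proxy attempts failed')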
7. Encoding issues
import requests

response = requests.get('http://www.autohome.com/news')
# Autohome serves its pages encoded as GB2312, while requests falls back to
# ISO-8859-1 when no charset is declared, so Chinese text is garbled unless
# the encoding is set to gbk before reading response.text
response.encoding = 'gbk'
print(response.text)
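When you do not know a page's encoding in advance, requests can guess it from the body itself: apparent_encoding runs a character-set detector over the raw bytes. A minimal sketch:
import requests

response = requests.get('http://www.autohome.com/news')
# apparent_encoding detects the charset from the response body
response.encoding = response.apparent_encoding
print(response.text)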
8. Examples
(1) Simulated login to gushiwen.cn
import requests
from lxml import etree
# Local helper that submits the captcha image to a recognition service
from verification_code import verification_img_code

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    url = 'https://so.gushiwen.cn/user/login.aspx'
    # Create an empty Session object so the server-set cookies are kept
    session = requests.Session()
    # The first request captures the cookies returned by the server
    main_response = session.get(url=url, headers=headers)
    main_response.encoding = 'utf-8'
    tree = etree.HTML(main_response.text)
    # The hidden __VIEWSTATE form field changes per request, so scrape it dynamically
    __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    # Download the captcha image through the same session so it matches our cookies
    img_code_url = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    img_code_content = session.get(url=img_code_url, headers=headers).content
    with open('./img_code.jpg', 'wb') as fp:
        fp.write(img_code_content)
    # Recognize the captcha via the helper; the second argument is the captcha type code the service expects
    img_code = verification_img_code('./img_code.jpg', 3)
    data = {
        '__VIEWSTATE': __VIEWSTATE,
        '__VIEWSTATEGENERATOR': 'C93BE1AE',
        'from': '',
        'email': '*******',
        'pwd': '*********',
        'code': img_code,
        'denglu': '登录'
    }
    # Send the login POST request through the same session
    login_response = session.post(url=url, data=data, headers=headers)
    with open('./test.html', 'w', encoding='utf-8') as fp:
        fp.write(login_response.text)
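Saving test.html still does not tell you whether the login worked. A rough check, assuming the site redirects away from login.aspx on success:
# Heuristic success check: a successful login usually redirects off the login page
if 'login.aspx' not in login_response.url:
    print('login appears to have succeeded')
else:
    print('login failed; check the captcha or credentials')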
(2) Scraping hotlink-protected images
import os

import requests
from lxml import etree

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        # Sending a Referer that matches the page defeats the hotlink protection
        'Referer': 'https://blog.sina.com.cn/s/blog_19d48b90a01030dym.html?tj=1'
    }
    url = 'https://blog.sina.com.cn/s/blog_19d48b90a01030dym.html?tj=1'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    div_list = tree.xpath('//*[@id="sina_keyword_ad_area2"]/div')
    # Build the list of real image download URLs (stored in the real_src attribute)
    real_link_list = []
    for div in div_list:
        img_link = div.xpath('./a/img/@real_src')
        real_link_list.extend(img_link)
    # Make sure the output directory exists before writing into it
    os.makedirs('./sina_img', exist_ok=True)
    for i, img in enumerate(real_link_list):
        img_content = requests.get(url=img, headers=headers).content
        with open(f'./sina_img/{i}.jpg', 'wb') as fp:
            fp.write(img_content)
        print(f'{i}.jpg downloaded!')
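For large files, pulling the whole body into memory via .content is wasteful; stream=True plus iter_content writes the response to disk in chunks instead. A minimal sketch; the URL is a placeholder:
import requests

# Stream the body instead of loading it into memory all at once
with requests.get('https://example.com/big_image.jpg', stream=True) as response:
    with open('big_image.jpg', 'wb') as fp:
        for chunk in response.iter_content(chunk_size=8192):
            fp.write(chunk)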