Requests Module
1 The Requests Module
1.1 Introduction
Requests is a ready-made Python module for network requests, used to simulate a browser sending requests.
Requests supports HTTP keep-alive and connection pooling, cookie-based session persistence, file uploads, automatic detection of the response encoding, and automatic encoding of internationalized URLs and POST data. Under the hood, requests is implemented on top of urllib3.
Official site: Requests: HTTP for Humans™ — Requests 2.28.1 documentation
1.2 Installation
pip install requests
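After installing, a minimal sketch of a single GET request can be used to verify that everything works (httpbin.org is a public echo service used here purely for illustration):
import requests

# A single GET request; the response carries status code, headers, encoding, and body
response = requests.get("https://httpbin.org/get", params={"q": "python"}, timeout=5)
print(response.status_code)  # e.g. 200
print(response.encoding)     # encoding detected from the response headers
print(response.text[:200])   # first part of the response body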
2 Handling Text Data with Requests
2.1 Fetching the Sogou search page source
import requests

# Specify the URL
url = 'https://www.sogou.com/web'
kw = input("Enter the keyword to search for: ")
params = {
    "query": kw,
}
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request
response = requests.get(url=url, params=params, headers=headers)
# Fix the response encoding
response.encoding = "utf-8"
result = response.text
# Persist the result to a local file
with open(kw + ".html", 'w', encoding="utf-8") as f:
    f.write(result)
2.2 Fetching Douban movie data
import requests

# Specify the URL
url = 'https://movie.douban.com/j/chart/top_list'
start_kw = input("Enter the index of the first movie to fetch: ")
limit_kw = input("Enter how many movies to fetch: ")
params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": start_kw,
    "limit": limit_kw,
}
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request
response = requests.get(url=url, params=params, headers=headers)
# Fix the response encoding
response.encoding = "utf-8"
# Persist the result
with open("douban.txt", 'w', encoding="utf-8") as f:
    for movie in response.json():
        f.write(movie["title"] + " : " + movie["score"] + "\n")
2.3 Fetching the number and addresses of KFC stores for a given location
import requests
import math


def get_kfc_addr(kw, page="1"):
    # Specify the URL
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        "cname": "",
        "pid": "",
        "keyword": kw,
        "pageIndex": page,
        "pageSize": "10",
    }
    # Spoof the User-Agent to pass UA checks
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
    }
    # Send the request
    response = requests.post(url=url, data=data, headers=headers)
    # Fix the response encoding
    response.encoding = "utf-8"
    return response.json()


kw = input("Enter the location to search: ")
# Get the total number of stores in this location
nums = get_kfc_addr(kw)["Table"][0]["rowcount"]
# Compute how many result pages there are for this location
pages = math.ceil(int(nums) / 10)
# Persist the result
with open(kw + ".txt", 'w', encoding="utf-8") as f:
    f.write(f"Total number of stores: {nums}\n")
    for i in range(pages):
        session = get_kfc_addr(kw, str(i + 1))
        for store in session['Table1']:
            f.write(store["storeName"] + " : " + store["addressDetail"] + "\n")
3 Handling Image Data with Requests
3.1 Saving image data with the requests module
import requests

# Specify the URL
url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request; .content gives the binary response body
img_data = requests.get(url=url, headers=headers).content
# Persist the result
with open("img.gif", 'wb') as f:
    f.write(img_data)
3.2 Saving image data with urllib.request
from urllib import request

# Specify the URL
url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'
request.urlretrieve(url, filename="haha.gif")
- Simple, but urlretrieve does not let you spoof the User-Agent (see the sketch below for a urllib.request workaround).
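If a custom User-Agent is needed with the standard library, a minimal sketch using urllib.request.Request works (same sample image URL; the header value is just an example):
from urllib import request

url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'
# Build a Request object that carries a custom User-Agent header
req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with request.urlopen(req) as resp, open("haha_ua.gif", "wb") as f:
    f.write(resp.read())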
4 Data Parsing
Ways to implement data parsing:
- Regular expressions
- bs4
- xpath
- pyquery
4.1 Regular expressions
import requests
from urllib import request
import re
import os
import time

dirname = "./imglibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)
# Specify the URL template
url = 'https://xiaohua.zol.com.cn/qutu/%d.html'
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
for page in range(1, 3):
    new_url = format(url % page)
    page_text = requests.get(url=new_url, headers=headers).text
    # Extract the image URLs with a regular expression
    ex = '<div class="summary-text">.*?<img .*?src="(.*?)" alt.*?</div>'
    img_lists = re.findall(ex, page_text, re.S)
    time.sleep(3)
    for img in img_lists:
        img_name = img.split('/')[-1]
        img_path = dirname + "/" + img_name
        request.urlretrieve(img, filename=img_path)
        time.sleep(1)
        print(img_name, "downloaded successfully!")
re.S
extends the reach of "." (excluding the quotes) to the whole string, including "\n"; by default "." does not match newlines, as illustrated below.
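A minimal illustration of the difference (the sample string is made up):
import re

html = '<div>\nhello\n</div>'
print(re.findall('<div>(.*?)</div>', html))        # []  -- "." stops at "\n"
print(re.findall('<div>(.*?)</div>', html, re.S))  # ['\nhello\n']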
4.2 bs4
- How it works:
  - Instantiate a BeautifulSoup object and load the page source to be parsed into it.
  - Call the methods and attributes of the BeautifulSoup object to locate tags and extract data.
- Installing the environment:
pip install bs4
- Instantiating BeautifulSoup:
  - BeautifulSoup(fp, 'lxml'): loads the data of a locally stored HTML document into the instantiated BeautifulSoup object.
  - BeautifulSoup(page_text, 'lxml'): loads page source fetched from the internet into the instantiated BeautifulSoup object (a short sketch follows this list).
- The output is a list (e.g. find_all() and select() return lists of tags).
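A minimal sketch of the second form, parsing page source fetched over the network (the URL is only a placeholder):
import requests
from bs4 import BeautifulSoup

page_text = requests.get('https://www.example.com/', timeout=5).text
soup = BeautifulSoup(page_text, 'lxml')
print(soup.title)            # the <title> tag of the fetched page
print(soup.select('a')[:3])  # select() returns a list of matching tags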
4.2.1 How to use bs4
from bs4 import BeautifulSoup

f = open("sanguo.html", "r", encoding='utf-8')
# f is the page source; the second argument 'lxml' selects the lxml parser
soup = BeautifulSoup(f, 'lxml')
soup.div
soup.find('div', id='main_left')
soup.find('span', class_="nav-arrow")
soup.find_all('img', id='logo')
# ID selector
soup.select('#logo')
# class selector
soup.select('.bookmark-list')
# class selector with an exact child hierarchy (one level per ">")
soup.select('.bookmark-list > div > p')
# class selector spanning any number of levels
soup.select('.bookmark-list p')
# Extracting text
div_tag = soup.select('#top_left_menu')[0]
# Text content of the current tag only
div_tag.string
# All text content inside the current tag, returned as a str
div_tag.text
# Extracting attributes
img_tag = soup.find('img', class_='book-img')
img_tag['src']
img_tag = soup.select('.book-img')[0]
img_tag['src']
4.2.2 Crawling the full text of Romance of the Three Kingdoms with bs4
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
sg_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
sg_response = requests.get(url=sg_url, headers=headers)
sg_response.encoding = 'utf-8'
sg_text = sg_response.text
soup = BeautifulSoup(sg_text, 'lxml')
# Each chapter link lives inside the .book-mulu (table of contents) element
a_list = soup.select('.book-mulu a')
f_sg = open('sanguo.txt', 'w', encoding='utf-8')
for a in a_list:
    title = a.text
    url = 'https://www.shicimingju.com' + a['href']
    det_response = requests.get(url=url, headers=headers)
    det_response.encoding = 'utf-8'
    # Strip stray (non-breaking) spaces from the chapter page
    det_text = det_response.text.replace(u' ', u'')
    det_soup = BeautifulSoup(det_text, 'lxml')
    sg_content = det_soup.find('div', class_='chapter_content').text
    f_sg.write(title + '\n' + sg_content + '\n')
    f_sg.flush()
    print(title, 'downloaded successfully!')
f_sg.close()
4.3 xpath
- How it works:
  - Instantiate an etree object and load the page source to be parsed into it.
  - Use the xpath method of the etree object, combined with different xpath expressions, to locate tags and extract data.
- Installing the environment:
pip install lxml
- Instantiating the etree object:
  - Local parsing: etree.parse('test.html')
  - Web parsing: etree.HTML(page_text) (a short sketch follows this list)
- The return value of the xpath method is always a list.
- xpath expressions:
  - nodename : selects all child nodes of this node
  - / : selects starting from the root node
  - // : matches anywhere, i.e. selects the node from all nodes in the document regardless of position
  - . : selects the current node
  - .. : selects the parent of the current node (one level up)
  - @ : selects an attribute
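A minimal sketch of the web-parsing form, feeding page source fetched with requests into etree.HTML (the URL is only a placeholder):
import requests
from lxml import etree

page_text = requests.get('https://www.example.com/', timeout=5).text
tree = etree.HTML(page_text)
# xpath() always returns a list, even for a single match
print(tree.xpath('//title/text()'))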
4.3.1 How to use xpath
from lxml import etree

# tree = etree.parse('sanguo.html')
# Problem: XMLSyntaxError: Opening and ending tag mismatch: img line 75 and div, line 81, column 19
# This happens when the HTML is not well-formed and does not satisfy the strict XML parser.
# Fix: create an HTML parser yourself and pass it via the parser argument.
text = 'sanguo.html'
parser = etree.HTMLParser(encoding="utf-8")
html = etree.parse(text, parser=parser)

# Locating tags
# Search from the root node; "/" between tags denotes one level of hierarchy
html.xpath('/html/head/title')
# Search from any node, i.e. find all title tags in the document
html.xpath('//title')
# Match p tags anywhere under html/body
html.xpath('/html/body//p')
# Search for div starting from the current tag
html.xpath('/html/body/./div')

# Locating by attribute
# Locate the div tag whose class attribute equals 'book-mulu'
html.xpath('//div[@class="book-mulu"]')
# Locating by index; indices start at 1
# Locate the a tag inside the 3rd li tag under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//li[3]/a')

# Extracting text
# Text of the a tag inside the 3rd li tag under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//li[3]/a/text()')[0]
# Text of all a tags under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//a/text()')
# All text content under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//text()')
# All text content under the div whose class is 'book-mulu', joined into a single string
html.xpath('//div[@class="book-mulu"]')[0].xpath("string(.)")

# Extracting attribute values
# href attribute values of all a tags under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//a/@href')
4.3.2 Crawling page content with xpath
# Crawl the title, author, and content of each joke
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
url = 'https://xiaohua.zol.com.cn/lengxiaohua/'
xh_response = requests.get(url=url, headers=headers)
xh_response.encoding = 'gbk'
xh_page = xh_response.text
xh = etree.HTML(xh_page)
li_list = xh.xpath('//ul[@class="article-list"]/li')
for li in li_list:
    title = li.xpath('.//span[@class="article-title"]//a/text()')[0]
    author = li.xpath('.//div[@class="article-source"]')[0].xpath('string(.)')
    content = li.xpath('.//div[@class="summary-text"]//text()')
    content = ''.join(content)
    print(title + " : " + author + "\n" + content + "\n")
4.3.3 Crawling gallery images with xpath
import requests
from lxml import etree
import os
import time

dirname = "beautylibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
url_template = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 4):
    if page == 1:
        url = 'http://pic.netbian.com/4kmeinv/'
    else:
        url = format(url_template % page)
    xh_page = requests.get(url=url, headers=headers).text
    xh_etree = etree.HTML(xh_page)
    img_list = xh_etree.xpath('//ul[@class="clearfix"]/li/a')
    for img in img_list:
        img_url = "http://pic.netbian.com/" + img.xpath('.//img/@src')[0]
        # Use the following instead if you want the Chinese image title as the file name
        # img_name = img.xpath('.//b/text()')[0].encode('iso-8859-1').decode('gbk')
        img_name = img.xpath('.//img/@src')[0].split('/')[-1]
        img_data = requests.get(url=img_url, headers=headers).content
        img_path = dirname + '/' + img_name
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(img_name, 'downloaded successfully!')
        time.sleep(1)
4.3.4 Fetching all city names from a page
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
aqi_text = requests.get('https://www.aqistudy.cn/historydata/', headers=headers).text
aqi_etree = etree.HTML(aqi_text)
# "|" combines two xpath expressions: hot cities and the full city list
city_list = aqi_etree.xpath('//div[@class="hot"]/div[2]/ul/li/a | //div[@class="all"]/div[@class="bottom"]//a')
for city in city_list:
    city_name = city.xpath('./text()')[0]
    print(city_name)
4.4 Data-parsing exercise
- Download all free resume templates from https://sc.chinaz.com/jianli/free.html
import requests
from bs4 import BeautifulSoup
from lxml import etree
import os
import time

dir_name = "jianli"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
for page in range(1, 3):
    if page == 1:
        url = "https://sc.chinaz.com/jianli/free.html"
    else:
        url = "https://sc.chinaz.com/jianli/free_%d.html" % page
    jl_session = requests.session()
    jl_page = jl_session.get(url=url, headers=headers).text
    jl_soup = BeautifulSoup(jl_page, 'lxml')
    jl_items = jl_soup.select('#container a')
    for jl_item in jl_items:
        det_url = jl_item['href']
        det_session = requests.session()
        det_page = det_session.get(url=det_url, headers=headers).text
        det_etree = etree.HTML(det_page)
        # Re-decode the mis-encoded download link
        doc_url = det_etree.xpath('//*[@id="down"]/div[2]/ul/li[3]/a/@href')[0].encode('iso-8859-1').decode('utf-8')
        doc_name = dir_name + '/' + doc_url.split('/')[-1]
        doc_data = requests.get(url=doc_url, headers=headers).content
        with open(doc_name, 'wb') as df:
            df.write(doc_data)
            df.flush()
        print(doc_url)
        time.sleep(1)
5 Advanced Requests Features
5.1 Proxies
Proxies can be compared by how anonymous they are. By anonymity level, proxies fall into the following categories:
- Highly anonymous proxy: forwards packets unmodified, so to the server it looks like an ordinary client is visiting, and the recorded IP is the proxy server's IP.
- Ordinary anonymous proxy: makes some changes to the packets; the server may detect that a proxy is being used and has some chance of tracing the client's real IP.
- Transparent proxy: not only modifies the packets, but also tells the server the client's real IP.
- Spy proxy: a proxy server set up by an organization or individual to record the data users transmit, for research, monitoring, and similar purposes.
- Free proxy hosts can be scraped from: http://www.kxdaili.com/dailiip.html

The proxies argument of requests takes a single dict mapping the scheme to the proxy address (url and headers as in the earlier examples):
proxy_pool = {
    'http': '111.225.152.82:8089',
    'https': '111.225.152.82:8089',
}
page_response = requests.get(url=url, headers=headers, proxies=proxy_pool, timeout=5)
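Building on this, a minimal sketch of cycling through a small pool of candidate proxies until one responds; the addresses and the test URL below are placeholders, not working proxies:
import requests

headers = {"User-Agent": "Mozilla/5.0"}
test_url = 'https://www.baidu.com/'
# hypothetical pool of candidate proxies
candidates = [
    {'http': '111.225.152.82:8089'},
    {'http': '111.225.152.83:8089'},
]
for proxies in candidates:
    try:
        resp = requests.get(test_url, headers=headers, proxies=proxies, timeout=5)
        if resp.status_code == 200:
            print("usable proxy:", proxies)
            break
    except requests.RequestException:
        continue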
5.2 Handling cookies
- Manual handling: put the cookie into the headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
    "Cookie": "ASPSESSIONIDSQQDDDAS=PKJNIAHDMDCFENGBPLEJHLPN; Hm_lvt_95884c6bb5f83262667b4cf046ba2712=1669879562; ...",
}
- Automatic handling: the session object. You can create a session object and send requests with it just like with requests itself; the difference is that any cookie produced while sending requests through the session is stored in the session object automatically (a short demonstration follows).
jl_session = requests.session()
jl_page = jl_session.get(url=url, headers=headers).text
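A minimal sketch showing the automatic cookie persistence (httpbin.org is a public echo service used only for illustration):
import requests

s = requests.session()
# The first request sets a cookie; the session stores it automatically
s.get("https://httpbin.org/cookies/set?demo=1", timeout=5)
# The second request sends the stored cookie back without any manual handling
print(s.get("https://httpbin.org/cookies", timeout=5).json())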
5.3 Exercises
5.3.1 Scraping free nodes from a proxy site and testing whether they work
import json
import requests
from bs4 import BeautifulSoup


class GetProxyIP(object):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
    }
    # proxy_pool = [
    #     {"http": "120.77.172.30:8118"},
    # ]
    check_url = 'https://www.ip.cn/'

    # Check whether a proxy IP is usable
    def check_proxy_ip(self, proxies_ip):
        res = False
        try:
            print(proxies_ip)
            page_response = requests.get(url=self.check_url, headers=self.headers, proxies=proxies_ip, timeout=5)
            if page_response.status_code == 200:
                res = True
        except Exception as error_info:
            res = False
        return res

    # Scrape the free proxy IP list and build the proxy pool
    def get_proxy_pool(self):
        proxy_pool = []
        for page in range(1, 2):
            if page == 1:
                proxy_url = 'http://www.kxdaili.com/dailiip.html'
            else:
                proxy_url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
            proxy_text = requests.get(url=proxy_url, headers=self.headers).text
            proxy_soup = BeautifulSoup(proxy_text, 'lxml')
            proxy_ip_list = proxy_soup.select("tbody > tr")
            for proxy_ip in proxy_ip_list:
                ip = proxy_ip.select('td')[0].text
                port = proxy_ip.select('td')[1].text
                proxy_type = proxy_ip.select('td')[3].text
                if "," in proxy_type.strip():
                    ptype_list = proxy_type.split(',')
                    for p_type in ptype_list:
                        # requests expects lowercase scheme keys such as 'http' / 'https'
                        tmp_dict = {p_type.strip().lower(): ip.strip() + ":" + port.strip()}
                        if self.check_proxy_ip(tmp_dict):
                            proxy_pool.append(tmp_dict)
                else:
                    tmp_dict = {proxy_type.strip().lower(): ip.strip() + ":" + port.strip()}
                    if self.check_proxy_ip(tmp_dict):
                        proxy_pool.append(tmp_dict)
        print(proxy_pool)
        return proxy_pool

    def run(self):
        with open("proxy_list.txt", 'w', encoding='utf-8') as f:
            json.dump(self.get_proxy_pool(), f)


if __name__ == '__main__':
    ip = GetProxyIP()
    ip.run()
5.3.2 Simulating a login to the Gushiwen site
- Use requests.session() to keep the current session state for the follow-up requests.
- 古诗文网-古诗文经典传承 (gushiwen.cn)
import ddddocr
import requests
from lxml import etree

url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
# 1. Create a session object
gsw_session = requests.session()
gsw_text = gsw_session.get(url=url, headers=headers).text
# 2. Instantiate an etree object and parse the dynamic form parameters out of the page
gsw_etree = etree.HTML(gsw_text)
__VIEWSTATE = gsw_etree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = gsw_etree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
# 3. Extract the captcha image URL and recognize the captcha
code_src = gsw_etree.xpath('//*[@id="imgCode"]/@src')[0]
code_url = 'https://so.gushiwen.cn/' + code_src
code_data = gsw_session.get(url=code_url, headers=headers).content
ocr = ddddocr.DdddOcr()
code_text = ocr.classification(code_data)
# 4. Build the login payload, send the request, and save the result locally
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '123456@qq.com',
    'pwd': 'fcareypasswd',
    'code': code_text,
    'denglu': '登录',
}
print(data)
gsw_login_text = gsw_session.post(url=url, headers=headers, data=data).text
with open('login.html', 'w', encoding='utf-8') as f:
    f.write(gsw_login_text)
5.3.3 Crawling videos from Pearvideo
import re
import requests
from lxml import etree
import os
import time

# Create the directory for the downloaded videos
dirname = 'pearvideos'
if not os.path.exists(dirname):
    os.mkdir(dirname)
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
url = 'https://www.pearvideo.com/category_1'
pear_text = requests.get(url, headers=headers).text
# Parse the video list
pear_etree = etree.HTML(pear_text)
# A combined xpath that matches both list layouts
det_video_list = pear_etree.xpath('//*[@id="listvideoListUl"]/li | //*[@id="categoryList"]/li')
for det_video in det_video_list:
    det_video_url = 'https://www.pearvideo.com/' + det_video.xpath('./div/a/@href')[0]
    video_title = det_video.xpath('./div/a/div[2]/text()')[0]
    # Strip characters that are not allowed in file names
    video_title = re.findall(r'[^\*"/:?\\|<>]', video_title, re.S)
    video_title = ''.join(video_title)
    contId = det_video_url.split('_')[1]
    videoStatus_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + contId
    # Add a Referer header for the videoStatus interface
    det_video_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
        'Referer': det_video_url,
    }
    det_video_text = requests.get(url=videoStatus_url, headers=det_video_headers).json()
    video_url = det_video_text['videoInfo']['videos']['srcUrl']
    video_data = requests.get(video_url, headers=headers).content
    with open(dirname + '/' + video_title + '.mp4', 'wb') as f:
        f.write(video_data)
    time.sleep(3)