爬取梨视频
代码实现
# 爬取梨视频
import requests
import re
import os
# Request headers shared by every request in this script: a desktop browser
# User-Agent string (the loop further down also adds a Referer entry).
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70'
    ),
}
# 第一步:
1、访问首页,选择爬取板块,分析请求url,本例中选择的“万象”
2、分析网站前端是哪个按钮给后端发请求拿回数据:点击加载1次返回24个视频,页面中的a标签跳转到视频详情页
因此,请求 URL: https://www.pearvideo.com/panorama_loading.jsp?start=24&filterIds=
3、利用正则分组,从a标签中提取视频详情页url,放在列表中
# Step 1: request the first batch of the panorama list (start=0) and collect
# the detail-page paths from the <a> tags with a regex capture group.
params = {'start': '0'}
res = requests.get(
    'https://www.pearvideo.com/panorama_loading.jsp?',
    headers=header,
    params=params,
    timeout=10,  # requests never times out by default; avoid hanging forever
)
video_url_list = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
# Example output: ['video_1772062', 'video_1772052', 'video_1772118', 'video_1772117'...]
print(video_url_list)
# 第二步:
1、在视频详情页源代码中找不到视频地址,说明该页面是通过ajax动态加载(以前详情页搜索video、mp4关键词能找到,现在没有了)
2、通过浏览器调试查看详情页的ajax请求,请求url:'https://www.pearvideo.com/videoStatus.jsp?contId=1731216'
3、从'video_1772062'中取出数字id(1772062),作为contId参数拼在ajax请求url后
4、响应分析
{
"resultCode":"1",
"resultMsg":"success", "reqId":"4e3bf737-5b00-4a4c-abd7-a7bf8e0a6ee0",
"systemTime": "1664520525405",
"videoInfo":{"playSta":"1","srcUrl":"https://video.pearvideo.com/mp4/third/20220928/1664520525405-12142151-105901-hd.mp4"}
}
# Steps 2 and 3: for every entry found on the list page, ask the videoStatus
# endpoint for the video address, rebuild the real mp4 URL, and save the file
# under ./videoData.
os.makedirs('videoData', exist_ok=True)  # open() below fails if the directory is missing
for video_url_num in video_url_list:
    video_url = 'https://www.pearvideo.com/' + video_url_num  # detail page URL
    header['Referer'] = video_url  # send the detail page URL as the Referer header
    video_id = video_url_num.split('_')[1]  # 'video_1772062' -> '1772062'
    res_video = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s' % video_id,
                             headers=header, timeout=10)

    # Pull the timestamp and the (masked) video address out of the response
    # text; whitespace after the colon is tolerated.
    system_time_match = re.search(r'"systemTime":\s*"(.*?)"', res_video.text)
    src_url_match = re.search(r'"srcUrl":\s*"(.*?)"', res_video.text)
    if system_time_match is None or src_url_match is None:
        # Skip this entry instead of aborting the whole run with an IndexError.
        print('skip %s: systemTime/srcUrl not found in videoStatus response' % video_url_num)
        continue
    system_time = system_time_match.group(1)
    src_url = src_url_match.group(1)

    # src_url cannot be fetched directly: the site puts the timestamp where
    # 'cont-<id>' belongs.
    # returned: https://video.pearvideo.com/mp4/third/20220928/1664521226053-15860034-104415-hd.mp4
    # real:     https://video.pearvideo.com/mp4/third/20220928/cont-1771974-15860034-104415-hd.mp4
    mp4_url = src_url.replace(system_time, 'cont-%s' % video_id)
    print(mp4_url)

    # Step 3: download and save the video.
    video_name = mp4_url.rsplit('/', 1)[-1]  # last path segment ('cont-<id>-...mp4') as file name
    file_path = os.path.join(r'videoData', video_name)
    # stream=True keeps the whole video out of memory; an explicit chunk size
    # avoids iter_content()'s default of one byte per iteration.
    with requests.get(mp4_url, headers=header, stream=True, timeout=10) as res_video_content:
        with open(file_path, 'wb') as f:
            for chunk in res_video_content.iter_content(chunk_size=64 * 1024):
                f.write(chunk)