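# Full script. The third-party modules imported below (requests, bs4) must be installed first, together with the 'lxml' parser that BeautifulSoup is asked to use.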
import requests
from bs4 import BeautifulSoup
import os
import time
# Make sure the download directory exists before any video is saved.
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs(r'梨视频数据', exist_ok=True)
def _real_video_url(video_id, timeout):
    """Return the downloadable URL for ``video_id`` reported by videoStatus.jsp."""
    # The status endpoint is called with a Referer pointing at the video's
    # detail page, exactly as the original request did.
    headers = {
        "Referer": "https://www.pearvideo.com/video_%s" % video_id
    }
    res = requests.get('https://www.pearvideo.com/videoStatus.jsp',
                       params={'contId': video_id},
                       headers=headers,
                       timeout=timeout)
    res.raise_for_status()
    data_dict = res.json()
    src_url = data_dict['videoInfo']['videos']['srcUrl']
    system_time = data_dict['systemTime']
    # The returned srcUrl embeds systemTime; swapping that part for
    # "cont-<video_id>" yields the URL that is actually downloaded.
    return src_url.replace(system_time, 'cont-%s' % video_id)


def _save_video(url, file_path, timeout):
    """Stream ``url`` into ``file_path`` without buffering the whole body."""
    # Write to a temporary name first so an interrupted download never
    # leaves a truncated file under the final name.
    part_path = file_path + '.part'
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        with open(part_path, 'wb') as f:
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    os.replace(part_path, file_path)


def get_video(n, *, save_dir=r'梨视频数据', timeout=10, delay=0.5):
    """Download every video listed by one category-listing request.

    Requests ``category_loading.jsp`` with ``start`` set to ``n``, reads the
    first ``<a>`` link of each ``li.categoryem`` entry, resolves the real
    video URL through ``videoStatus.jsp`` and saves each video as
    ``<video_id>.mp4`` inside ``save_dir``.

    Args:
        n: Value substituted into the ``start`` query parameter.
        save_dir: Directory the ``.mp4`` files are written to; it is created
            when missing.
        timeout: Seconds passed as ``timeout`` to every HTTP request.
        delay: Seconds to sleep after each saved video.

    Raises:
        requests.RequestException: If a request fails, times out or returns
            an HTTP error status.
        KeyError: If the videoStatus JSON lacks the expected fields.
    """
    os.makedirs(save_dir, exist_ok=True)
    listing = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout,
    )
    listing.raise_for_status()
    soup = BeautifulSoup(listing.text, 'lxml')
    for li in soup.select('li.categoryem'):
        a_tag = li.find(name='a')
        a_href_link = a_tag.get('href') if a_tag is not None else None
        if not a_href_link:
            # Entry without a usable link: nothing to download.
            continue
        # The video id is the part of the href after the last underscore.
        video_id = a_href_link.split('_')[-1]
        real_url = _real_video_url(video_id, timeout)
        file_path = os.path.join(save_dir, '%s.mp4' % video_id)
        _save_video(real_url, file_path, timeout)
        # Pause between downloads to keep the request rate low.
        time.sleep(delay)
# Run the crawl for start values 12, 24 and 36 (range stops before 48).
for n in range(12, 48, 12):
    get_video(n)
# Import the required modules
import requests
from bs4 import BeautifulSoup
import os
import time
# Create the folder named 梨视频数据 if it does not exist yet.
# exist_ok=True replaces the separate exists() check and avoids the
# check-then-create race.
os.makedirs(r'梨视频数据', exist_ok=True)
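# 1. Find the pattern (the two figures this note refers to are not part of this file):
# each time the page loads more data, twelve more videos are added, so the hard-coded start=12 in the URL is replaced with %s and "% n", turning it into a controllable parameter.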
def _real_video_url(video_id, timeout):
    """Return the downloadable URL for ``video_id`` reported by videoStatus.jsp.

    Notes from the original investigation:
    the detail page does not load the video data directly, so sending a GET
    request to the detail-page address itself is useless. Example for
    video_1742158:

        address requested dynamically by the page
            https://www.pearvideo.com/videoStatus.jsp?contId=1742158&mrd=0.9094028515390931
            contId: 1742158
            mrd: 0.7561643508416624 (a random decimal between 0 and 1)
        core data returned by that request
            https://video.pearvideo.com/mp4/adshort/20210920/1632283823415-15771122_adpkg-ad_hd.mp4
        real video address
            https://video.pearvideo.com/mp4/adshort/20210920/cont-1742158-15771122_adpkg-ad_hd.mp4
    """
    # Anti-hotlinking: the request carries a Referer pointing at the video's
    # detail page.
    headers = {
        "Referer": "https://www.pearvideo.com/video_%s" % video_id
    }
    # The detail data is loaded dynamically; this address was found in the
    # browser's network panel.
    res = requests.get('https://www.pearvideo.com/videoStatus.jsp',
                       params={'contId': video_id},
                       headers=headers,
                       timeout=timeout)
    res.raise_for_status()
    data_dict = res.json()
    src_url = data_dict['videoInfo']['videos']['srcUrl']
    system_time = data_dict['systemTime']
    # How to repair the core data: systemTime is the key. Replacing it inside
    # srcUrl with "cont-<video_id>" gives the real video address.
    return src_url.replace(system_time, 'cont-%s' % video_id)


def _save_video(url, file_path, timeout):
    """Stream ``url`` into ``file_path`` without buffering the whole body."""
    # Write to a temporary name first so an interrupted download never
    # leaves a truncated file under the final name.
    part_path = file_path + '.part'
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        with open(part_path, 'wb') as f:
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    os.replace(part_path, file_path)


def get_video(n, *, save_dir=r'梨视频数据', timeout=10, delay=0.5):
    """Download every video listed by one category-listing request.

    Requests ``category_loading.jsp`` with ``start`` set to ``n``, reads the
    first ``<a>`` link of each ``li.categoryem`` entry, resolves the real
    video URL through ``videoStatus.jsp`` and saves each video as
    ``<video_id>.mp4`` inside ``save_dir``.

    Args:
        n: Value substituted into the ``start`` query parameter.
        save_dir: Directory the ``.mp4`` files are written to; it is created
            when missing.
        timeout: Seconds passed as ``timeout`` to every HTTP request.
        delay: Seconds to sleep after each saved video.

    Raises:
        requests.RequestException: If a request fails, times out or returns
            an HTTP error status.
        KeyError: If the videoStatus JSON lacks the expected fields.
    """
    os.makedirs(save_dir, exist_ok=True)
    # 1. Request the listing page; `start` is the controllable parameter.
    listing = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout,
    )
    listing.raise_for_status()
    # 2. Parse the returned page to get at the detail-page links.
    soup = BeautifulSoup(listing.text, 'lxml')
    # 3. Each video entry is an <li class="categoryem"> element.
    # 4. Loop over every li and read the <a> tag inside it.
    for li in soup.select('li.categoryem'):
        a_tag = li.find(name='a')
        a_href_link = a_tag.get('href') if a_tag is not None else None
        if not a_href_link:
            # Entry without a usable link: nothing to download.
            continue
        # The video id is the part of the href after the last underscore.
        video_id = a_href_link.split('_')[-1]
        real_url = _real_video_url(video_id, timeout)
        file_path = os.path.join(save_dir, '%s.mp4' % video_id)
        _save_video(real_url, file_path, timeout)
        # Pause before continuing to crawl; otherwise the crawler risks
        # being blocked.
        time.sleep(delay)
# Crawl in batches of 12: start values 12, 24 and 36 (range stops before 48).
for n in range(12, 48, 12):
    get_video(n)