爬取梨视频多页视频数据（配图教学） - 簌小颜

# 全部代码（运行前需先安装下面导入的第三方模块：requests、beautifulsoup4、lxml）
import requests
from bs4 import BeautifulSoup
import os
import time

# Create the download folder up front. exist_ok=True makes this a single
# call, so there is no window between "check" and "create" in which another
# process could create the folder and make mkdir fail.
os.makedirs(r'梨视频数据', exist_ok=True)


def _resolve_video_url(video_id, timeout):
    """Return the real MP4 address for ``video_id``, or None if unavailable.

    Queries ``videoStatus.jsp`` with a Referer header pointing at the video's
    detail page, then rewrites the returned ``srcUrl`` by replacing the
    ``systemTime`` value with ``cont-<video_id>``.
    """
    headers = {
        "Referer": "https://www.pearvideo.com/video_%s" % video_id
    }
    status = requests.get('https://www.pearvideo.com/videoStatus.jsp',
                          params={'contId': video_id},
                          headers=headers,
                          timeout=timeout)
    try:
        data_dict = status.json()
        src_url = data_dict['videoInfo']['videos']['srcUrl']
        system_time = data_dict['systemTime']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or it lacks the expected fields: skip this video.
        return None
    return src_url.replace(system_time, 'cont-%s' % video_id)


def get_video(n, timeout=10):
    """Download every video listed on one page of the category feed.

    Requests the category listing starting at offset ``n``, resolves each
    listed video's real MP4 address and saves it as
    ``梨视频数据/<video_id>.mp4``. Entries that have no link, cannot be
    resolved, or whose download returns an error status are skipped so one
    bad item does not abort the whole page.

    Args:
        n: Value sent as the ``start`` query parameter of the listing request.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds ``timeout``.
    """
    listing = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout)
    soup = BeautifulSoup(listing.text, 'lxml')
    for li in soup.select('li.categoryem'):
        a_tag = li.find(name='a')
        href = a_tag.get('href') if a_tag is not None else None
        if not href:
            continue
        # Detail links look like "video_<id>"; the id is the last "_" segment.
        video_id = href.split('_')[-1]
        real_url = _resolve_video_url(video_id, timeout)
        if real_url is not None:
            video = requests.get(real_url, timeout=timeout)
            # Only write real video data; do not save an error page as .mp4.
            if video.ok:
                file_path = os.path.join(r'梨视频数据', '%s.mp4' % video_id)
                with open(file_path, 'wb') as f:
                    f.write(video.content)
        # Pause between videos to keep the request rate low.
        time.sleep(0.5)


# The driver loop belongs at module level: nested inside get_video it never
# ran on its own, and any call to get_video would recurse without end.
if __name__ == '__main__':
    # Listing offsets 12, 24 and 36 (the stop value 48 is excluded).
    for n in range(12, 48, 12):
        get_video(n)

# 导入模块

import requests

from bs4 import BeautifulSoup

import os

import time

# Make sure a folder named 梨视频数据 exists to hold the downloads; it is
# created if missing, and an existing folder is left untouched.
os.makedirs(r'梨视频数据', exist_ok=True)

# 1.找规律（如下列两幅图所示）：

# 研究表明，该页面每加载一次数据会增加十二个视频，因此我们在写网址的时候将 start=12 改为 start=%s，并用 % n 传入，使其成为一个可控的参数

def get_video(n, timeout=10):
    """Download every video listed on one page of the category feed.

    Args:
        n: Value sent as the ``start`` query parameter of the listing request.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds ``timeout``.
    """
    # The URL is the "load more" request of the category page (the one
    # circled in the tutorial's screenshot), with start made configurable.
    res = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout)

    # 2. Parse the returned page data to get the links to the detail pages.
    soup = BeautifulSoup(res.text, 'lxml')

    # 3. Study the video detail links (see the tutorial's screenshot).
    li_list = soup.select('li.categoryem')

    # 4. Loop over the list items and take the <a> tag inside each one.
    for li in li_list:
        a_tag = li.find(name='a')
        a_href_link = a_tag.get('href') if a_tag is not None else None
        if not a_href_link:
            # No link in this item: nothing to download.
            continue

        # The video data on the detail page is not loaded directly, so sending
        # a GET request to the detail address above is of no use.
        #
        # Example for the detail link video_1742158.
        #
        # Address the page requests dynamically:
        #   https://www.pearvideo.com/videoStatus.jsp?contId=1742158&mrd=0.9094028515390931
        #   contId: 1742158
        #   mrd: 0.7561643508416624  (a random decimal between 0 and 1)
        #
        # Core data returned by that dynamic request:
        #   https://video.pearvideo.com/mp4/adshort/20210920/1632283823415-15771122_adpkg-ad_hd.mp4
        # Real video address:
        #   https://video.pearvideo.com/mp4/adshort/20210920/cont-1742158-15771122_adpkg-ad_hd.mp4

        # The detail data is loaded dynamically, so the request address was
        # taken from the browser's network panel.
        video_id = a_href_link.split('_')[-1]
        # Anti-hotlinking: send a Referer pointing at the video's detail page.
        headers = {
            "Referer": "https://www.pearvideo.com/video_%s" % video_id
        }
        res1 = requests.get('https://www.pearvideo.com/videoStatus.jsp',
                            params={'contId': video_id},
                            headers=headers,
                            timeout=timeout)
        try:
            data_dict = res1.json()
            src_url = data_dict['videoInfo']['videos']['srcUrl']
            systemTime = data_dict['systemTime']
        except (ValueError, KeyError, TypeError):
            # Body was not JSON, or it lacks the expected fields: skip it.
            real_url = None
        else:
            # How to rewrite the core data: systemTime is the key. Replacing
            # it with "cont-<video_id>" yields the real video address.
            real_url = src_url.replace(systemTime, 'cont-%s' % video_id)

        if real_url is not None:
            res2 = requests.get(real_url, timeout=timeout)
            # Only write real video data; do not save an error page as .mp4.
            if res2.ok:
                file_path = os.path.join(r'梨视频数据', '%s.mp4' % video_id)
                with open(file_path, 'wb') as f:
                    f.write(res2.content)
        # Pause 0.5 s before the next video; per the tutorial author, crawling
        # without a pause gets the client banned.
        time.sleep(0.5)

if __name__ == '__main__':
    # Crawl the listing pages at offsets 12, 24 and 36 (the stop value 48 is
    # excluded); per the tutorial, each page load adds twelve videos.
    for n in range(12, 48, 12):
        get_video(n)

发表于 2021-09-23 20:49 簌小颜阅读(106) 评论(0) 编辑收藏举报