# 全部代码,需先安装以下需要插入的模块
import requests
from bs4 import BeautifulSoup
import os
import time

# Make sure the output folder for downloaded videos exists.
# exist_ok=True replaces the separate exists()/mkdir() pair: it is a single
# call with no window between the check and the creation, and it raises
# FileExistsError if the name is already taken by a non-directory instead of
# failing later when the first video is written.
os.makedirs(r'梨视频数据', exist_ok=True)


def get_video(n):
    """Download every video listed on one page of the Pear Video category feed.

    Requests the category-loading endpoint (``categoryId=31``) with
    ``start=n``, then for each ``li.categoryem`` entry asks
    ``videoStatus.jsp`` for the video's source URL, rewrites it into the
    downloadable address and saves the response body as
    ``梨视频数据/<video_id>.mp4``.

    Args:
        n: Value substituted into the ``start`` query parameter of the
            listing request.

    Raises:
        KeyError: If the ``videoStatus.jsp`` JSON lacks ``videoInfo``,
            ``videos``, ``srcUrl`` or ``systemTime``.
        Exceptions raised by ``requests`` (connection errors, timeouts,
        invalid JSON) and by ``open`` propagate unchanged.
    """
    # Without a timeout a stalled connection would block the script forever.
    timeout = 30  # seconds

    res = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout,
    )
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select('li.categoryem')
    for li in li_list:
        a_tag = li.find(name='a')
        if a_tag is None:
            # find() returns None when the entry has no link; skip it rather
            # than fail on a_tag.get below.
            continue
        a_href_link = a_tag.get('href')
        if not a_href_link:
            continue
        # The href ends in "_<video id>"; keep only the id.
        video_id = a_href_link.split('_')[-1]
        # The status request is sent with the video's detail page as Referer.
        headers = {
            "Referer": "https://www.pearvideo.com/video_%s" % video_id
        }
        res1 = requests.get(
            'https://www.pearvideo.com/videoStatus.jsp',
            params={'contId': video_id},
            headers=headers,
            timeout=timeout,
        )
        data_dict = res1.json()
        src_url = data_dict['videoInfo']['videos']['srcUrl']
        system_time = data_dict['systemTime']
        # The returned srcUrl embeds systemTime; replacing it with
        # "cont-<video id>" gives the address that is actually downloaded.
        real_url = src_url.replace(system_time, 'cont-%s' % video_id)
        res2 = requests.get(real_url, timeout=timeout)
        file_path = os.path.join(r'梨视频数据', '%s.mp4' % video_id)
        with open(file_path, 'wb') as f:
            f.write(res2.content)
        # Pause between videos to keep the request rate down.
        time.sleep(0.5)


if __name__ == '__main__':
    # This loop used to be indented inside get_video, so it never ran at
    # module level and made every call recurse without end.
    # Fetches the listing pages at start = 12, 24 and 36.
    for start in range(12, 48, 12):
        get_video(start)

 

 

 

 

 # 插入模块

import requests

from bs4 import BeautifulSoup

import os

import time

 

# Create the folder named 梨视频数据 if it does not already exist.
# The original two lines were not valid Python (the `if` had no colon and its
# body was not indented); a single makedirs call with exist_ok=True does the
# check and the creation together.
os.makedirs(r'梨视频数据', exist_ok=True)

 

# 1.找规律(如下列两幅图所示):

 

 

 

 

 

# 研究表明,该页面每加载一次数据会增加十二个视频,因此在构造请求网址时将 start=12 改为 start=%s,并用 % n 传入,使其成为一个可控的参数

def get_video(n):
    """Download every video listed on one page of the Pear Video category feed.

    Annotated walkthrough version: requests the category-loading endpoint
    (``categoryId=31``) with ``start=n``, resolves each listed video's
    downloadable address through ``videoStatus.jsp`` and saves it as
    ``梨视频数据/<video_id>.mp4``.

    Args:
        n: Value substituted into the ``start`` query parameter of the
            listing request.

    Raises:
        KeyError: If the ``videoStatus.jsp`` JSON lacks ``videoInfo``,
            ``videos``, ``srcUrl`` or ``systemTime``.
        Exceptions raised by ``requests`` (connection errors, timeouts,
        invalid JSON) and by ``open`` propagate unchanged.
    """
    # Without a timeout a stalled connection would block the script forever.
    timeout = 30  # seconds

    # 1. Request one page of the listing. This is the URL the write-up
    #    identified from the page's data-loading requests, with start made
    #    into a parameter.
    res = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=31&start=%s' % n,
        timeout=timeout,
    )

    # 2. Parse the returned page to get the links to the detail pages.
    soup = BeautifulSoup(res.text, 'lxml')

    # 3. The detail links live inside the li.categoryem elements.
    li_list = soup.select('li.categoryem')

    # 4. Loop over the li elements and take the a tag inside each one.
    for li in li_list:
        a_tag = li.find(name='a')
        if a_tag is None:
            # find() returns None when the entry has no link; skip it rather
            # than fail on a_tag.get below.
            continue
        a_href_link = a_tag.get('href')
        if not a_href_link:
            continue

        # The detail page does not load the video data directly, so sending a
        # GET request to the detail-page address is of no use.
        #
        # Worked example for detail page video_1742158:
        #   address requested dynamically by the page:
        #     https://www.pearvideo.com/videoStatus.jsp?contId=1742158&mrd=0.9094028515390931
        #     contId: 1742158
        #     mrd: 0.7561643508416624 (a random decimal between 0 and 1;
        #          this script does not send it)
        #   core data returned by that request:
        #     https://video.pearvideo.com/mp4/adshort/20210920/1632283823415-15771122_adpkg-ad_hd.mp4
        #   real video address:
        #     https://video.pearvideo.com/mp4/adshort/20210920/cont-1742158-15771122_adpkg-ad_hd.mp4

        # The detail-page data is loaded dynamically, so the request address
        # was taken from the browser's network panel. The href ends in
        # "_<video id>"; keep only the id.
        video_id = a_href_link.split('_')[-1]
        # Anti-hotlinking: the status request must carry the detail page as
        # its Referer.
        headers = {
            "Referer": "https://www.pearvideo.com/video_%s" % video_id
        }
        res1 = requests.get(
            'https://www.pearvideo.com/videoStatus.jsp',
            params={'contId': video_id},
            headers=headers,
            timeout=timeout,
        )
        data_dict = res1.json()
        src_url = data_dict['videoInfo']['videos']['srcUrl']
        system_time = data_dict['systemTime']

        # How to turn the returned URL into the real one: systemTime is the
        # key. Replacing it inside srcUrl with "cont-<video id>" gives the
        # real address.
        real_url = src_url.replace(system_time, 'cont-%s' % video_id)

        res2 = requests.get(real_url, timeout=timeout)
        file_path = os.path.join(r'梨视频数据', '%s.mp4' % video_id)
        with open(file_path, 'wb') as f:
            f.write(res2.content)
        # Pause 0.5 s before the next video; crawling without a pause risks
        # getting blocked.
        time.sleep(0.5)


if __name__ == '__main__':
    # Crawl the listing pages at start = 12, 24 and 36, stepping by 12.
    for start in range(12, 48, 12):
        get_video(start)