import re
import os
import requests
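# Step 1 (category listing request): https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=4&start=0
# Step 2 (video page URL = site root + href taken from the listing): https://www.pearvideo.com/ + 'video_1688698/'
# Step 3 (snippet inside the video page that carries the mp4 address): ldUrl="",srcUrl="http://video.pearvideo.com/mp4/third/20200731/cont-1689259-12308265-175332-hd.mp4",vdoUrl=srcUrl
# Step 4 (the mp4 URL that is downloaded): http://video.pearvideo.com/mp4/third/20200731/cont-1689259-12308265-175332-hd.mp4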
# Seconds to wait for the server before giving up; requests has no default
# timeout, so without this a stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30
# Size of each piece read from the network and written to disk. The default
# iter_content() chunk size is a single byte, which is extremely slow.
DOWNLOAD_CHUNK_SIZE = 64 * 1024

# Downloaded videos are stored in a "video" directory next to this script.
video_dir_path = os.path.join(os.path.dirname(__file__), 'video')
# exist_ok avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(video_dir_path, exist_ok=True)

# Request the listing page (reqType=5, categoryId=4, first page).
category_response = requests.post(
    'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=4&start=0',
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on an HTTP error instead of silently scraping an error page.
category_response.raise_for_status()
video_name_list = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', category_response.text)

for video_name in video_name_list:
    # Request the page of each video linked from the listing.
    video_response = requests.get(
        f'https://www.pearvideo.com/{video_name}/',
        timeout=REQUEST_TIMEOUT,
    )
    video_response.raise_for_status()
    video_mp4_list = re.findall(r'ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl', video_response.text)

    for video_mp4 in video_mp4_list:
        # Use the last path segment of the mp4 URL as the local file name.
        # A separate name is used so the outer loop variable is not clobbered.
        video_file_name = video_mp4.rsplit('/', 1)[-1]
        video_file_path = os.path.join(video_dir_path, video_file_name)

        # stream=True keeps the whole video from being loaded into memory;
        # the context manager releases the connection when the download ends.
        with requests.get(video_mp4, stream=True, timeout=REQUEST_TIMEOUT) as video_mp4_response:
            # Do not save an HTTP error body as if it were a video file.
            video_mp4_response.raise_for_status()
            with open(video_file_path, 'wb') as f:
                for chunk in video_mp4_response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
        print('爬取完毕!')
        break  # Without this break the crawl would hit the site too aggressively.
    break  # Likewise, stop after the first video page.