Python Crawler: Batch Downloading Videos from PearVideo (梨视频)
Enter the main page
- Get the link to each video's detail page
Click a link to enter the detail page
- Get the video address
- Without clicking play, only the cover image address can be taken from the HTML
- Open the browser's network capture tool and refresh the page: an ajax request appears whose response carries the mp4 link (see the sketch after this list)
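The srcUrl in that ajax response is not directly playable: its last path segment before the first '-' is a timestamp-like number, and it has to be replaced with cont-<contId> to become the real address (this is what get_video_url() in the script does). A minimal sketch of just that step, assuming the videoInfo.videos.srcUrl layout seen in the capture (the contId value here is hypothetical):

import requests

cont_id = '1740000'  # hypothetical contId, taken from a detail-page URL like video_1740000
resp = requests.get(
    'https://www.pearvideo.com/videoStatus.jsp',
    params={'contId': cont_id, 'mrd': '0.38177687506833946'},
    headers={
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,  # the script below also sends a Referer; the endpoint appears to require it
        'User-Agent': 'Mozilla/5.0'
    }
).json()

fake_url = resp['videoInfo']['videos']['srcUrl']
# .../20210801/<timestamp>-xxxx.mp4  ->  .../20210801/cont-<contId>-xxxx.mp4
real_url = fake_url.replace(fake_url.split('-')[0].split('/')[-1], 'cont-' + cont_id)
print(real_url)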
Get the video data and save it locally (naming files by the video title may cause errors; a possible workaround is sketched below)
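On that parenthetical: the titles are Chinese and can contain characters that Windows does not allow in file names (\ / : * ? " < > |), which is presumably why title-based naming "may cause errors" and why the script below names files video_<contId> instead. If title-based names are wanted, a minimal sanitizing sketch (safe_filename is not part of the original script):

import re

def safe_filename(title):
    # Replace characters that Windows forbids in file names with underscores
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()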
Get more pages
- On the home page, scrolling down loads more videos
- An ajax request appears; changing the start value returns different pages (start = 12 * page number, since each load returns 12 items), and parsing the response gives the new page's video links
- Build headers and params and make a GET request
Create a thread pool to speed up the downloads
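A note on the pool: the script below imports Pool from multiprocessing, which is a process pool. Downloading is I/O-bound, so a thread pool works just as well; multiprocessing.dummy.Pool is a drop-in alternative with the same map interface, backed by threads. A minimal sketch (download_video and video_page_urls refer to the script below):

from multiprocessing.dummy import Pool  # same Pool API as multiprocessing, but uses threads

pool = Pool(4)
# pool.map(download_video, video_page_urls)  # same call as in main() below
pool.close()
pool.join()

The complete script follows.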
import requests
from lxml import etree
import time
import os
from multiprocessing import Pool

# Local directory the videos are saved into
filepath = 'D:/PYT/video/'
def get_page_text(url):
    # Fetch the raw HTML of a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    page_text = requests.get(url=url, headers=headers).text
    return page_text
def get_video_page_url(page_text):
    # Parse the category page and collect the link to each video's detail page
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//li[@class="categoryem"]')
    video_page_urls = []
    print('url list:\n')
    for li in li_list:
        video_page_url = li.xpath('./div/a/@href')[0]
        video_page_urls.append('https://www.pearvideo.com/' + video_page_url)
        print(video_page_url)
    return video_page_urls
def get_video_url(url):
    # Get the real mp4 download link from the videoStatus.jsp ajax endpoint
    headers = {
        'Referer': url,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    contId = url.split('_')[-1]
    video_url = 'https://www.pearvideo.com/videoStatus.jsp'
    para = {
        'contId': contId,
        'mrd': '0.38177687506833946'
    }
    mp4_url_response = requests.get(url=video_url, headers=headers, params=para).json()
    mp4_url = mp4_url_response['videoInfo']['videos']['srcUrl']
    # The returned srcUrl is fake: swap its timestamp-like segment for 'cont-<contId>'
    mp4_url = mp4_url.replace(mp4_url.split('-')[0].split('/')[-1], 'cont-' + contId)
    print(mp4_url)
    return mp4_url
def get_title(url):
    # Get the Chinese title from the detail page (not used by download_video below)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    video_page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(video_page_text)
    title = tree.xpath('//div[@class="box-left clear-mar"]/h1/text()')[0]
    return title
def load_more_page_urls(base_page_url, no):
    # Get the detail-page links returned by the "load more" ajax request
    headers = {
        'Referer': base_page_url,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    load_url = 'https://www.pearvideo.com/category_loading.jsp?'
    params = {
        'reqType': '5',
        'categoryId': '135',
        'start': str(12 * int(no)),  # each load returns 12 items, so start = 12 * page number
        'mrd': '0.5547564352582317'
    }
    load_page_text = requests.get(url=load_url, headers=headers, params=params).text
    tree = etree.HTML(load_page_text)
    li_list = tree.xpath('//li[@class="categoryem"]')
    video_urls = []
    print('url list:')
    for li in li_list:
        url = li.xpath('./div[@class="vervideo-bd"]/a/@href')[0]
        video_urls.append('https://www.pearvideo.com/' + url)
        print(url)
    return video_urls
def get_video_data(url):
    # Download the raw video bytes
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
    }
    video_data = requests.get(url=url, headers=headers).content
    return video_data
def creatPath(path):
    # Create the save directory if it does not exist
    if not os.path.exists(path):
        os.makedirs(path)
def download_video(url):
    time.sleep(0.5)  # small delay to avoid hammering the server
    down_url = get_video_url(url)
    # Name the file by contId rather than the Chinese title to avoid bad filename characters
    title = 'video_' + url.split('_')[-1]
    video_data = get_video_data(down_url)
    with open(filepath + title + '.mp4', 'wb') as fp:
        fp.write(video_data)
    print("...download over..." + title)
def main():
    pool = Pool(4)
    creatPath(filepath)
    # pagenum = input("pagenum:")
    # Number of pages to crawl
    pagenum = 5
    current_page_no = 0
    # Detail-page links of the current batch
    video_page_urls = []
    base_page_url = 'https://www.pearvideo.com/category_135'
    while current_page_no < pagenum:
        # Page 0 comes from the static category page; later pages come from the "load more" ajax
        if current_page_no == 0:
            base_page_url_text = get_page_text(base_page_url)
            video_page_urls = get_video_page_url(base_page_url_text)
        else:
            video_page_urls = load_more_page_urls(base_page_url, current_page_no)
        current_page_no += 1
        pool.map(download_video, video_page_urls)
        # for url in video_page_urls:
        #     download_video(url)
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
This post is from 博客园 (cnblogs), author: w0000. Please credit the original link when reposting: https://www.cnblogs.com/w0000/p/lsp_dld.html