# Pear Video (pearvideo.com) scraper -- 2021-07-15
# The original tutorial only covered downloading from a video's sub-page; this
# script adds fetching the sub-page URLs from the category (main) page.
# Thanks to the Bilibili uploader; the sub-page download part below is adapted
# from that tutorial. TODO: add multiprocess/distributed downloading to improve
# throughput. Improvements and suggestions are welcome.
# =================== Fetch sub-page URLs from the category page ===================
# Flat script: asks for a category id and a page count, walks the category's
# AJAX listing endpoint, resolves each video's real mp4 URL, and saves it
# under ./video/<contId>.mp4.
import os

import requests
from lxml import etree

# Category id (e.g. 31 = cars, 8 = tech, 4 = entertainment, 59 = music, 5 = life).
a = int(input('请输入爬取视频的分类,举例:31是汽车,8科技,4娱乐,59是音乐,5是生活..............等等,剩下的我就不提示了请输入:'))
# Number of pages to download; each listing page holds 12 videos.
b = int(input("请输入想要下载此分类的视频页数,一页是12个视频,没有区间选择是我懒,请输入:"))

os.makedirs('./video', exist_ok=True)  # BUG FIX: output dir must exist before writing

# The listing endpoint pages by item offset: start = 1, 13, 25, ...
for page_num, h in enumerate(range(1, b * 12 + 1, 12), start=1):
    home_cat_url = (
        'https://www.pearvideo.com/category_loading.jsp'
        '?reqType=5&categoryId={0}&start={1}'.format(a, h)
    )
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52',
        # BUG FIX: Referer must track the chosen category, not hard-coded 31.
        'Referer': 'https://www.pearvideo.com/category_{0}'.format(a),
    }
    # BUG FIX: all query parameters are already in the URL; the original also sent
    # a hard-coded form body (categoryId=31, start=24) via data= on a GET request.
    home_cat_page = requests.get(home_cat_url, headers=head)
    home_cat_page.encoding = 'utf-8'
    tree = etree.HTML(home_cat_page.text)
    # The AJAX fragment's root is a bare <li> list, hence this absolute xpath.
    hrefs = tree.xpath('/html/body/li/div/a/@href')
    # BUG FIX: typo 真在 -> 正在, and report the CURRENT page, not the total count.
    print('正在下载第{0}页'.format(page_num))
    for href in hrefs:
        # href looks like "video_1234567"; the numeric part is the content id.
        contid = href.split('_')[1]
        # =========================== 页面拼接 =================================
        ancienturl = 'https://www.pearvideo.com/video_' + contid
        # ====== Resolve the real mp4 URL via the videoStatus endpoint (tutorial part) ======
        vidostatus_jsp_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contid}&mrd=0.24120654366668948'
        sub_head = {
            'User-Agent': head['User-Agent'],
            # 防盗链 (anti-hotlink): Referer must be the video's own page.
            'Referer': ancienturl,
        }
        js_page = requests.get(vidostatus_jsp_url, headers=sub_head)
        info = js_page.json()  # BUG FIX: was `dict = ...`, shadowing the builtin
        trueurl = info['videoInfo']['videos']['srcUrl']
        systemtime = info['systemTime']
        # The srcUrl embeds a timestamp that must be swapped for "cont-<id>" to be valid.
        addurl = trueurl.replace(systemtime, f'cont-{contid}')
        # BUG FIX: the original used `a` as the file handle, clobbering the
        # category id needed to build the NEXT page's URL in the outer loop.
        with open('./video/{}.mp4'.format(contid), 'wb') as fh:
            fh.write(requests.get(addurl).content)