# B站路飞学城 Python 课:梨视频项目代码
"""Download every video listed on one pearvideo.com category page.

The script fetches the category list page, resolves the real MP4 address of
each listed video through the site's AJAX status endpoint, and then downloads
the files into ``./video`` using a small thread pool.
"""
import os
import random
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Site root; hrefs on the list page (e.g. "video_1708144") are relative to it.
BASE_URL = 'https://www.pearvideo.com/'
# Category list page that is crawled.
CATEGORY_URL = BASE_URL + 'category_5'
# AJAX endpoint that returns the (obfuscated) MP4 address. The MP4 link is
# loaded dynamically, so it is not present in the detail page HTML; this
# endpoint was found with the browser's network inspector.
VIDEO_STATUS_URL = BASE_URL + 'videoStatus.jsp?'
# Directory the downloaded files are written to.
VIDEO_DIR = './video'
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
}
# NOTE: a proxy (proxies={'https': '62.210.38.37:3838'}) was tried by the
# original author and was too slow, so requests are sent directly.

# Seconds to wait for a connection / for the next bytes of a response.
# Without a timeout a single stalled connection blocks the script forever.
REQUEST_TIMEOUT = 30
# Size of the pieces a video is streamed to disk in.
CHUNK_SIZE = 64 * 1024
# Number of concurrent download threads.
POOL_SIZE = 4

# Characters that are not allowed in file names on Windows (and '/' on POSIX).
_FILENAME_TRANSLATION = str.maketrans({char: '_' for char in '\\/:*?"<>|'})
# Failures that affect a single video only; that video is skipped.
_ENTRY_ERRORS = (
    IndexError,                 # an XPath query matched nothing / odd href
    KeyError,                   # AJAX reply without "videoInfo" (e.g. removed)
    ValueError,                 # AJAX reply is not valid JSON
    etree.LxmlError,            # detail page could not be parsed
    requests.RequestException,  # network error or timeout
)


def build_real_video_url(fake_url, cont_id):
    """Turn the address returned by the AJAX endpoint into the real one.

    The endpoint returns an address whose file name starts with a number
    that does not belong to the video; the working address has
    ``cont-<id>`` in that position instead:

    real: https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4
    fake: https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4

    :param fake_url: the ``srcUrl`` value from the AJAX reply.
    :param cont_id: the numeric video id as a string (e.g. ``'1708144'``).
    :return: the address with the first ``-``-separated field of the file
        name replaced by ``cont-<cont_id>``.
    """
    directory, slash, file_name = str(fake_url).rpartition('/')
    # Drop everything up to the first '-' and keep the remainder untouched.
    _, _, remainder = file_name.partition('-')
    return directory + slash + 'cont-' + cont_id + '-' + remainder


def safe_file_name(title):
    """Return *title* with characters that are illegal in file names replaced.

    :param title: the video title as shown on the site.
    :return: a non-empty string that is usable as a file name.
    """
    cleaned = str(title).strip().translate(_FILENAME_TRANSLATION)
    return cleaned or 'untitled'


def fetch_video_entry(li):
    """Resolve the title and the real MP4 address for one list item.

    :param li: an lxml ``<li>`` element from the category list.
    :return: ``{'name': <title>, 'url': <real mp4 address>}``.
    :raises IndexError, KeyError, ValueError, lxml.etree.LxmlError,
        requests.RequestException: when the page layout, the AJAX reply or
        the network does not behave as expected.
    """
    href = str(li.xpath('./div/a/@href')[0])
    cont_id = href.split('_')[1]

    # The title is taken from the video's detail page.
    detail_html = requests.get(
        url=BASE_URL + href, headers=HEADERS, timeout=REQUEST_TIMEOUT
    ).text
    detail_tree = etree.HTML(detail_html)
    name = detail_tree.xpath(
        '//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()'
    )[0]

    params = {
        'contId': cont_id,
        'mrd': str(random.random()),
    }
    # Without a Referer that points at the video's own page, the endpoint
    # answers that the video has been taken down.
    ajax_headers = dict(HEADERS)
    ajax_headers['Referer'] = BASE_URL + 'video_' + cont_id
    reply = requests.get(
        url=VIDEO_STATUS_URL,
        params=params,
        headers=ajax_headers,
        timeout=REQUEST_TIMEOUT,
    ).json()
    fake_url = reply["videoInfo"]['videos']["srcUrl"]

    return {
        'name': name,
        'url': build_real_video_url(fake_url, cont_id),
    }


def collect_video_entries():
    """Fetch the category page and resolve every video listed on it.

    Videos that cannot be resolved are reported and skipped, so one broken
    item does not abort the whole run.

    :return: list of ``{'name': ..., 'url': ...}`` dicts.
    """
    response = requests.get(
        url=CATEGORY_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    tree = etree.HTML(response.text)

    entries = []
    for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
        try:
            entries.append(fetch_video_entry(li))
        except _ENTRY_ERRORS as exc:
            print('解析失败,已跳过:', repr(exc))
    return entries


def get_video_data(dic_):
    """Download one video into ``VIDEO_DIR``.

    The body is streamed to a temporary ``.part`` file and renamed once it is
    complete, so a failed download never leaves a truncated ``.mp4`` behind
    and never damages a file from an earlier run.

    :param dic_: dict with the keys ``'name'`` (title) and ``'url'``
        (real MP4 address).
    """
    url_ = dic_['url']
    name = dic_['name']
    print(name, '正在下载.....')

    file_name = safe_file_name(name)
    if not file_name.lower().endswith('.mp4'):
        file_name += '.mp4'
    video_path = os.path.join(VIDEO_DIR, file_name)
    partial_path = video_path + '.part'

    try:
        with requests.get(
            url=url_, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT
        ) as response:
            # Do not save an HTML error page as if it were a video.
            response.raise_for_status()
            with open(partial_path, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        fp.write(chunk)
        os.replace(partial_path, video_path)
    except (requests.RequestException, OSError) as exc:
        print(name, '下载失败:', repr(exc))
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return

    print(name, '下载成功!!!!!')


def main():
    """Create the output directory, collect the videos and download them."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(VIDEO_DIR, exist_ok=True)

    urls = collect_video_entries()

    # Downloading is a slow, blocking operation, so it runs in a thread pool.
    pool = Pool(POOL_SIZE)
    try:
        pool.map(get_video_data, urls)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()