Downloading videos with Python
I like watching anime and study videos on Bilibili, and the second season of Re:Zero, which I had been waiting years for, had just started airing, so I decided to save the whole show locally. Time to open PyCharm and get to work...
- Download non-premium videos at up to 1080p
- Use multiple threads to speed up the download
- Add resumable downloads, since my local network is unreliable
- Support exiting with ^C in the terminal
Bilibili keeps all of a video's stream information in the window.__playinfo__ variable in the page source. It is just a JSON string, so a regular expression pulls it out easily:
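A minimal sketch of that extraction (the BV id is only an example, and error handling is omitted):

```python
import json
import re

import requests

# a browser User-Agent; any recent one will do
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
html = requests.get('https://www.bilibili.com/video/BV1J5411b7XQ',
                    headers=headers).text

# window.__playinfo__=<json> sits inside a <script> tag in the page source
playinfo = json.loads(re.findall(r'window.__playinfo__=(.*?)</script>', html)[0])
video_url = playinfo['data']['dash']['video'][0]['baseUrl']
audio_url = playinfo['data']['dash']['audio'][0]['baseUrl']
```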
With the playback URL in hand, requesting it directly from the browser returns 403 Forbidden. The developer tools show that Bilibili first sends an OPTIONS request, so the script asks for the server resource the same way.
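With requests that looks roughly like the following (continuing the sketch above with the video_url just extracted):

```python
import requests

session = requests.session()
headers['Referer'] = 'https://www.bilibili.com/video/BV1J5411b7XQ'

# preflight the resource the way the browser does, then issue ranged GETs
session.options(url=video_url, headers=headers, verify=False)
headers['Range'] = 'bytes=0-1024'
res = session.get(url=video_url, headers=headers, verify=False)

# the total size sits after the '/' in "Content-Range: bytes 0-1024/12345678"
total_size = int(res.headers['Content-Range'].split('/')[1])
```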
Note that the script needs your own cookie filled in, otherwise some videos cannot be fetched; and if you have a premium membership, you can also download member-only videos.
One more thing: Bilibili stores the video and audio streams separately, so after both are downloaded, they are merged with ffmpeg.
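The merge itself is a pure stream copy with no re-encoding; a sketch with placeholder file names:

```python
import subprocess

# copy the video and audio streams into one mp4 without re-encoding;
# the .m4s names are placeholders for the two downloaded streams
subprocess.run(['ffmpeg', '-i', 'video_out.m4s', '-i', 'audio_out.m4s',
                '-codec', 'copy', 'output.mp4'])
```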
3. Coding Details
To support resuming, we need to record the 'Range': 'bytes=xx-yy' header sent with each request, where xx and yy are the byte range we ask for. So we write a logger class that records this request state and saves it locally as JSON, which is easy to manipulate.
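One .log file per stream is enough. A sketch of the structure the Logger class below writes (the values are illustrative; the 'finshed' spelling matches the actual key used in the code):

```python
# <filename>.log, serialized with json.dumps()
{
    'time': '2020-09-03 11:44:00',    # when the log was last saved
    'filename': 'xxx-video_out.m4s',  # the file being written
    'filesize': 123456789,            # total bytes, from Content-Range
    'downsize': 4194304,              # bytes fetched so far
    'url': 'https://...',             # the stream url
    'threads': [                      # one resume point per thread
        {'start': 0, 'end': 30864197, 'finshed': False},
    ]
}
```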
To end the program with Ctrl-C, we use the signal module to catch signal.SIGINT and signal.SIGTERM, i.e. the Ctrl-C and termination signals, and control the threads with a global variable is_exit: when a signal arrives, is_exit is set to True and the threads exit. Note that after start()ing the threads we must not block on join(); a blocking join() would keep the main thread busy waiting, so it could not react to the signal in time. Instead we poll thread_list[i].is_alive(), and once all threads have finished, the program continues normally.
If one thread exits because of a network failure, the others are not told to exit; they keep downloading until they finish or hit an error themselves.
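Stripped of the download logic, the pattern looks like this (a minimal, self-contained sketch; the real version lives in assigned_tasks() below):

```python
import signal
import threading
import time

is_exit = False  # global flag polled by every worker

def handler(signum, frame):
    global is_exit
    is_exit = True  # workers notice this and break out of their loops

def worker():
    for _ in range(50):  # stand-in for a series of chunked requests
        if is_exit:
            break
        time.sleep(0.1)

signal.signal(signal.SIGINT, handler)   # ctrl-c
signal.signal(signal.SIGTERM, handler)  # termination signal

threads = [threading.Thread(target=worker, daemon=True) for _ in range(4)]
for t in threads:
    t.start()

# poll instead of join() so the main thread keeps handling signals
while any(t.is_alive() for t in threads):
    time.sleep(0.5)
```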
4. Implementation
""" =================================== -*- coding:utf-8 -*- Author :GadyPu E_mail :Gadypy@gmail.com Time :2020/9/3 0003 上午 11:44 FileName :bili_downloader.py =================================== """ import sys import signal import requests import time import math import os import json import re import threading import copy from lxml import etree from decimal import Decimal requests.packages.urllib3.disable_warnings() is_exit = False class Logger(object): def __init__(self, *args): if len(args) == 1: with open(os.path.join(os.getcwd(), args[0] + '.log'), 'r', encoding = 'utf-8') as rf: self.log_file = json.loads(rf.read()) else: self.log_file = { 'time': None, 'filename': args[0], 'filesize': args[1], 'downsize': 0, 'url': args[2], 'threads': [] } for i in range(args[3]): self.log_file['threads'].append({ 'start': 0, 'end': 0, 'finshed': False }) self.lock = threading.Lock() def update_log(self, start, end, id, step, finshed = False): with self.lock: self.log_file['threads'][id]['start'] = start self.log_file['threads'][id]['end'] = end self.log_file['downsize'] += step if finshed: self.log_file['threads'][id]['finshed'] = True def load_bound(self): with self.lock: return copy.deepcopy(self.log_file['threads']) def get_cur_size(self): with self.lock: return int(self.log_file['downsize']) def get_tot_size(self): with self.lock: return int(self.log_file['filesize']) def get_file_name(self): with self.lock: return self.log_file['filename'] def get_req_url(self): with self.lock: return self.log_file['url'] def save_log(self): with self.lock: self.log_file['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) file_path = os.path.join(os.getcwd(), self.log_file['filename'] + '.log') with open(file_path, 'w+', encoding = 'utf-8') as wf: wf.write(json.dumps(self.log_file)) def failed(self): with self.lock: for i in self.log_file['threads']: if not i['finshed']: return True return False def show_log(self): print(self.log_file) def clear(self): file_path = os.path.join(os.getcwd(), self.log_file['filename'] + '.log') if os.path.exists(file_path): os.remove(file_path) class DownThread(threading.Thread): def __init__(self, headers: dict, log: Logger, id: int, start, end, url, session): super().__init__() self.headers = copy.deepcopy(headers) self.file_name = log.get_file_name() self.thread_id = id self.log = log self.__start = start self.__end = end self.url = url self.session = session def run(self): fp = open(self.file_name, 'rb+') __start, __end = self.__start, self.__end __step = 1024 * 512 finshed = True b_read = 0 print(f'thread[{self.thread_id}] 开始下载 Range:{__start}-{__end}') t1 = time.time() i, tm = 0, (__end - __start + 1) // __step + 1 while __start <= __end: __to = __end if __start + __step > __end else __start + __step self.headers.update({'Range': 'bytes=' + str(__start) + '-' + str(__to)}) try: res = self.session.get(url = self.url, headers = self.headers, verify = False, timeout = 5) if res.status_code in [200, 206]: fp.seek(__start) fp.tell() fp.write(res.content) #print(f'thread[{self.thread_id}] 正在下载 Range:{__start}-{__to}') print('\rthread[{}] 正在下载{:^3.0f}%'.format(self.thread_id, (i / tm) * 100), end = '') i += 1 b_read += len(res.content) #self.log.update_log(__start, __end, self.thread_id, len(res.content)) if is_exit: print('\n', f'接收到^c信号 thread[{self.thread_id}]退出') self.log.update_log(__start, __end, self.thread_id, b_read) finshed = False break except Exception as e: print(e) print('\n', f'thread[{self.thread_id}]:网络错误请重新下载') self.log.update_log(__start, __end, 
self.thread_id, b_read) finshed = False break __start = __to + 1 if finshed: print('\n', f'thread[{self.thread_id}]->下载完成 花费:{round(time.time() - t1, 2)}s') self.log.update_log(__start - 1, __end, self.thread_id, b_read, True) fp.close() class DownBilibili(object): def __init__(self, url, thread_num, path, sesdata): self.headers = { 'Cookie': "SESSDATA=96b2608a%2C1617615923%2C7e662*a1;", 'origin': 'https://www.bilibili.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36' } self.url = url self.pat_src = r'window.__playinfo__=(.*?)</script>' self.pat_tit = r'<title.*>(.*?)</title>' self.pat_inf = r'window.__INITIAL_STATE__=(.*?)</script>' self.video_url = '' self.audio_url = '' self.titles = '' self.sesdata = sesdata self.thread_num = thread_num self.log = None self.out_put = [] if path != '': os.chdir(path) def get_file_size(self, e: int) -> str: if e <= 0: return '' t = ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] n = math.floor(math.log2(e) / math.log2(1024)) return str(Decimal(e / math.pow(1024, n)).quantize(Decimal("0.00"))) + t[n] def get_video_info(self): try: self.headers.update({'Referer': self.url}) if self.sesdata: self.headers.update({'Cookie': self.sesdata}) resp = requests.get(url = self.url, headers = self.headers, verify = False) self.titles = re.findall(self.pat_tit, resp.text)[0].split('_')[0] except Exception as err: print(err, '\n无法获取视频信息请重试') return False try: info_json = json.loads(re.findall(self.pat_src, resp.text)[0]) self.video_url = info_json['data']['dash']['video'][0]['baseUrl'] self.audio_url = info_json['data']['dash']['audio'][0]['baseUrl'] except: bv_info = json.loads(re.findall(self.pat_inf, resp.text)[0].split(';')[0]) ep_id = self.url.split('/')[-1] ep_id = ep_id[2:] if ep_id.startswith('ep') else '' vip_api = f"https://api.bilibili.com/pgc/player/web/playurl?cid={bv_info['epList'][0]['cid']}&bvid={bv_info['epList'][0]['bvid']}&ep_id={ep_id}&fnval=80" info_json = requests.get(url = vip_api, headers = self.headers, verify = False).json() if info_json['message'] != 'success': print('大会员专属视频,请填入SESSDATA后重试!') return False self.video_url = info_json['result']['dash']['video'][0]['baseUrl'] self.audio_url = info_json['result']['dash']['audio'][0]['baseUrl'] return True def get_download_size(self, src_url): try: session = requests.session() self.headers.update({'referer': self.url}) session.options(url = src_url, headers = self.headers, verify = False) self.headers.update({'Range': 'bytes=' + str(0) + '-' + str(1024)}) res = session.get(url = src_url, headers = self.headers, verify = False) size = int(res.headers['Content-Range'].split('/')[1]) return size, session except Exception: print('无法获取服务器资源请重试') return -1, None def handler(self, signum, frame): global is_exit is_exit = True def assigned_tasks(self, size: int, log_exit: bool, src_url: str, session): step = size // self.thread_num thread_list = [] if log_exit: bound = self.log.load_bound() for i in range(self.thread_num): __session = copy.copy(session) if log_exit: __x, __y = bound[i]['start'], bound[i]['end'] # if __x == __y: # continue t = DownThread(self.headers, self.log, i, __x, __y, src_url, __session) else: start = 0 if not i else end + 1 end = start + step if i != self.thread_num - 1 else size - 1 t = DownThread(self.headers, self.log, i, start, end, src_url, __session) t.setDaemon(True) t.start() thread_list.append(t) #[_.join() for _ in thread_list] signal.signal(signal.SIGINT, self.handler) 
signal.signal(signal.SIGTERM, self.handler) while True: alive = False for i in range(self.thread_num): alive |= thread_list[i].is_alive() if not alive: break time.sleep(0.5) def download(self, src_url, file_name): global is_exit is_exit = False size, session = self.get_download_size(src_url) if size == -1: print('无法获取服务器资源请重试') return False log_exit = os.path.exists(os.path.join(os.getcwd(), file_name + '.log')) if log_exit: self.log = Logger(file_name) if not self.log.failed(): return True cur = self.log.get_cur_size() url = self.log.get_req_url() print(f'继续上次下载进度[已下载:{self.get_file_size(cur)} {round(cur / size * 100, 2)}%]', url) print('开始重新下载') else: print(f'开始下载【{file_name}】、文件大小为{self.get_file_size(size)}') self.log = Logger(file_name, size, src_url, self.thread_num) with open(file_name, 'wb') as fp: fp.truncate(size) self.assigned_tasks(size, log_exit, src_url, session) self.log.save_log() if self.log.failed(): return False return True def work(self): res = self.get_video_info() if not res: print('无法获取视频下载链接请重试') return None self.titles = re.sub('[\/:*?"<>|]', '-' , self.titles) self.out_put = [self.titles + '-video_out.m4s', self.titles + '-audio_out.m4s'] for i in range(2): if not i: finshed = self.download(self.video_url, self.out_put[i]) else: finshed = self.download(self.audio_url, self.out_put[i]) if not finshed: print(f'【{self.out_put[i]}】' + '下载失败请重试') return None self.merge() print('Done!') def merge(self): print('开始合并视频音频文件') os.popen('ffmpeg -i ' + f'"{self.out_put[0]}"' + ' -i ' + f'"{self.out_put[1]}"' + ' -codec copy ' + f'"{self.titles}.mp4"').read() print('合并完成开始删除缓存文件') [os.remove(os.path.join(os.getcwd(), i)) for i in self.out_put] [os.remove(os.path.join(os.getcwd(), i + '.log')) for i in self.out_put] def print_usage(): print('''usage: bili_downloader [OPTION]... A simple tools for download bilibili videos. Support ctrl-c cancel task and restart task. You can run it by command line arguments. download options: -b bilibili videos id. eg: 'BV1J5411b7XQ' -t download threads number, default 4 -d the path that you want to save videos, default current dir path -s input your cookies, eg: 'SESSDATA=xx' to get the vip videos -h show this help message and exit ps: If you need to download a member video please enter your 'SESSDATA', otherwise you'd better not do that. Refer: <https://www.cnblogs.com/GadyPu/> ''') if __name__ == '__main__': url = 'https://www.bilibili.com/video/' args = sys.argv length = len(args) if length == 1: print_usage() else: kwargs = { 'bv': '', 'num': 4, 'path': '', 'sesdata': '' } if '-b' in args: id = args.index('-b') if id != length - 1: bv = args[id + 1] kwargs['bv'] = bv if bv.startswith('ss') or bv.startswith('ep'): url = 'https://www.bilibili.com/bangumi/play/' if '-h' in args: print_usage() sys.exit() if '-t' in args: id = args.index('-t') if id != length - 1: num = int(args[id + 1]) kwargs['num'] = num if '-d' in args: id = args.index('-d') if id != length - 1 and os.path.exists(args[id + 1]): kwargs['path'] = args[id + 1] if '-s' in args: id = args.index('-s') if id != length - 1: sesdata = args[id + 1] kwargs['sesdata'] = sesdata if kwargs['bv'] == '': print_usage() sys.exit() d = DownBilibili(url + kwargs['bv'], kwargs['num'], kwargs['path'], kwargs['sesdata']) d.work()
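Run it from the command line, for example to grab a video with 8 threads into a given directory (the id and path are just examples):

```
python bili_downloader.py -b BV1J5411b7XQ -t 8 -d D:\videos
```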
If you hit "[WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted", close the terminal, open a new one, and run the program again.
I don't have a premium membership, so I can't watch some shows the moment they air. I found another site, and a quick look showed that it stores its videos in m3u8 playlists. The implementation below supports resuming as well; take a look if you're interested (you'll need a proxy to reach the site).
""" =================================== -*- coding:utf-8 -*- Author :GadyPu E_mail :Gadypy@gmail.com Time :2020/8/22 0022 下午 01:41 FileName :fetch_video.py =================================== """ import os import re import sys import time import requests import urllib.parse import threading from queue import Queue import warnings warnings.filterwarnings("ignore") class FetchVideos(object): def __init__(self, url): self.url = url self.m3u8_url = None self.que = Queue() self.headers = { 'Referer': url, 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36', 'Connection': 'close' } self.is_write = False self.lock = threading.Lock() self.m4s_headers_url = None self.m4s_headers_bytes = b'' self.pat = r'mac_url=unescape\((.*)\)' self.download_failed_lists = []#'faild.txt' def unescape(self, _str): _str = urllib.parse.unquote(_str) return re.sub(r'%u([a-fA-F0-9]{4}|[a-fA-F0-9]{2})', lambda m: chr(int(m.group(1), 16)), _str) def get_m3u8_url(self): try: response = requests.get(url = self.url, headers = self.headers, verify = False) res = re.findall(self.pat, response.text)[0] res = self.unescape(res).split(r"'")[1].split('#')[-1] self.m3u8_url = res[res.find('$') + 1:] print('the m3u8 url is:' + self.m3u8_url) except: print('cannot get the m3u8_url...') sys.exit() def get_m4s_headers(self): try: response = requests.get(url = self.m4s_headers_url, headers = self.headers, verify = False) self.m4s_headers_bytes = response.content except: print('cannot get m4s headers...') sys.exit() def init_ts_url(self): page = 1 try: response = requests.get(url = self.m3u8_url, headers = self.headers, verify = False) api_url = self.m3u8_url[: self.m3u8_url.rfind('/') + 1] time.sleep(0.5) resp = requests.get(url = api_url + response.text.split('\n')[2], headers = self.headers, verify = False) print(resp.url) for i in resp.text.split('\n'): if i.endswith('.m4s'): self.que.put((page, api_url + i)) page += 1 elif i.startswith('#EXT-X-MAP:URI='): self.m4s_headers_url = api_url + i.split('#EXT-X-MAP:URI=')[1].split('"')[1] print(self.m4s_headers_url) print(f'total {self.que.qsize()} files to be downloading...') except: print('cannot get m4s_url_lists...') sys.exit() def download_ts_thread(self, path): while not self.que.empty(): d = self.que.get() try: resp = requests.get(url = d[1], headers = self.headers,\ verify = False, stream = True, timeout = 6) if resp.status_code == 200: file_path = os.path.join(path, f'{d[0]}.m4s') content = b'' for data in resp.iter_content(1024): if data: content += data with open(file_path, 'wb') as wf: wf.write(self.m4s_headers_bytes + content) print(f'{d[0]}.m4s is download complished...') time.sleep(0.2) else: print(f'the {d[0]}.m4s download failed...') with self.lock: self.download_failed_lists.append(d) except: print(f'the {d[0]}.m4s download failed...') if not self.is_write: with open('header.bin', 'wb') as wf: wf.write(self.m4s_headers_bytes) self.is_write = True with self.lock: self.download_failed_lists.append(d) def merge_ts_files(self, filename): print('start merging...') dir = os.getcwd() + r'\download\videos' merge_txt = [_ for _ in os.listdir(dir)] merge_txt.sort(key = lambda x: int(x.split('.')[0])) src = '' first = True for i in merge_txt: if first: src = i first = False else: src = src + '+' + i os.chdir(dir) os.system('copy /b ' + src + f' {filename}.mp4') print('video merge completed...') def down_faild(self, lists: list): for i in lists: if i: d = i.split(' ') self.que.put((d[0], d[1])) #print(d) 
with open('header.bin', 'rb') as rf: self.m4s_headers_bytes = rf.read() def remove_temp_files(self): path_text = os.path.join(os.getcwd(), 'failed.txt') path_bin = os.path.join(os.getcwd(), 'header.bin') if os.path.exists(path_text): os.remove(path_text) if os.path.exists(path_bin): os.remove(path_bin) def run(self): failed = False self.get_m3u8_url() with open('failed.txt', 'a+') as rf: rf.seek(0) lists = rf.read() if len(lists) > 1 and lists.split('\n')[0][:86] == self.m3u8_url[:86]: lists = lists.split('\n') self.down_faild(lists[1:]) failed = True else: self.init_ts_url() self.get_m4s_headers() down_path = os.getcwd() + r'\download\videos' if not os.path.exists(down_path): os.makedirs(down_path) if not failed: [os.remove(os.path.join(down_path, i)) for i in os.listdir(down_path)] thred_list = [] print('start downloading...') for i in range(3): t = threading.Thread(target = self.download_ts_thread, args = (down_path, )) t.setDaemon(True) thred_list.append(t) t.start() [t.join() for t in thred_list] if not len(self.download_failed_lists): self.remove_temp_files() self.merge_ts_files(self.m3u8_url.split('/')[4]) else: with open('failed.txt', 'w+', encoding = 'utf-8') as wf: wf.write(self.m3u8_url + '\n') [wf.write(str(i[0]) + ' ' + i[1] + '\n') for i in self.download_failed_lists] print(f'total {len(self.download_failed_lists)} files download failed...') if __name__ == '__main__': #Re0 season2 ep9 url = 'https://www.olevod.com/?m=vod-play-id-19657-src-1-num-9.html' d = FetchVideos(url = url) d.run()
5. Results
6. Changelog
- 2020.10.13 Premium members can now download member-only videos, at the highest quality by default, by filling in the corresponding 'SESSDATA'.