案例:抓取每日影视的m3u8视频
1 思路分析
练习:抓取 每日影视 首页/域名 https://sp.weoknow.com/
无耻之徒视频页:https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/1.html
# 分析
0:获得 视频m3u8的入口
https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/1.html
返回:正则匹配
"url":"https://v4.dious.cc/share/CjqUrWJmQFUs4Tab" # 本集的
"url_next":"https://v4.dious.cc/share/hVE09pB2daIjWASR" # 下一集的
1: 获得 index.m3u8的 url
https://v4.dious.cc/share/CjqUrWJmQFUs4Tab
返回: 正则匹配
var playlist = '[{"url":"/20220515/jXanPsgX/1200kb/hls/index.m3u8"}]'; # 其实可以直接到 真正index.m3u8这里
var main = "/20220515/jXanPsgX/index.m3u8";
2: 获得 带加密的 index.m3u8的 url : # 这步可以省略,因为第一步 可直接获得真实index.m3u8
https://v4.dious.cc/20220515/jXanPsgX/index.m3u8
返回:
#EXTM3U
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1200000,RESOLUTION=1280x720 # 指的是m3u8的视频参数:带宽为1200000 bps(约1200kb),分辨率为1280x720(即720P)
/20220515/jXanPsgX/1200kb/hls/index.m3u8
3:获得 后续的 ts文件的url
https://v4.dious.cc/20220515/jXanPsgX/1200kb/hls/index.m3u8
返回:
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:6
#EXT-X-PLAYLIST-TYPE:VOD
#EXT-X-MEDIA-SEQUENCE:0
#EXT-X-KEY:METHOD=AES-128,URI="/20220515/jXanPsgX/1200kb/hls/key.key" # 加密方法为 AES-128,及解密key文件的位置地址
#EXTINF:3.127,
/20220515/jXanPsgX/1200kb/hls/FmqG3vxv.ts
#EXTINF:3.127,
/20220515/jXanPsgX/1200kb/hls/ffzf4KLr.ts
2 完整代码
import os.path
import random
import re
import subprocess
import threading
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import requests
# Request headers passed as ``headers=`` to the requests.get() calls below:
# a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/114.0.0.0 Safari/537.36'
    ),
}
def get_main_ts_url(url):
    """
    Fetch a video play page and extract the share-page URLs embedded in it.

    The page is searched for a ``"url":"https:..."`` entry (this episode) and
    a ``"url_next":"https:..."`` entry (next episode). Backslashes are
    stripped from both matches (presumably JSON-escaped slashes).

    :param url: URL of the video play page
    :return: tuple ``(ts_domain, main_ts_url, next_main_ts_url)``:
        ``main_ts_url`` is this episode's share page,
        e.g. https://v4.dious.cc/share/GDdSkH33j9Wn8Akx;
        ``ts_domain`` is ``main_ts_url`` with its last two path components
        removed, used as the prefix for playlist/segment paths;
        ``next_main_ts_url`` is the next episode's share page, or ``None``
        when the page has no ``url_next`` entry (e.g. the last episode)
    :raises ValueError: if the page contains no ``"url"`` entry
    :raises requests.RequestException: on network errors, timeouts, or a
        4xx/5xx response
    """
    # A timeout keeps a stalled server from hanging the worker forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    html = response.text

    current = re.search(r'"url":"(https:.+?)"', html, re.M)
    if current is None:
        raise ValueError(f'no "url" entry found in play page: {url}')
    main_ts_url = current.group(1).replace('\\', '')
    ts_domain = main_ts_url.rsplit('/', 2)[0]

    # The next-episode link is optional; its absence must not abort the
    # download of the current episode.
    following = re.search(r'"url_next":"(https:.+?)"', html, re.M)
    next_main_ts_url = following.group(1).replace('\\', '') if following else None
    return ts_domain, main_ts_url, next_main_ts_url
def get_index_m3u8_url(ts_domain, main_ts_url):
    """
    Fetch an episode's share page and build the request URL of its index.m3u8.

    The first ``"url":"..."`` entry in the page is taken as the playlist
    location; backslashes are stripped from it.

    :param ts_domain: prefix prepended to the playlist path
    :param main_ts_url: URL of the episode's share page
    :return: request URL of the episode's index.m3u8
    :raises ValueError: if the page contains no ``"url"`` entry
    :raises requests.RequestException: on network errors, timeouts, or a
        4xx/5xx response
    """
    response = requests.get(main_ts_url, headers=headers, timeout=30)
    response.raise_for_status()

    found = re.search(r'"url":"(.+?)"', response.text, re.M)
    if found is None:
        raise ValueError(f'no "url" entry found in share page: {main_ts_url}')
    playlist_location = found.group(1).replace('\\', '')

    # An already-absolute URL must not be glued onto the domain prefix.
    if playlist_location.startswith(('http://', 'https://')):
        return playlist_location
    return ts_domain + playlist_location
def get_ts_url(index_m3u8_path, ts_domain):
    """
    Parse a downloaded index.m3u8 and collect the segment and key URLs.

    Lines starting with ``/`` are treated as segment paths and prefixed with
    ``ts_domain``. The ``URI`` attribute of an ``#EXT-X-KEY`` line gives the
    key location; if several key lines exist, the last one wins.

    :param index_m3u8_path: path of the local index.m3u8 file
    :param ts_domain: prefix prepended to segment and key paths
    :return: tuple ``(ts_url_list, key_url)``: the segment URLs in playlist
        order, and the decryption key URL (``None`` when the playlist has no
        ``#EXT-X-KEY`` line with a ``URI`` attribute)
    """
    ts_url_list = []
    key_url = None  # stays None for playlists without a key line
    with open(index_m3u8_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if line.startswith('#EXT-X-KEY'):
                # Non-greedy: stop at the URI's own closing quote even when
                # other quoted attributes follow on the same line.
                found = re.search(r'URI="(.+?)"', line)
                if found:
                    key_uri = found.group(1)
                    if key_uri.startswith(('http://', 'https://')):
                        key_url = key_uri
                    else:
                        key_url = ts_domain + key_uri
            elif line.startswith('/'):
                ts_url_list.append(ts_domain + line)
    return ts_url_list, key_url
def download_file(ts_dir_path, url, filename):
    """
    Download a single file, e.g. index.m3u8 or the key file (saved as key.m3u8).

    The target directory is created if it does not exist yet.

    :param ts_dir_path: directory of the episode's ts files
    :param url: URL of the file to download
    :param filename: file name to save under
    :return: full path of the downloaded file (``ts_dir_path/filename``)
    :raises requests.RequestException: on network errors, timeouts, or a
        4xx/5xx response; nothing is written in that case
    """
    # exist_ok avoids the check-then-create race between parallel workers.
    os.makedirs(ts_dir_path, exist_ok=True)

    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly rather than saving an HTTP error page as the playlist/key.
    response.raise_for_status()

    path = f'{ts_dir_path}/{filename}'
    with open(path, 'wb') as f:
        f.write(response.content)
    return path
def download_video_one(ts_dir_path, ts_url, filename):
    """
    Download a single ts segment unless it is already on disk.

    :param ts_dir_path: directory of the episode's ts files
    :param ts_url: URL of the ts segment
    :param filename: name to save under, without extension; segments are
        numbered 0..n, e.g. ``0`` is saved as ``0.ts``
    :return: None
    :raises requests.RequestException: on network errors, timeouts, or a
        4xx/5xx response; no ``.ts`` file is created in that case
    """
    target = f'{ts_dir_path}/{filename}.ts'
    if os.path.exists(target):
        return

    # Random 1-3 second pause before each request.
    time.sleep(random.randint(1, 3))
    response = requests.get(ts_url, headers=headers, timeout=60)
    # An error body saved as N.ts would be skipped by the exists-check on
    # every later run, so refuse to write it.
    response.raise_for_status()
    print(os.getpid(), threading.current_thread().name, ts_url)

    # Write to a temporary name and rename, so an interrupted write never
    # leaves a truncated N.ts that looks complete.
    partial = f'{target}.part'
    with open(partial, 'wb') as f:
        f.write(response.content)
    os.replace(partial, target)
def start_thread(ts_dir_path, ts_url_list):
    """
    Download all ts segments with a thread pool (up to 50 worker threads).

    Segment ``i`` of ``ts_url_list`` is passed to ``download_video_one`` with
    ``i`` as its file name. The call returns once every download has finished.

    :param ts_dir_path: directory of the episode's ts files
    :param ts_url_list: list of ts segment URLs, in playlist order
    :return: list of ``(index, ts_url)`` pairs whose download raised an
        exception; empty when everything succeeded
    """
    submitted = []
    with ThreadPoolExecutor(max_workers=50) as executor:
        for index, ts_url in enumerate(ts_url_list):
            future = executor.submit(download_video_one, ts_dir_path, ts_url, index)
            submitted.append((index, ts_url, future))

    # Leaving the with-block waits for all workers. A future's exception is
    # otherwise silently discarded, so surface each failure here.
    failed = []
    for index, ts_url, future in submitted:
        error = future.exception()
        if error is not None:
            print(f'{index}.ts 下载失败: {ts_url} ({error!r})')
            failed.append((index, ts_url))
    return failed
def update_index_m3u8(ts_dir_path, index_m3u8_path):
    """
    Rewrite a downloaded index.m3u8 in place so it points at local files.

    The ``URI`` attribute of each ``#EXT-X-KEY`` line is replaced with the
    local ``key.m3u8`` path (using forward slashes), and the n-th line that
    starts with ``/`` is replaced with the path of the local ``n.ts`` file.
    All other lines are kept unchanged.

    :param ts_dir_path: directory holding the ts files and key.m3u8
    :param index_m3u8_path: path of the index.m3u8 file to rewrite
    :return: None
    """
    with open(index_m3u8_path, 'r', encoding='utf-8') as f:
        original_lines = f.readlines()

    key_path = os.path.join(ts_dir_path, 'key.m3u8').replace('\\', '/')
    segment_index = 0
    rewritten = []
    for line in original_lines:
        if line.startswith('#EXT-X-KEY'):
            # Target only the URI attribute; other quoted attributes on the
            # line must survive. A callable replacement inserts the path
            # literally, with no backslash-escape processing.
            line = re.sub(r'URI="[^"]*"', lambda _match: f'URI="{key_path}"', line)
        elif line.startswith('/'):
            line = os.path.join(ts_dir_path, f'{segment_index}.ts') + '\n'
            segment_index += 1
        rewritten.append(line)

    # Write back with the same encoding the file was read with.
    with open(index_m3u8_path, 'w', encoding='utf-8') as f:
        f.writelines(rewritten)
def merge_video(ts_dir_path, video_name):
    """
    Merge the downloaded ts segments into ``<video_name>.mp4`` with ffmpeg.

    Nothing is done when the mp4 already exists. Otherwise ffmpeg is run with
    ``ts_dir_path`` as its working directory, reading ``index.m3u8`` and
    stream-copying into the mp4. The working directory of the current process
    is left unchanged.

    :param ts_dir_path: directory holding index.m3u8 and the ts files; the
        mp4 is written here as well
    :param video_name: output file name without the ``.mp4`` extension
    :return: None
    """
    output_name = f'{video_name}.mp4'
    if os.path.exists(os.path.join(ts_dir_path, output_name)):
        print(f'{video_name}.mp4 已经存在了!')
        return

    # Argument list + cwd= instead of a shell string + os.chdir: no shell
    # quoting issues with video_name and no process-wide side effect.
    command = ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', output_name]
    try:
        completed = subprocess.run(command, cwd=ts_dir_path, check=False)
    except OSError as error:
        # e.g. ffmpeg is not installed or ts_dir_path does not exist
        print(f'{video_name}.mp4 合并失败: {error}')
        return
    if completed.returncode != 0:
        print(f'{video_name}.mp4 合并失败, ffmpeg 返回码 {completed.returncode}')
def remove_ts(ts_dir_path):
    """
    Delete the intermediate download files directly inside ``ts_dir_path``.

    Removes files whose whole name is ``<digits>.ts`` or ends in ``.m3u8``
    (index.m3u8 and key.m3u8). Other files and subdirectories are left
    alone. A missing directory is ignored.

    :param ts_dir_path: directory holding the ts files
    :return: None
    """
    if not os.path.isdir(ts_dir_path):
        return

    # Escaped dots + fullmatch: names such as "10ts.mp4" or "x.m3u8.bak"
    # must not be deleted.
    intermediate = re.compile(r'\d+\.ts|.*\.m3u8')
    for file_name in os.listdir(ts_dir_path):
        file_path = os.path.join(ts_dir_path, file_name)
        if intermediate.fullmatch(file_name) and os.path.isfile(file_path):
            os.remove(file_path)
def run(ts_dir_path, url, video_name):
    """
    Download one episode end to end and merge it into an mp4.

    Steps, in order: resolve the play page to the playlist URL, save the
    playlist as ``index.m3u8``, save the decryption key as ``key.m3u8``,
    download every ts segment, rewrite the playlist to point at the local
    files, and merge with ffmpeg.

    :param ts_dir_path: directory for this episode's ts files
    :param url: URL of the episode's video play page
    :param video_name: name of the merged mp4 (without extension)
    :return: None
    """
    print(os.getpid(), f'开始下载第{video_name}集')

    # Play page -> share page -> playlist URL.
    ts_domain, share_page_url, _next_share_page_url = get_main_ts_url(url)
    playlist_url = get_index_m3u8_url(ts_domain, share_page_url)

    # Save the playlist locally, then read segment and key URLs out of it.
    playlist_path = download_file(ts_dir_path, playlist_url, 'index.m3u8')
    segment_urls, key_url = get_ts_url(playlist_path, ts_domain)

    # Fetch the key file and all ts segments.
    download_file(ts_dir_path, key_url, 'key.m3u8')
    start_thread(ts_dir_path, segment_urls)

    # Point the playlist at the local files and merge.
    update_index_m3u8(ts_dir_path, playlist_path)
    merge_video(ts_dir_path, video_name)
    # remove_ts(ts_dir_path) is not called here, so the downloaded segments
    # and playlist files stay on disk after merging.
def start(root_path='E:\\Shameless\\', episodes=range(2, 7), max_workers=2,
          url_template='https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/{}.html'):
    """
    Download several episodes in parallel with a process pool.

    Each episode ``i`` is handled by ``run`` in a worker process, using
    ``<root_path>/<i>/ts`` as its ts directory and ``str(i)`` as the video
    name. Called without arguments it downloads episodes 2-6 with two
    worker processes.

    :param root_path: root directory under which per-episode folders live
    :param episodes: iterable of episode numbers to download
    :param max_workers: number of worker processes
    :param url_template: play-page URL with one ``{}`` placeholder for the
        episode number
    :return: None
    """
    episode_list = list(episodes)
    if not episode_list:
        return

    submitted = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for i in episode_list:
            url = url_template.format(i)
            ts_dir_path = os.path.join(root_path, str(i), 'ts')
            submitted.append((i, executor.submit(run, ts_dir_path, url, str(i))))

    # Leaving the with-block waits for all workers. An exception raised in a
    # worker is stored on its future and would otherwise go unnoticed.
    for i, future in submitted:
        error = future.exception()
        if error is not None:
            print(f'第{i}集 下载失败: {error!r}')


# Run the downloader only when the file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    start()