Python m3u8视频爬取
一、爬取未加密m3u8视频
import os
import requests
import aiohttp
import asyncio
from urllib.parse import urljoin
# Fetch segments asynchronously with coroutines.
async def get_ts_data(url, headers):
    """Download one media segment and return its bytes together with its URL.

    Args:
        url: URL of the segment to fetch.
        headers: HTTP headers sent with the GET request.

    Returns:
        A ``(body, url)`` tuple. The URL is passed through so the done
        callback can derive the output file name from it.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager; no extra await.
        async with session.get(url=url, headers=headers) as response:
            # Fail here rather than let an HTTP error page be saved as video.
            response.raise_for_status()
            return await response.read(), url
# Done callback: write a finished download to disk.
def download_data(t):
    """Save the segment produced by a completed ``get_ts_data`` task.

    The file name is the last path component of the segment URL with the
    query string removed, and the file is written under ``./video``. A file
    that already exists is left untouched.

    Args:
        t: A finished task/future whose result is ``(body, url)``.

    Returns:
        None. Failed or cancelled tasks are reported with a message and
        skipped instead of raising inside the event-loop callback.
    """
    # exception() itself raises on a cancelled future, so test that first.
    if t.cancelled():
        print('下载任务已取消!')
        return
    error = t.exception()
    if error is not None:
        print(f'下载失败: {error!r}')
        return

    result = t.result()
    data, url = result[0], result[1]
    file_name = url.split('?')[0].split('/')[-1]
    file_path = f'./video/{file_name}'
    if not os.path.exists(file_path):
        os.makedirs('./video', exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(data)
        print(f'{file_name} 下载成功!')
if __name__ == '__main__':
    # Segments are saved under ./video; the merged mp4 goes to ./111.
    os.makedirs('./video', exist_ok=True)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    m3u8_url = 'https://hw-vod.cdn.huya.com/leaf/1048585/59a91b252545dcc3e53b1e831b45d090/52151257/4296105_0eb951d28243f1ebb8e24172d34a1845_264_720_1.m3u8?hyvid=831457097&hyauid=1239534962032&hyroomid=1239534962032&hyratio=1300&hyscence=vod&appid=66&domainid=25&srckey=NjZfMjVfODMxNDU3MDk3&bitrate=178&client=110&definition=1300&pid=1239534962032&scene=vod&vid=831457097&u=0&t=100&sv=2303021737'
    m3u8_response = requests.get(url=m3u8_url, headers=header, timeout=30)
    # Stop early on an HTTP error instead of parsing an error page as a playlist.
    m3u8_response.raise_for_status()

    video_path = os.path.join(os.getcwd(), 'video')
    ts_list = []
    # file.txt is the segment list later passed to ffmpeg's concat demuxer.
    with open('./file.txt', 'w') as fp:
        for line in m3u8_response.text.splitlines():
            ts = line.strip()
            # Blank lines and '#' tag/comment lines are not segment URIs.
            if not ts or ts.startswith('#'):
                continue
            # Build the full segment URL relative to the playlist URL.
            ts_list.append(urljoin(m3u8_url, ts))
            ts_name = ts.split('?')[0].split('/')[-1]
            fp.write(f"file '{os.path.join(video_path, ts_name)}'\n")

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        tasks = []
        for ts in ts_list:
            task = asyncio.ensure_future(get_ts_data(url=ts, headers=header), loop=loop)
            task.add_done_callback(download_data)
            tasks.append(task)
        # asyncio.wait() raises ValueError when given an empty collection.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

    # Merge only when every listed segment is present in ./video.
    if ts_list and len(os.listdir('./video')) == len(ts_list):
        # ffmpeg does not create the output directory; ffmpeg must be on PATH.
        os.makedirs('./111', exist_ok=True)
        if os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/out.mp4') == 0:
            print('合成视频成功')
        else:
            print('合成视频失败!')
    else:
        print('ts合成视频有缺漏!')
二、爬取加密m3u8视频
爬取视频前需先安装pycryptodome模块
pip install pycryptodome
import re
import os
import asyncio
import requests
import aiohttp
from urllib.parse import urljoin
from Crypto.Cipher import AES
async def get_ts_data(url, headers, key, iv, index):
    """Download one encrypted segment and bundle it with its decryption data.

    Args:
        url: URL of the segment to fetch.
        headers: HTTP headers sent with the GET request.
        key: AES key, passed through unchanged to the done callback.
        iv: AES initialisation vector, passed through unchanged.
        index: Position of the segment in the playlist, passed through so the
            callback can name the output file.

    Returns:
        The list ``[body, url, key, iv, index]``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager; no extra await.
        async with session.get(url=url, headers=headers) as response:
            # Without this an HTTP error page would be decrypted and stored as
            # a segment, and would then count as "downloaded".
            response.raise_for_status()
            return [await response.read(), url, key, iv, index]
def download_ts(t):
    """Done callback: decrypt a fetched segment and save it as ./txclass/<index>.ts.

    Args:
        t: A finished task/future whose result is
            ``[body, url, key, iv, index]``.

    Returns:
        None. Failed or cancelled tasks and segments that cannot be decrypted
        are reported with a message and skipped; an existing output file is
        never overwritten.
    """
    # exception() itself raises on a cancelled future, so test that first.
    if t.cancelled():
        print('下载任务已取消!')
        return
    error = t.exception()
    if error is not None:
        print(f'下载失败: {error!r}')
        return

    result = t.result()
    ts_data, key, iv, index = result[0], result[2], result[3], result[4]
    ts_path = f'./txclass/{index}.ts'
    if os.path.exists(ts_path):
        return

    try:
        aes = AES.new(key=key, mode=AES.MODE_CBC, iv=iv)
        ts_data = aes.decrypt(ts_data)
    except ValueError as error:
        # Raised for an invalid key/IV length or for data that is not a whole
        # number of AES blocks, e.g. when the body is not real ciphertext.
        print(f'{index}.ts 解密失败: {error}')
        return

    os.makedirs('./txclass', exist_ok=True)
    with open(ts_path, 'wb') as fp:
        fp.write(ts_data)
    print(f'{index}.ts 下载成功!')
if __name__ == '__main__':
    def download_file(max_retries=3):
        """Download, decrypt and merge the segments of the encrypted stream.

        Each attempt fetches the master playlist, the first variant playlist
        it lists and the AES key, writes the ffmpeg concat list ``file.txt``,
        and downloads every segment not yet present in ``./txclass``. When at
        most three segments are missing the segments are merged with ffmpeg;
        otherwise the missing segments are retried.

        Args:
            max_retries: Number of additional attempts after the first one
                before giving up.
        """
        os.makedirs('./txclass', exist_ok=True)
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
        }
        url = 'https://1258712167.vod2.myqcloud.com/5a81e359vodtranssh1258712167/ff94f629387702305245426145/drm/voddrm.token.dWluPTE0NDExNTM3Nzg4NjU5MTI1NTtleHQ9MWMzNmNmYzRjODJkNzQ5NmM1ZGViZDBhYmU3ZjUwMjExY2VmMzdlZTQyZjg3MjI4YzlmNTMwNzZlOThiZGQ0NDFhMWM2NGIwMjA5ZDcxZTA1MDg2ZTIwNDYyMDNhZWZiYmU4NjQzYzFiN2NhMjQ5MGJmYmIzZjZlYzk4MWVkNjRkMGE0OWI1OTQ5ZjI1ZTJjO3VpZF90eXBlPTEwMTI7dWlkX29yaWdpbl91aWRfdHlwZT0xMDEyO3VpZF9vcmlnaW5fYXV0aF90eXBlPTEwMTI7dWlkX2FwcGlkPTE0MDAwMDAwMDg7Y2lkPTU3NzIxNTc7dGVybV9pZD0xMDU5Nzk3Mjc7dm9kX3R5cGU9MDtwbGF0Zm9ybT0z.master_playlist.m3u8?t=64120083&exper=0&us=1840130506774873878&sign=ac3fcfb0eb9802cc8daab1c621e4979c'
        ts_dir = os.path.join(os.getcwd(), 'txclass')

        for attempt in range(max_retries + 1):
            master_response = requests.get(url=url, headers=header, timeout=30)
            master_response.raise_for_status()
            m3u8_url_list = []
            for line in master_response.text.splitlines():
                item = line.strip()
                # Blank lines and '#' tag/comment lines are not playlist URIs.
                if item and not item.startswith('#'):
                    m3u8_url_list.append(item)

            # Only the first variant listed in the master playlist is used.
            media_url = urljoin(url, m3u8_url_list[0])
            media_response = requests.get(url=media_url, headers=header, timeout=30)
            media_response.raise_for_status()
            m2_text = media_response.text.strip()

            # Key and segment URIs appear in the media playlist, so relative
            # ones are resolved against its URL, not the master playlist's.
            key_uri = re.findall('URI="(.*?)"', m2_text, re.S)[0]
            key_response = requests.get(url=urljoin(media_url, key_uri), headers=header, timeout=30)
            key_response.raise_for_status()
            key = key_response.content
            # NOTE(review): this is sixteen ASCII '0' characters, not sixteen
            # zero bytes - confirm against the IV declared in the playlist.
            iv = b'0000000000000000'

            m2_url_list = []
            for line in m2_text.splitlines():
                item = line.strip()
                if item and not item.startswith('#'):
                    m2_url_list.append(urljoin(media_url, item))

            # file.txt is the segment list later passed to ffmpeg's concat demuxer.
            with open('./file.txt', 'w') as fp:
                fp.writelines(
                    f"file '{os.path.join(ts_dir, str(index))}.ts'\n"
                    for index, _ in enumerate(m2_url_list)
                )

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                tasks = []
                for index, m2_url in enumerate(m2_url_list):
                    # Segments saved by an earlier attempt are not fetched again.
                    if os.path.exists(f'./txclass/{index}.ts'):
                        continue
                    task = asyncio.ensure_future(
                        get_ts_data(m2_url, header, key, iv, str(index)), loop=loop)
                    task.add_done_callback(download_ts)
                    tasks.append(task)
                # asyncio.wait() raises ValueError when given an empty collection.
                if tasks:
                    loop.run_until_complete(asyncio.wait(tasks))
            finally:
                loop.close()

            # Count once; re-reading an exhausted file handle would yield 0.
            total = len(m2_url_list)
            downloaded = len(os.listdir('./txclass'))
            print(f'总数据:{total},已下载:{downloaded}')
            if total - downloaded <= 3:
                # ffmpeg does not create the output directory; ffmpeg must be on PATH.
                os.makedirs('./111', exist_ok=True)
                if os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/tx_class.mp4') == 0:
                    print('视频合成成功!')
                else:
                    print('视频合成失败!')
                return
            if attempt < max_retries:
                print('ts视频缺失,无法合成!正在重新下载缺失部分ts视频文件...')

        print('ts视频缺失,无法合成!')

    # Run the download; segments that failed are retried inside download_file().
    download_file()
三、使用ffmpeg进行视频合并
# Merge the .ts segments listed in file.txt into one mp4; ffmpeg must be on PATH.
# ffmpeg does not create a missing output directory, so make ./111 first.
os.makedirs('./111', exist_ok=True)
if os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/out.mp4') != 0:
    print('合成视频失败!')