Python m3u8视频爬取

一、爬取未加密m3u8视频

import os

import requests
import aiohttp
import asyncio
from urllib.parse import urljoin


# Fetch segment data asynchronously with a coroutine
async def get_ts_data(url, headers):
    """Download one ts segment asynchronously.

    Args:
        url: Absolute URL of the segment to fetch.
        headers: HTTP request headers sent with the request.

    Returns:
        A ``(data, url)`` tuple: the raw response body as bytes and the URL
        it was fetched from, so the done-callback can derive a file name.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (400 or above).
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager; no extra await.
        async with session.get(url=url, headers=headers) as response:
            # Fail loudly on HTTP errors so an error page is never handed
            # to the callback as if it were segment data.
            response.raise_for_status()
            return await response.read(), url


# Done-callback
def download_data(t):
    """Write the segment fetched by a finished task into ``./video``.

    The file name is the last path component of the segment URL with any
    query string removed. An already existing file is left untouched.

    Args:
        t: A finished future/task whose result is ``(data, url)``.

    Returns:
        None. A failed or cancelled task is reported on stdout and skipped
        instead of raising inside the callback.
    """
    # cancelled() must be checked first: exception() raises on a cancelled task.
    if t.cancelled():
        print('下载失败: 任务已取消')
        return
    error = t.exception()
    if error is not None:
        print(f'下载失败: {error!r}')
        return

    result = t.result()
    data, url = result[0], result[1]
    file_name = url.split('?')[0].split('/')[-1]
    target = f'./video/{file_name}'
    if not os.path.exists(target):
        with open(target, 'wb') as fp:
            fp.write(data)
        print(f'{file_name} 下载成功!')


if __name__ == '__main__':
    # Segments are stored in ./video; the merged video is written to ./111,
    # which ffmpeg does not create by itself.
    os.makedirs('./video', exist_ok=True)
    os.makedirs('./111', exist_ok=True)

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    m3u8_url = 'https://hw-vod.cdn.huya.com/leaf/1048585/59a91b252545dcc3e53b1e831b45d090/52151257/4296105_0eb951d28243f1ebb8e24172d34a1845_264_720_1.m3u8?hyvid=831457097&hyauid=1239534962032&hyroomid=1239534962032&hyratio=1300&hyscence=vod&appid=66&domainid=25&srckey=NjZfMjVfODMxNDU3MDk3&bitrate=178&client=110&definition=1300&pid=1239534962032&scene=vod&vid=831457097&u=0&t=100&sv=2303021737'

    # Fetch the playlist; abort on an HTTP error instead of parsing an error page.
    response = requests.get(url=m3u8_url, headers=header, timeout=30)
    response.raise_for_status()
    m3u8_text = response.text

    ts_list = []
    video_path = os.path.join(os.getcwd(), 'video')
    # file.txt is the segment list later passed to ffmpeg's concat input.
    with open('./file.txt', 'w') as fp:
        for line in m3u8_text.splitlines():
            ts = line.strip()
            # Skip blank lines as well as playlist tags/comments.
            if not ts or ts.startswith('#'):
                continue
            # Resolve the (possibly relative) segment URI against the playlist URL.
            ts_list.append(urljoin(m3u8_url, ts))
            ts_name = ts.split('?')[0].split('/')[-1]
            fp.write(f"file '{os.path.join(video_path, ts_name)}'\n")

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        tasks = []
        for ts in ts_list:
            task = asyncio.ensure_future(get_ts_data(url=ts, headers=header), loop=loop)
            task.add_done_callback(download_data)
            tasks.append(task)
        # asyncio.wait() raises ValueError when given an empty collection.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

    # Merge only when every listed segment is present on disk.
    if ts_list and len(os.listdir('./video')) == len(ts_list):
        # Merge the ts segments into an mp4 video; ffmpeg must be on PATH.
        status = os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/out.mp4')
        if status == 0:
            print('合成视频成功')
        else:
            print('合成视频失败')
    else:
        print('ts合成视频有缺漏!')

二、爬取加密m3u8视频

爬取视频前需先安装 pycryptodome 模块

pip install pycryptodome
import re
import os

import asyncio
import requests
import aiohttp
from urllib.parse import urljoin
from Crypto.Cipher import AES


async def get_ts_data(url, headers, key, iv, index):
    """Download one encrypted ts segment asynchronously.

    Args:
        url: Absolute URL of the segment to fetch.
        headers: HTTP request headers sent with the request.
        key: Decryption key, passed through unchanged to the callback.
        iv: Initialization vector, passed through unchanged to the callback.
        index: Position of the segment, passed through unchanged; the
            callback uses it as the output file name.

    Returns:
        The list ``[data, url, key, iv, index]`` where ``data`` is the raw
        (still encrypted) response body as bytes.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (400 or above).
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager; no extra await.
        async with session.get(url=url, headers=headers) as response:
            # Fail loudly on HTTP errors so an error page is never decrypted
            # and saved as if it were a segment.
            response.raise_for_status()
            return [await response.read(), url, key, iv, index]


def download_ts(t):
    """Decrypt the segment fetched by a finished task and save it.

    The segment is decrypted with AES in CBC mode and written to
    ``./txclass/<index>.ts``. An already existing file is left untouched.

    Args:
        t: A finished future/task whose result is
            ``[data, url, key, iv, index]``.

    Returns:
        None. A failed or cancelled task, or a segment that cannot be
        decrypted, is reported on stdout and skipped instead of raising
        inside the callback.
    """
    # cancelled() must be checked first: exception() raises on a cancelled task.
    if t.cancelled():
        print('下载失败: 任务已取消')
        return
    error = t.exception()
    if error is not None:
        print(f'下载失败: {error!r}')
        return

    result = t.result()
    ts_data, key, iv, index = result[0], result[2], result[3], result[4]
    target = f'./txclass/{index}.ts'
    if os.path.exists(target):
        return

    try:
        aes = AES.new(key=key, mode=AES.MODE_CBC, iv=iv)
        ts_data = aes.decrypt(ts_data)
    except ValueError as exc:
        # Raised for an invalid key/IV length or for data whose length is
        # not a multiple of the AES block size (e.g. a truncated download).
        print(f'{index}.ts 解密失败: {exc}')
        return

    with open(target, 'wb') as fp:
        fp.write(ts_data)
    print(f'{index}.ts 下载成功!')


if __name__ == '__main__':
    def download_file(max_retries=5):
        """Download, decrypt and merge an AES-encrypted m3u8 video.

        Fetches the master playlist, follows its first variant playlist,
        downloads the key, then downloads every segment that is not yet in
        ``./txclass``. When at most three segments are missing, ffmpeg merges
        the files listed in ``file.txt`` into ``./111/tx_class.mp4``;
        otherwise the function calls itself again to fetch what is missing.

        Args:
            max_retries: How many further attempts are allowed when segments
                are still missing after a pass.

        Returns:
            None.
        """
        # ffmpeg does not create the output directory by itself.
        os.makedirs('./txclass', exist_ok=True)
        os.makedirs('./111', exist_ok=True)
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
        }
        url = 'https://1258712167.vod2.myqcloud.com/5a81e359vodtranssh1258712167/ff94f629387702305245426145/drm/voddrm.token.dWluPTE0NDExNTM3Nzg4NjU5MTI1NTtleHQ9MWMzNmNmYzRjODJkNzQ5NmM1ZGViZDBhYmU3ZjUwMjExY2VmMzdlZTQyZjg3MjI4YzlmNTMwNzZlOThiZGQ0NDFhMWM2NGIwMjA5ZDcxZTA1MDg2ZTIwNDYyMDNhZWZiYmU4NjQzYzFiN2NhMjQ5MGJmYmIzZjZlYzk4MWVkNjRkMGE0OWI1OTQ5ZjI1ZTJjO3VpZF90eXBlPTEwMTI7dWlkX29yaWdpbl91aWRfdHlwZT0xMDEyO3VpZF9vcmlnaW5fYXV0aF90eXBlPTEwMTI7dWlkX2FwcGlkPTE0MDAwMDAwMDg7Y2lkPTU3NzIxNTc7dGVybV9pZD0xMDU5Nzk3Mjc7dm9kX3R5cGU9MDtwbGF0Zm9ybT0z.master_playlist.m3u8?t=64120083&exper=0&us=1840130506774873878&sign=ac3fcfb0eb9802cc8daab1c621e4979c'

        # Master playlist: collect the variant playlist URIs (non-tag lines).
        m3u8_text = requests.get(url=url, headers=header, timeout=30).text
        m3u8_url_list = []
        for line in m3u8_text.splitlines():
            item = line.strip()
            if item and not item.startswith('#'):
                m3u8_url_list.append(item)

        # Media playlist: relative URIs inside it are resolved against its
        # own URL, not against the master playlist URL.
        media_url = urljoin(url, m3u8_url_list[0])
        m2_text = requests.get(url=media_url, headers=header, timeout=30).text
        key_url = re.findall('URI="(.*?)"', m2_text, re.S)[0]
        key = requests.get(url=urljoin(media_url, key_url), headers=header, timeout=30).content
        # NOTE(review): this IV is sixteen ASCII "0" characters (0x30), not
        # sixteen zero bytes, and only the URI attribute of the key tag is
        # read above - confirm this matches the stream.
        iv = b'0000000000000000'

        m2_url_list = []
        for line in m2_text.splitlines():
            item = line.strip()
            if item and not item.startswith('#'):
                m2_url_list.append(urljoin(media_url, item))

        # file.txt is the segment list later passed to ffmpeg's concat input.
        txclass_path = os.path.join(os.getcwd(), 'txclass')
        with open('./file.txt', 'w') as fp:
            for index in range(len(m2_url_list)):
                fp.write(f"file '{os.path.join(txclass_path, str(index))}.ts'\n")

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            tasks = []
            for index, m2_url in enumerate(m2_url_list):
                # Skip segments that a previous pass already saved.
                if os.path.exists(f'./txclass/{index}.ts'):
                    continue
                task = asyncio.ensure_future(get_ts_data(m2_url, header, key, iv, str(index)), loop=loop)
                task.add_done_callback(download_ts)
                tasks.append(task)
            # asyncio.wait() raises ValueError when given an empty collection.
            if tasks:
                loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()

        # Count once; one line was written to file.txt per segment.
        total = len(m2_url_list)
        downloaded = len(os.listdir('./txclass'))
        print(f'总数据:{total},已下载:{downloaded}')
        # Up to three missing segments are tolerated before merging.
        if total - downloaded <= 3:
            status = os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/tx_class.mp4')
            if status == 0:
                print('视频合成成功!')
            else:
                print('视频合成失败!')
            return

        if max_retries <= 0:
            # Stop instead of recursing forever when segments keep failing.
            print('ts视频缺失,无法合成!已达到最大重试次数。')
            return
        print('ts视频缺失,无法合成!正在重新下载缺失部分ts视频文件...')
        download_file(max_retries - 1)


    # Recursively re-fetch the ts files that failed to download
    download_file()

三、使用ffmpeg进行视频合并

# Merge the ts segments listed in file.txt into ./111/out.mp4 without
# re-encoding (-c copy); ffmpeg must be available on the PATH.
os.system('ffmpeg -f concat -safe 0 -i file.txt -c copy ./111/out.mp4')
posted @ 2023-03-08 02:43  与鹿逐秋  阅读(643)  评论(0)  编辑  收藏  举报