Python爬取m3u8视频（多线程）- OK好用，代码在文章尾

摘自：https://www.cnblogs.com/python147/p/14511627.html

1.前言

本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理。

PS：如有需要Python学习资料的小伙伴可以点击下方链接自行获取

Python免费学习资料、代码以及交流解答点击即可加入

爬取视频的时候发现，现在的视频都是经过加密（m3u8），不再是mp4或者avi链接直接在网页显示，都是经过加密形成ts文件分段进行播放。

今天就教大家如何通过python爬取下载m3u8加密视频。

2.分析网页

1.电影视频来源


http://www.caisetv.com/

2.分析m3u8加密目录


http://www.caisetv.com/dongzuopian/chaidanzhuanjia/0-1.html

在视频播放的页面，通过F12可以查看网络数据包


https://xigua-cdn.haima-zuida.com/20210219/19948_fcbc225a/1000k/hls/index.m3u8

这里的ts就是电影的加密分段视频


https://xigua-cdn.haima-zuida.com/20210219/19948_fcbc225a/1000k/hls/

上面的m3u8链接去掉index.m3u8后，再拼上075a34cccdd000000.ts等ts名称就是分段视频的链接

如下所示：


https://xigua-cdn.haima-zuida.com/20210219/19948_fcbc225a/1000k/hls/075a34cccdd000000.ts

通过浏览器把这个分段视频下载后打开：

所以只要把所有的ts下载并合并就是完整的电影视频！！！

3.下载ts

1.下载ts分段视频

刚刚已经把ts的所有名称下载下来了

接下来通过python代码去读取这个文件，提取出名称，拼接链接后下载保存到一个文件夹里！


# User-Agent header sent by download(); the value is a desktop Firefox string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',}


def download(url, name, timeout=30):
    """Download ``url`` and write the response body to the file ``name``.

    Args:
        url: Address fetched with an HTTP GET.
        name: Path of the local file to create or overwrite.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than saving an HTML error page as a video segment.
    r.raise_for_status()
    with open(name, "wb") as code:
        code.write(r.content)

import os

with open("index.m3u8", "r") as f:
    ts_list = f.readlines()

urlheader="https://xigua-cdn.haima-zuida.com/20210219/19948_fcbc225a/1000k/hls/"

# download() opens the target file directly, so the folder has to exist first.
os.makedirs("cdzj2", exist_ok=True)

count = 0
for i in ts_list:
    i = i.strip()
    # Skip blank lines and "#..." playlist tags explicitly instead of
    # assuming the header is exactly five lines long.
    if not i or i.startswith("#"):
        continue
    # Zero-padded names sort in playlist order, which the name-ordered
    # "copy /b *.ts" merge step relies on (0, 1, 10, ... would not).
    download(urlheader + i, "cdzj2/" + str(count).zfill(6) + ".ts")
    count = count + 1
    print(count)

这样就可以把ts文件全部下载下来，但是一个一个下载很慢，下面通过多线程下载，提升下载速度！！！

2.多线程下载ts视频


from concurrent.futures import ThreadPoolExecutor

# A bounded pool replaces "one thread per segment"; leaving the with-block
# waits until every submitted download has finished.
futures = []
with ThreadPoolExecutor(max_workers=16) as pool:
    count = 0
    for i in ts_list:
        i = i.strip()
        # Skip blank lines and "#..." playlist tags.
        if not i or i.startswith("#"):
            continue
        # Number the files ourselves: slicing the tail of the remote name
        # only works for one naming scheme and can produce duplicates.
        futures.append(pool.submit(download, urlheader + i, "cdzj2/" + str(count).zfill(6) + ".ts"))
        count = count + 1

# Worker exceptions are stored on the future, so report them explicitly.
for future in futures:
    if future.exception() is not None:
        print("download failed: %s" % future.exception())

通过多线程很快就可以将这些ts文件下载到本地！！！

4.合并ts

cmd合并文件

copy /b   *.ts   new.ts
ffmpeg -i new.ts -c copy -map 0:v -map 0:a -bsf:a aac_adtstoasc output.mp4

ffmpeg下载
官网：https://ffmpeg.org/


通过这两条命令（cmd终端中运行），在含有ts文件的文件夹中就可以将ts文件合并（按名称顺序进行排列合并）为new.ts，再转换并保存成output.mp4

5.总结

1.分析m3u8加密文件
2.python下载ts文件
3.cmd合并ts保存成mp4格式

6.我的代码（全）

# -*- coding:utf-8 -*-
import os
import re
import requests
import shutil
import time
import threading
from pprint import pprint


def get_web_root(url):
    """Return the ``scheme://host[:port]`` prefix of an http(s) URL.

    Args:
        url: Absolute URL, e.g. ``http://example.com:8080/a/index.m3u8``.

    Returns:
        The root without a trailing slash, keeping the case used in ``url``,
        or ``''`` when ``url`` does not start with ``http://`` or
        ``https://`` followed by a host name.
    """
    # Host names are case-insensitive and may carry an explicit port; both
    # must be kept, otherwise root-relative segment paths resolve wrongly.
    reg = r'^(https?://[a-z0-9\-.]+(?::\d+)?)'
    m = re.match(reg, url, re.IGNORECASE)
    return m.group(1) if m else ''


def download(url, name, timeout=30):
    """Download ``url`` and write the response body to the file ``name``.

    Args:
        url: Address fetched with an HTTP GET.
        name: Path of the local file to create or overwrite.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than saving an HTML error page as a video segment.
    r.raise_for_status()
    with open(name, "wb") as code:
        code.write(r.content)


def parse_m3u8(web_root, urlheader, m3u8_local):
    """Extract the .ts segment entries from a downloaded m3u8 playlist.

    Args:
        web_root: ``scheme://host`` prefix prepended to entries that start
            with ``/``.
        urlheader: URL of the directory holding the playlist (no trailing
            slash), prepended to relative entries.
        m3u8_local: Path of the local playlist file.

    Returns:
        A list of ``(count, line, ts, ts_url, local_ts_name)`` tuples in
        playlist order: 1-based segment number, 1-based line number in the
        playlist, the entry as written, the URL to download it from, and a
        zero-padded local file name such as ``0000000001.ts``.
    """
    ts_list = []
    count = 0
    # Read as UTF-8 regardless of the platform's locale encoding; undecodable
    # bytes are replaced rather than aborting the parse.
    with open(m3u8_local, "r", encoding="utf-8", errors="replace") as f:
        for line, each_line in enumerate(f, start=1):
            ts = each_line.strip()
            # Blank lines and "#..." tag/comment lines carry no segment URI.
            if not ts or ts.startswith("#"):
                continue
            # Ignore a query string (e.g. an access token) when checking
            # the file extension.
            if not ts.split("?", 1)[0].endswith(".ts"):
                continue

            count += 1
            local_ts_name = str(count).zfill(10) + ".ts"
            if ts.lower().startswith(("http://", "https://")):
                # Already an absolute URL: use it unchanged.
                ts_url = ts
            elif ts.startswith("/"):
                ts_url = web_root + ts
            else:
                ts_url = urlheader + "/" + ts
            ts_list.append((count, line, ts, ts_url, local_ts_name))

    return ts_list


def m3u8_download_multi_thread(m3u8, mp4, thread_max=10):
    """Download a playlist and all of its .ts segments using worker threads.

    The playlist is saved as ``<mp4>/index.m3u8`` and the segments as
    ``<mp4>/ts/<zero-padded number>.ts``. The call returns once every
    download has finished or failed; failed segments are reported on stdout.

    Args:
        m3u8: URL of the m3u8 playlist.
        mp4: Name of the working directory (the output file name is reused
            as the directory name).
        thread_max: Maximum number of segments downloaded at the same time.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    ts_in = mp4 + "/" + "ts"
    os.makedirs(ts_in, exist_ok=True)  # mkdir -p ./a/b/c

    web_root = get_web_root(m3u8)
    urlheader = os.path.dirname(m3u8)
    m3u8_local = mp4 + "/index.m3u8"

    download(m3u8, m3u8_local)

    ts_list = parse_m3u8(web_root, urlheader, m3u8_local)
    total = len(ts_list)

    # A pool caps concurrency at thread_max without counting the main thread
    # or unrelated threads, and needs no sleep-polling to wait for completion.
    with ThreadPoolExecutor(max_workers=max(1, thread_max)) as pool:
        pending = {}
        for index, line, ts, ts_url, local_ts_name in ts_list:
            # Left-justify the entry in a field two characters wider than itself.
            print("line[%4d]=%-*s ==> %s [%4d/%4d]" % (line, len(ts) + 2, ts, local_ts_name, index, total))
            future = pool.submit(download, ts_url, ts_in + "/" + local_ts_name)
            pending[future] = (ts_url, local_ts_name)

        # Exceptions raised in workers are stored on the future; surface them
        # so a missing segment does not go unnoticed.
        for future in as_completed(pending):
            error = future.exception()
            if error is not None:
                ts_url, local_ts_name = pending[future]
                print("download failed: %s ==> %s (%s)" % (ts_url, local_ts_name, error))


def m3u8_download_single_thread(m3u8, mp4):
    """Download a playlist and its .ts segments one after another.

    The playlist is saved as ``<mp4>/index.m3u8`` and the segments as
    ``<mp4>/ts/<zero-padded number>.ts``, the same layout and naming the
    multi-threaded variant produces.

    Args:
        m3u8: URL of the m3u8 playlist.
        mp4: Name of the working directory (the output file name is reused
            as the directory name).
    """
    ts_in = mp4 + "/" + "ts"
    os.makedirs(ts_in, exist_ok=True)

    web_root = get_web_root(m3u8)
    urlheader = os.path.dirname(m3u8)
    m3u8_local = mp4 + "/index.m3u8"

    download(m3u8, m3u8_local)

    # Reuse the shared parser so blank lines and tags are skipped and
    # root-relative entries are resolved the same way as in the
    # multi-threaded path.
    for _index, line, ts, ts_url, local_ts_name in parse_m3u8(web_root, urlheader, m3u8_local):
        print("line[%d]=%s" % (line, ts))
        download(ts_url, ts_in + "/" + local_ts_name)


def ts_join(mp4, ffmpeg=None):
    """Merge the downloaded .ts segments of ``mp4`` into a single MP4 file.

    Segments are read from ``<mp4>/ts`` in file-name order, concatenated into
    a temporary ``all.ts`` and remuxed by ffmpeg into
    ``<mp4>/<base name of mp4>``. An existing output file is replaced. The
    process working directory is not changed.

    Args:
        mp4: Output file name; the same name is used for the working
            directory that contains the ``ts`` folder.
        ffmpeg: Path of the ffmpeg executable. When omitted, the Windows
            install path this script was written for is used if it exists,
            otherwise ``ffmpeg`` is looked up on PATH.

    Raises:
        FileNotFoundError: If the ``ts`` folder is missing, contains no
            segments, or the ffmpeg executable cannot be started.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    import subprocess

    if ffmpeg is None:
        # ffmpeg builds: https://ffmpeg.org/download.html (Windows builds by BtbN)
        bundled = "E:/ffmpeg_OK/ffmpeg-master-latest-win64-gpl/bin/ffmpeg.exe"
        ffmpeg = bundled if os.path.exists(bundled) else (shutil.which("ffmpeg") or bundled)

    out_dir = os.path.abspath(mp4)
    ts_dir = os.path.join(out_dir, "ts")
    result = os.path.join(out_dir, os.path.basename(mp4))
    merged = os.path.join(ts_dir, "all.ts")

    if os.path.exists(result):
        os.remove(result)

    # Sorted names give playlist order because the downloader zero-pads them;
    # a leftover all.ts from an aborted run must not be merged into itself.
    segments = sorted(
        entry for entry in os.listdir(ts_dir)
        if entry.lower().endswith(".ts") and entry != "all.ts"
    )
    if not segments:
        raise FileNotFoundError("no .ts segments found in " + ts_dir)

    try:
        # Portable replacement for "copy /b *.ts all.ts".
        with open(merged, "wb") as out:
            for entry in segments:
                with open(os.path.join(ts_dir, entry), "rb") as part:
                    shutil.copyfileobj(part, out)

        # An argument list (no shell) keeps names with spaces or shell
        # metacharacters intact.
        subprocess.run(
            [ffmpeg, "-i", merged, "-c", "copy", "-map", "0:v", "-map", "0:a",
             "-bsf:a", "aac_adtstoasc", result],
            check=True,
        )
    finally:
        # Remove the temporary file even when ffmpeg fails.
        if os.path.exists(merged):
            os.remove(merged)


def ffmpeg_i_c_copy(m3u8, mp4, ffmpeg=None):
    """Let ffmpeg download the playlist ``m3u8`` and copy it into an MP4 file.

    Runs the equivalent of ``ffmpeg -i <m3u8> -c copy <output>`` and writes
    the result to ``<mp4>/<base name of mp4>``. An existing output file is
    replaced. The process working directory is not changed.

    Args:
        m3u8: URL of the m3u8 playlist.
        mp4: Output file name; the same name is used for the directory the
            file is placed in (created if missing).
        ffmpeg: Path of the ffmpeg executable. When omitted, the Windows
            install path this script was written for is used if it exists,
            otherwise ``ffmpeg`` is looked up on PATH.

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be started.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    import shutil
    import subprocess

    if ffmpeg is None:
        bundled = "E:/ffmpeg_OK/ffmpeg-master-latest-win64-gpl/bin/ffmpeg.exe"
        ffmpeg = bundled if os.path.exists(bundled) else (shutil.which("ffmpeg") or bundled)

    out_dir = os.path.abspath(mp4)
    os.makedirs(out_dir, exist_ok=True)  # mkdir -p ./a/b/c

    result = os.path.join(out_dir, os.path.basename(mp4))
    if os.path.exists(result):
        os.remove(result)

    # Passing the URL as its own argument (no shell) keeps "&" and other
    # metacharacters in query strings from being interpreted by the shell.
    subprocess.run([ffmpeg, "-i", m3u8, "-c", "copy", result], check=True)


def down_m3u8(m3u8, mp4, thread_max=40):
    """Download every segment of the playlist ``m3u8`` and merge them.

    Args:
        m3u8: URL of the m3u8 playlist.
        mp4: Output file name, also used as the working directory name.
        thread_max: Thread limit passed on to the multi-threaded downloader.
    """
    m3u8_download_multi_thread(m3u8, mp4, thread_max)
    # m3u8_download_single_thread(m3u8, mp4) is the sequential alternative.
    ts_join(mp4)


def down_m3u8_2(m3u8, mp4):
    """Alternative entry point: hand the playlist URL to ffmpeg_i_c_copy."""
    ffmpeg_i_c_copy(m3u8=m3u8, mp4=mp4)


def test_1():
    """Run down_m3u8 against a sample playlist URL, writing to XXX.mp4."""
    playlist_url = "https://p2.bdstatic.com/rtmp.liveshow.lss-user.baidubce.com/live/stream_bduid_3041161111_7873701187/merged_1669738224764_293317_1160_30460.m3u8"
    output_name = "XXX.mp4"
    down_m3u8(playlist_url, output_name)



# Run the sample download only when this file is executed as a script.
if __name__ == "__main__":
    test_1()

posted @ 2022-11-30 02:42 LiuYanYGZ 阅读(1792) 评论(0) 收藏举报

刷新页面返回顶部

LiuYanYGZ

问号惊叹号

Python爬取m3u8视频（多线程）- OK好用，代码在文章尾

1.前言

2.分析网页

1.电影视频来源

2.分析m3u8加密目录

3.下载ts

1.下载ts分段视频

2.多线程下载ts视频

4.合并ts

cmd合并文件

5.总结

6.我的代码（全）

公告

LiuYanYGZ

问号 惊叹号

Python爬取m3u8视频（多线程）- OK好用，代码在文章尾

1.前言

2.分析网页

1.电影视频来源

2.分析m3u8加密目录

3.下载ts

1.下载ts分段视频

2.多线程下载ts视频

4.合并ts

cmd合并文件

5.总结

6.我的代码（全）

公告

问号惊叹号