使用python爬取网站电影

获取电影的 .m3u8 文件，使用 Python 协程爬取电影

1. 找到需要爬取的电影 url，按 F12 => Network => Fetch/XHR，找到 index.m3u8 文件，例如：
https://vip.ffzy-online2.com/20230416/32868_9cb1983d/index.m3u8

2.使用以下脚本爬取整部电影

import requests
from lxml import etree
from urllib.parse import urljoin
import re
import asyncio
import aiohttp
import aiofiles
from Crypto.Cipher import AES  # requires pycryptodome: pip install pycryptodome
import os,time

async def download_one(ts_url, session, sem):
    """Download one segment into ``video_1/``, trying up to three times.

    The file is named after the last ``/``-separated component of *ts_url*.
    Failures are printed rather than raised; after three failed attempts the
    coroutine simply returns.

    Args:
        ts_url: URL of the segment to fetch.
        session: Object whose ``get(url)`` returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this file).
        sem: Semaphore limiting how many downloads run at once.
    """
    file_name = ts_url.split("/")[-1]
    async with sem:
        for _ in range(3):
            try:
                async with session.get(ts_url) as resp:
                    # Without this check a 404/5xx error page would be saved
                    # as segment data and reported as a success.
                    resp.raise_for_status()
                    content = await resp.content.read()
                async with aiofiles.open(f"video_1/{file_name}", mode="wb") as f:
                    await f.write(content)
                print(ts_url, "下载成功")
                break
            except Exception as e:
                # Best effort: report and retry instead of aborting the batch.
                print(e)
                print(ts_url, "下载失败")
async def download_all_ts():
    """Download every segment listed in ``m3u8.txt`` concurrently.

    Lines that are blank or start with ``#`` are ignored; every other line is
    passed to ``download_one`` as a URL. At most 200 downloads hold the
    semaphore at once. Returns ``None``; does nothing when the playlist lists
    no segments.
    """
    ts_urls = []
    with open("m3u8.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            # Strip first so blank lines are skipped instead of being
            # scheduled as a download of the empty URL.
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            ts_urls.append(line)
    if not ts_urls:
        return

    # download_one writes into this directory; create it on a fresh run.
    os.makedirs("video_1", exist_ok=True)
    sem = asyncio.Semaphore(200)
    async with aiohttp.ClientSession() as session:
        # download_one handles its own errors, so gather will not raise here.
        await asyncio.gather(*[download_one(u, session, sem) for u in ts_urls])
def get_key():
    """Fetch the AES-128 key referenced by ``m3u8.txt``.

    Scans the playlist line by line for the first
    ``#EXT-X-KEY:METHOD=AES-128,URI="..."`` tag and downloads the URI.

    Returns:
        The raw key bytes, or ``None`` when the playlist has no such tag.

    Raises:
        requests.RequestException: If the key request fails, times out, or
            returns an HTTP error status.
    """
    key_pattern = re.compile(r'#EXT-X-KEY:METHOD=AES-128,URI="(?P<key_url>.*?)"', re.S)
    key_url = None
    with open("m3u8.txt", mode="r", encoding='utf-8') as f:
        for line in f:
            found = key_pattern.search(line)
            if found:
                key_url = found.group("key_url")
                break
    if key_url is None:
        return None

    # The timeout keeps a stalled server from hanging the script forever;
    # the status check keeps an error page from being used as the key.
    with requests.get(key_url, timeout=30) as resp:
        resp.raise_for_status()
        return resp.content
async def desc_one_ts(file_name, key):
    """Decrypt ``./video_1/<file_name>`` into ``./video_3/<file_name>``.

    Args:
        file_name: Name of the segment file, present in ``./video_1``.
        key: AES key bytes, or ``None`` for a playlist that declared no key,
            in which case the segment is copied through unchanged.
    """
    async with aiofiles.open(f"./video_1/{file_name}", mode="rb") as f1:
        content = await f1.read()

    if key is None:
        # No key means nothing to decrypt; AES.new(key=None) would raise.
        desc_content = content
    else:
        # NOTE(review): the IV is sixteen ASCII '0' characters (0x30), not
        # sixteen zero bytes — confirm this matches what the stream expects.
        aes = AES.new(key=key, mode=AES.MODE_CBC, IV=b'0000000000000000')
        desc_content = aes.decrypt(content)

    # Open the output only once the data is ready, so a failed decrypt does
    # not leave an empty file behind.
    async with aiofiles.open(f"./video_3/{file_name}", mode="wb") as f2:
        await f2.write(desc_content)
    print(file_name, "解密成功")
    

async def desc_all_ts(key):
    """Decrypt every segment listed in ``m3u8.txt`` via ``desc_one_ts``.

    Lines that are blank or start with ``#`` are ignored. Each remaining
    line's last ``/``-separated component is the file name to decrypt.
    Per-file failures are printed and do not stop the other files.

    Args:
        key: Passed unchanged to ``desc_one_ts`` for each file.
    """
    file_names = []
    with open("m3u8.txt", mode="r", encoding='utf-8') as f:
        for line in f:
            # Strip first so blank lines do not become an empty file name.
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            file_names.append(line.split("/")[-1])
    if not file_names:
        return

    # desc_one_ts writes into this directory; create it on a fresh run.
    os.makedirs("video_3", exist_ok=True)
    results = await asyncio.gather(
        *[desc_one_ts(name, key) for name in file_names],
        return_exceptions=True,
    )
    # Report failures explicitly instead of leaving task exceptions unread.
    for name, outcome in zip(file_names, results):
        if isinstance(outcome, BaseException):
            print(outcome)
            print(name, "解密失败")

def merge():
    """Concatenate the segments in ``video_3/`` into ``video_3/mm.mp4``.

    Segment order comes from ``m3u8.txt``: blank lines and lines starting
    with ``#`` are ignored, and each remaining line's last ``/``-separated
    component is the segment file name. The files are joined byte-for-byte
    in Python, so no shell command is needed, the working directory is never
    changed, and no intermediate ``<n>.ts`` batch files are produced.
    Segments missing from ``video_3/`` are reported and skipped.
    """
    import shutil  # local import: this is the only function that needs it

    print("记载m3u8")
    file_list = []
    with open("m3u8.txt", mode="r", encoding='utf-8') as f:
        for line in f:
            # Strip first so blank lines do not become an empty file name.
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            file_list.append(line.split("/")[-1])
    print("记载m3u8，成功")

    out_path = os.path.join("video_3", "mm.mp4")
    with open(out_path, mode="wb") as merged:
        for file_name in file_list:
            seg_path = os.path.join("video_3", file_name)
            try:
                with open(seg_path, mode="rb") as seg:
                    shutil.copyfileobj(seg, merged)
            except FileNotFoundError:
                # Best effort: one missing segment should not abort the merge.
                print(file_name, "缺失，已跳过")


def main(url="https://qq.1080tg.com/20211108/867XNBBq/hls/index.m3u8"):
    """Download, decrypt and merge the video described by an m3u8 playlist.

    Saves the playlist as ``m3u8.txt``, then runs ``download_all_ts``,
    ``get_key``, ``desc_all_ts`` and ``merge`` in that order.

    Args:
        url: URL of the ``index.m3u8`` playlist. Replace the default with the
            playlist URL found in the browser's network panel.

    Raises:
        requests.RequestException: If the playlist request fails, times out,
            or returns an HTTP error status.
    """
    heads = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"
    }
    # Fail fast on an HTTP error so an error page is never saved and parsed
    # as a playlist; the timeout keeps a stalled server from hanging the run.
    with requests.get(url, headers=heads, timeout=30) as resp:
        resp.raise_for_status()
        playlist = resp.content
    with open("m3u8.txt", mode="wb") as f:
        f.write(playlist)
    print("m3u8文件下载成功")

    asyncio.run(download_all_ts())
    key = get_key()
    asyncio.run(desc_all_ts(key))
    merge()


# Run the full pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()