

from threading import Thread,current_thread
import time

def task(n):
    """Print a start line and an end line tagged with the current thread's name.

    ``n`` is accepted so the function can be used with ``Thread(args=(n,))``,
    but the body does not read it.
    """
    thread_name = current_thread().name
    for suffix in ("is running", "is end"):
        print(f"{thread_name} {suffix}")

if __name__ == '__main__':
    # Demo of daemon vs. non-daemon threads.
    t1 = Thread(target=task, args=(3,))
    t2 = Thread(target=task, args=(5,))
    t3 = Thread(target=task, args=(100,))
    # Daemon thread: it does not keep the process alive once the main thread
    # and all non-daemon threads are done. Must be set before start().
    t3.daemon = True

    # The threads were previously constructed but never started, so `task`
    # never ran at all.
    for worker in (t1, t2, t3):
        worker.start()

    print("主")  # printed by the main thread right after launching the workers


import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os

def get_html():
    """Fetch listing pages 1-3 and download every image they reference.

    For each page the image ``src`` attributes are extracted via XPath,
    resolved against the page URL, and handed to ``download_img`` on a
    10-worker thread pool. The pool is drained before the next page is
    fetched.
    """
    # Loop-invariant, so built once (the literal was also missing its
    # closing brace).
    headers = {
        "Referer": 'https://www.baidu.com/link?',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    for page in range(1, 4):
        page_url = f'https://www.woyaogexing.com/tupian/index_{page}.html'
        response = requests.get(page_url, headers=headers)
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        srcs = tree.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        # Distinct names: the original reused `url` and `i` for the page,
        # the comprehension variable and the inner loop variable.
        img_urls = [urljoin(page_url, src) for src in srcs]
        with ThreadPoolExecutor(max_workers=10) as pool:
            for img_url in img_urls:
                pool.submit(download_img, img_url)

def download_img(url):
    """Download one image and store it as ``tutu/<last URL path segment>``.

    Args:
        url: Absolute URL of the image.
    """
    headers = {
        "Referer": 'https://www.baidu.com/link?',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # File name: last path segment of the URL.
    file_name = url.split('/')[-1]
    directory = 'tutu'
    # exist_ok avoids the check-then-create race when several pool workers
    # reach this line at the same time.
    os.makedirs(directory, exist_ok=True)
    with open(f'tutu/{file_name}', 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':
    # The guard had no body; run the image crawl defined above.
    get_html()


# 爬取电影排行
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
import time

def if_xpath(y):
    """Concatenate the string fragments in ``y`` and trim surrounding whitespace."""
    return "".join(y).strip()

def get_year(year):
    """Scrape one year's box-office table and save it under ``nf/``.

    Each table row after the first contributes one CSV record holding the
    text of its first four cells, each cleaned with ``if_xpath``.

    Args:
        year: Year appended to the site URL and used as the file name.
    """
    import csv  # local import: only this function needs it

    url = f"http://www.boxofficecn.com/boxoffice{year}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
    }
    # Fetch the page source.
    resp = requests.get(url, headers=headers)

    # Parse; the first <tr> is skipped (presumably the header row).
    x_path = etree.HTML(resp.text)
    rows = x_path.xpath("//table/tbody/tr")[1:]

    os.makedirs("nf", exist_ok=True)
    # The file used to be opened but never written to or closed, and the loop
    # overwrote the `year` parameter with a cell value.
    # NOTE(review): ".cvs" looks like a typo for ".csv"; kept so existing
    # consumers of these files are not broken.
    with open(f"nf/{year}.cvs", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        for item in rows:
            writer.writerow(
                [if_xpath(item.xpath(f"./td[{col}]//text()")) for col in range(1, 5)]
            )

if __name__ == '__main__':
    # Threaded run over 1994-2022, timed. A sequential baseline would simply
    # call get_year(i) for each year in the same range.
    started_at = time.time()
    with ThreadPoolExecutor(16) as pool:
        for release_year in range(1994, 2023):
            pool.submit(get_year, release_year)
    finished_at = time.time()
    print(finished_at - started_at)


import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os

class ImgDownloader:
    """Download the images found on paginated listing pages into ``tutu/``."""

    def __init__(self, baseurl, num_pages=1):
        """Store the crawl configuration.

        Args:
            baseurl: Listing root; page URLs are ``{baseurl}/index_{n}.html``.
            num_pages: Highest page number to visit.
        """
        self.url = baseurl
        self.num_pages = num_pages
        self.headers = {
            "Referer": 'https://www.baidu.com/link?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        }

    def get_html(self, url):
        """Return the UTF-8 decoded body of ``url``.

        Raises:
            requests.HTTPError: If the response status indicates failure.
        """
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text

    def parse_img_urls(self, html):
        """Extract image ``src`` values from ``html``, resolved against the base URL."""
        etree_html = etree.HTML(html)
        img_urls = etree_html.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        return [urljoin(self.url, img_url) for img_url in img_urls]

    def download_img(self, img_url):
        """Download ``img_url`` to ``tutu/<basename of the URL>``.

        Raises:
            requests.HTTPError: If the response status indicates failure.
        """
        file_name = os.path.basename(img_url)  # last URL segment is the file name
        directory = 'tutu'
        # exist_ok avoids a check-then-create race between pool workers.
        os.makedirs(directory, exist_ok=True)
        response = requests.get(img_url, headers=self.headers)
        response.raise_for_status()
        with open(os.path.join(directory, file_name), 'wb') as f:
            f.write(response.content)
        print(f'下载完成: {file_name}')

    def download_all_imgs(self):
        """Visit pages ``2..num_pages`` and download every image on each.

        NOTE(review): iteration starts at page 2, so nothing is fetched when
        ``num_pages`` is 1 — presumably the first page lives at a different
        URL; confirm against the site.
        """
        for page in range(2, self.num_pages + 1):
            page_url = f"{self.url}/index_{page}.html"  # assumed pagination scheme
            print(f"正在下载第 {page} 页的图片...")
            html = self.get_html(page_url)
            img_urls = self.parse_img_urls(html)
            with ThreadPoolExecutor(max_workers=10) as executor:
                for img_url in img_urls:
                    executor.submit(self.download_img, img_url)

if __name__ == '__main__':
    downloader = ImgDownloader(baseurl='https://www.woyaogexing.com/tupian', num_pages=10)
    # The downloader used to be constructed and then discarded.
    downloader.download_all_imgs()


import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from multiprocessing import Queue  # 队列
from multiprocessing import Process  # 进程
import time

def get_img_url(q):
    """Producer: crawl list pages 1-2 and push every image URL onto ``q``.

    Each article link found on a list page is fetched in turn and the image
    ``src`` values on it are queued. When crawling ends (normally or through
    an exception) the sentinel string ``'滚蛋吧.没了'`` is queued so the
    consumer knows to stop.

    Args:
        q: Queue shared with the downloading process.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
    }
    try:
        for item in range(1, 3):
            url = f"https://www.pkdoutu.com/article/list/?page={item}"
            resp = requests.get(url, headers=headers)
            # Parse the list page for article links.
            x_path = etree.HTML(resp.text)
            img_urls = x_path.xpath("//div[@class='col-sm-9 center-wrap']/a/@href")
            for img_url in img_urls:
                # Follow the link to the article page.
                resps = requests.get(img_url, headers=headers)
                x_paths = etree.HTML(resps.text)
                img_urlss = x_paths.xpath("//li[@class='list-group-item']//a/img/@src")
                for imgs in img_urlss:
                    q.put(imgs)
    finally:
        # Without the sentinel the consumer blocks on q.get() forever.
        q.put('滚蛋吧.没了')

# 第二个进程. 只负责下载图片
def img_process(q):
    """Consumer: take URLs from ``q`` and download them on a thread pool.

    Runs until the sentinel string ``'滚蛋吧.没了'`` is received; leaving the
    ``with`` block then waits for all submitted downloads to finish.

    Args:
        q: Queue yielding image URLs followed by the sentinel.
    """
    with ThreadPoolExecutor(10) as t:
        while True:  # number of items is unknown, keep pulling
            imgs = q.get()  # blocks until an item is available
            if imgs == '滚蛋吧.没了':
                break  # the sentinel check used to have no body
            # Fan the download out to a worker thread inside this process.
            t.submit(download_img, imgs)

def download_img(url):
    """Download ``url`` and save it as ``tu/<last URL path segment>``."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
    }
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    # Ensure the target directory exists; exist_ok keeps this safe when
    # several worker threads get here together.
    os.makedirs("tu", exist_ok=True)
    with open(f"tu/{file_name}", "wb") as w:
        w.write(resp.content)

if __name__ == '__main__':
    # Producer/consumer pipeline across two processes sharing one queue.
    s1 = time.time()
    q = Queue()  # created in the main process, shared with both children
    p1 = Process(target=get_img_url, args=(q,))  # producer: collects URLs
    p2 = Process(target=img_process, args=(q,))  # consumer: downloads images

    # join() on a process that was never started raises; start both first.
    p1.start()
    p2.start()

    p1.join()  # wait for the producer to finish
    p2.join()  # wait for the consumer to finish

    s2 = time.time()
    print(s2 - s1)



import requests
from lxml import etree
import asyncio
import aiohttp
import aiofiles
import os

# 1. 拿到主页面的源代码 (不需要异步)
# 2. 拿到页面源代码之后. 需要解析出 <卷名>, <章节, href>
# 3. 协程下载txt

# Request headers shared by every HTTP call in this script (the literal was
# missing its closing brace).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
}

def get_chaptor_info(url):
    """Fetch the index page and list every chapter with its volume.

    Args:
        url: Address of the book's table-of-contents page.

    Returns:
        A list of dicts with keys ``chapter_name``, ``chapter_url`` and
        ``juan_name`` (volume name), one per table cell in page order.
        Cells without a link yield an empty ``chapter_url``.
    """
    resp = requests.get(url, headers=headers)
    resp.encoding = "UTF-8"
    tree = etree.HTML(resp.text)

    result = []
    # Each div.mulu is one volume.
    for div in tree.xpath("//div[@class='mulu']"):
        trs = div.xpath(".//table/tr")
        if not trs:
            continue  # volume block without a table: nothing to collect
        # The first row holds the volume title; ":" is replaced because the
        # name is later used as a directory name.
        juan_name = "".join(trs[0].xpath(".//a/text()")).strip().replace(":", "_")

        for tr in trs[1:]:  # remaining rows hold the chapters
            for td in tr.xpath("./td"):
                txt = "".join(td.xpath(".//text()")).replace(" ", "").strip()
                href = "".join(td.xpath(".//@href"))
                # The dict used to be left unclosed and never stored, so the
                # function could only ever return an empty list.
                result.append({
                    "chapter_name": txt,
                    "chapter_url": href,
                    "juan_name": juan_name,
                })
    return result

async def download_one(url, file_path):
    """Fetch one chapter page and write its paragraph text to ``file_path``.

    The text nodes under ``div.content//p`` are joined, stripped of
    newlines, carriage returns and spaces, and written as UTF-8.
    """
    async with aiohttp.ClientSession() as session, \
            session.get(url, headers=headers) as resp:
        html = await resp.text(encoding="utf-8")
        # Pull the article body out of the page.
        fragments = etree.HTML(html).xpath("//div[@class='content']//p//text()")
        text = "".join(fragments)
        for unwanted in ("\n", "\r", " "):
            text = text.replace(unwanted, "")
        text = text.strip()

        # Write the cleaned text to disk.
        async with aiofiles.open(file_path, mode="w", encoding="utf-8") as out:
            await out.write(text)

    print("恭喜你。 下載了一篇文章!", file_path)

async def download_chapter(chaptor_list):
    """Download every chapter in ``chaptor_list`` concurrently.

    Args:
        chaptor_list: Iterable of dicts with keys ``juan_name`` (directory),
            ``chapter_name`` (file stem) and ``chapter_url``.

    Entries with an empty ``chapter_url`` are skipped. Returns ``None``; an
    empty input is a no-op.
    """
    tasks = []
    for chaptor in chaptor_list:
        juan = chaptor['juan_name']  # directory name
        name = chaptor['chapter_name']  # file name without extension
        url = chaptor['chapter_url']  # page to download

        if not url:
            continue  # cell without a link: nothing to fetch

        os.makedirs(juan, exist_ok=True)

        file_path = f"{juan}/{name}.txt"
        # Tasks used to be created but never collected, so asyncio.wait()
        # always received an empty list. The test-only `break` that stopped
        # after the first chapter is removed as well.
        tasks.append(asyncio.create_task(download_one(url, file_path)))

    if tasks:  # asyncio.wait() raises ValueError on an empty collection
        await asyncio.wait(tasks)

def main():
    """Collect the chapter list synchronously, then download it with asyncio."""
    url = "https://www.mingchaonaxieshier.com/"
    chaptor_list = get_chaptor_info(url)
    # The coroutine used to be never scheduled; run it to completion here.
    asyncio.run(download_chapter(chaptor_list))

if __name__ == '__main__':
    # The guard had no body; invoke the entry point defined above.
    main()


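# Overall steps => Xiuren gallery scraper (steps 1-3 are not present in this file)
# 4. Append to an empty dict and return it as the function's value.
# 5. In the async code, loop over the dict's keys, check whether the directory
#    name taken from the dict exists, and create it if it does not.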

import aiohttp
import requests
from lxml import etree
from urllib.parse import urljoin
import os
import aiofiles
import asyncio
import random
import time

# Pool of User-Agent strings to pick from (the list literal was missing its
# closing bracket).
ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv: Gecko/20110303 Firefox/3.6.15",
]

# Headers sent with every request. The User-Agent is chosen once, when the
# module is loaded, not per request.
header = {
    "User-Agent": random.choice(ua_list),
}

# Site root; also used as the base when resolving relative links.
url = "https://www.xiurenb.com/"

def img_url(url):
    """Scrape the home page and list every album link with its folder name.

    Args:
        url: Home page address; also the base for resolving relative links.

    Returns:
        A list of dicts with keys ``dir_url`` (absolute album URL) and
        ``dir_name`` (the title with whitespace removed, keeping only the
        part after the last ``]``).
    """
    restul = []
    resp = requests.get(url, headers=header)
    resp.encoding = 'utf-8'

    # Parse the home page.
    page_source = etree.HTML(resp.text)
    for u in page_source.xpath("//ul/li[@class='i_list list_n2']/a"):
        hrefs = u.xpath("./@href")
        titles = u.xpath("./@title")
        if not hrefs or not titles:
            continue  # anchor lacks a link or a title: nothing usable
        new_url = urljoin(url, hrefs[0])
        folder_names = (
            titles[0].replace("\r", "").replace("\n", "").replace(" ", "").split("]")[-1]
        )
        # The dict used to be left unclosed and never stored in the result.
        restul.append({
            "dir_url": new_url,
            "dir_name": folder_names,
        })
    return restul

async def download_one(urls, name):
    """Download every image of one album into the directory ``name``.

    The album page at ``urls`` is fetched, its pagination links are followed
    (first and last anchors of the pager are skipped), and each image found
    on those sub-pages is saved as ``{name}/<last URL path segment>``.

    The whole procedure is attempted up to 5 times; it stops after the first
    attempt that finishes without raising.

    Args:
        urls: Absolute URL of the album page.
        name: Existing directory to write the images into.
    """
    for _attempt in range(5):
        # The `try:` line was missing, leaving a dangling `except`.
        try:
            # One session is reused for the album page, its sub-pages and
            # the images instead of opening a new session per request.
            async with aiohttp.ClientSession() as session:
                async with session.get(urls, headers=header) as resp:
                    page_source = await resp.text(encoding="utf-8")
                num_list = etree.HTML(page_source)
                num_list_url = num_list.xpath(
                    "//div[@class='main_inner']//div[@class='page'][1]/a"
                )[1:-1]
                for nums in num_list_url:
                    href = nums.xpath("./@href")
                    if not href:
                        # Previously fell through with a stale or unbound
                        # `href_list`.
                        continue
                    # Pagination URL.
                    new_href = urljoin(url, href[0])
                    async with session.get(new_href, headers=header) as resps:
                        jpg_source = await resps.text(encoding='utf-8')
                    subpage_page = etree.HTML(jpg_source)
                    for src_url in subpage_page.xpath("//div[@class='main_left']//p/img/@src"):
                        # Image URL and target file name.
                        new_src = urljoin(url, src_url)
                        file_name = new_src.split("/")[-1]
                        async with session.get(new_src, headers=header) as resp_list:
                            jpg_urls = await resp_list.content.read()
                        async with aiofiles.open(f"{name}/{file_name}", mode="wb") as f:
                            await f.write(jpg_urls)
                        print("下载图片完成", new_src)
            return  # success: do not download the album again
        except Exception as e:  # retry boundary: report and try again
            print("请求超时错误", e)

async def task(char_set):
    """Create one download task per album and wait for all of them.

    Args:
        char_set: Iterable of dicts with keys ``dir_name`` (target
            directory) and ``dir_url`` (album URL).

    Returns ``None``; an empty input is a no-op.
    """
    tasks = []
    for i in char_set:
        name = i['dir_name']
        urls = i['dir_url']

        # Make sure the album directory exists (the `if` had no body).
        os.makedirs(name, exist_ok=True)

        # Schedule the download and remember it; tasks were never appended
        # before, so asyncio.wait() always received an empty list.
        tasks.append(asyncio.create_task(download_one(urls, name)))

    if tasks:  # asyncio.wait() raises ValueError on an empty collection
        await asyncio.wait(tasks)

def main():
    """Collect the album list, then download all albums with asyncio."""
    char_set = img_url(url)

    # The event loop used to be obtained but nothing was run on it.
    # asyncio.run() creates and closes the loop itself.
    asyncio.run(task(char_set))

if __name__ == '__main__':
    start = time.time()
    main()  # was missing: the timer measured an empty interval
    stop = time.time()
    print(stop - start)


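# Overall steps => Wangba Dianying (wbdy.tv) movie download
# 1. Find a way to locate the M3U8 file.
# 2. Decide (manually) whether a second-level M3U8 has to be downloaded.
# 3. Extract the download paths of the .ts files.
# 4. Download them.
# 5. Decide whether decryption is needed.
# 6. If decryption is needed, fetch the key.
# 7. Decrypt.
# 8. Merge all .ts files in the order given by the M3U8 => MP4.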
import requests
from lxml import etree
import re
from urllib.parse import urljoin
import os  # 执行cmd/控制台上的命令

import asyncio
import aiohttp
import aiofiles

from Crypto.Cipher import AES  # pip install pycryptodome

# Request headers shared by every HTTP call in this script (the literal was
# missing its closing brace).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
}

def get_iframe_src(url):
    """Return the ``src`` attribute of the first ``<iframe>`` on the page at ``url``.

    Raises:
        IndexError: If the page contains no iframe with a ``src``.
    """
    page_text = requests.get(url, headers=headers).text
    iframe_sources = etree.HTML(page_text).xpath("//iframe/@src")
    return iframe_sources[0]

def get_m3u8_url(url):
    """Return the value of the first ``url: "..."`` entry in the page at ``url``.

    Raises:
        AttributeError: If the page contains no such entry.
    """
    body = requests.get(url, headers=headers).text
    found = re.search(r'url: "(?P<m3u8>.*?)"', body, re.S)
    return found.group("m3u8")

def download_m3u8(url):
    """Download the first-level playlist and the second-level one it points to.

    The response for ``url`` is saved as ``first.m3u8``. The first
    non-comment, non-blank line of that file is resolved against ``url``,
    fetched, and saved as ``second.m3u8``.

    Args:
        url: Address of the first-level M3U8 playlist.
    """
    resp = requests.get(url, headers=headers)
    # Both `with` blocks that save a playlist used to have no write in them.
    with open("first.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.text)

    with open("first.m3u8", mode='r', encoding="utf-8") as f2:
        for line in f2:
            if line.startswith("#"):  # playlist tag or comment
                continue
            # Strip the trailing newline, otherwise it ends up in the URL.
            line = line.strip()
            if not line:
                continue
            # This line is the (possibly relative) second-level playlist.
            line = urljoin(url, line)
            resp = requests.get(line, headers=headers)
            with open("second.m3u8", mode="w", encoding="utf-8") as f3:
                f3.write(resp.text)
            break  # only the first entry is followed

async def download_one(url, sem):
    """Download one ``.ts`` segment into ``./解密前/``, retrying on failure.

    Args:
        url: Absolute URL of the segment; its last path segment is the file name.
        sem: Semaphore limiting how many downloads run at once.

    Up to 10 attempts are made; the loop stops after the first success.
    """
    async with sem:  # bound the number of concurrent downloads
        file_name = url.split("/")[-1]
        file_path = "./解密前/"+file_name
        os.makedirs("./解密前", exist_ok=True)
        print(file_name, "开始工作了!")
        for _attempt in range(10):
            # The `try:` line was missing, leaving a dangling `except`.
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=headers) as resp:
                        content = await resp.content.read()
                        async with aiofiles.open(file_path, mode="wb") as f:
                            await f.write(content)
                print(file_name, "下载完成!")
                break  # success: stop retrying
            except Exception as e:  # retry boundary: report and try again
                print(file_name, "出错了, 马上重试", e)

async def download_all_videos():
    """Download every segment listed in ``second.m3u8`` concurrently.

    Lines starting with ``#`` and blank lines are ignored; every other line
    is treated as a segment URL. Returns ``None``; a playlist without
    segments is a no-op.
    """
    # Caps the number of simultaneous downloads; some sites need far fewer.
    sem = asyncio.Semaphore(100)
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue  # the `if` used to have no body
            line = line.strip()  # drop the newline so it is not part of the URL
            if not line:
                continue
            # Tasks used to be created but never collected.
            tasks.append(asyncio.create_task(download_one(line, sem)))
    if tasks:  # asyncio.wait() raises ValueError on an empty collection
        await asyncio.wait(tasks)

def get_key():
    """Fetch the decryption key referenced by ``second.m3u8``.

    The first ``URI="..."`` value in the playlist is requested and the raw
    response bytes are returned, ready to pass to the cipher.

    Raises:
        AttributeError: If the playlist contains no ``URI="..."`` entry.
    """
    with open("second.m3u8", mode="r", encoding="utf-8") as playlist:
        playlist_text = playlist.read()
        uri_match = re.search(r'URI="(?P<key_url>.*?)"', playlist_text)
        key_location = uri_match.group("key_url")
        key_response = requests.get(key_location, headers=headers)
        return key_response.content

async def desc_one(file_path, key):
    """Decrypt one downloaded segment into ``./解密后/`` under the same file name.

    The file is decrypted in one piece with AES-CBC, using ``key`` and an IV
    of sixteen ASCII ``0`` characters.

    Args:
        file_path: Path of the encrypted segment.
        key: AES key bytes.
    """
    segment_name = file_path.rsplit("/", 1)[-1]
    new_file_path = "./解密后/" + segment_name
    async with aiofiles.open(file_path, mode="rb") as encrypted:
        async with aiofiles.open(new_file_path, mode="wb") as decrypted:
            ciphertext = await encrypted.read()
            # A fresh cipher object per file.
            cipher = AES.new(key=key, mode=AES.MODE_CBC, IV=b"0000000000000000")
            plaintext = cipher.decrypt(ciphertext)
            await decrypted.write(plaintext)
    print(new_file_path, "解密成功")

# 解密的协程逻辑
# 读M3U8文件. 拿到文件名称和路径
# 每个ts文件一个任务
# 在每个任务中. 解密即可
async def desc_all(key):
    """Decrypt every segment listed in ``second.m3u8`` concurrently.

    For each non-comment, non-blank playlist line, the file of the same name
    under ``./解密前/`` is passed to ``desc_one``.

    Args:
        key: AES key bytes forwarded to ``desc_one``.

    Returns ``None``; a playlist without segments is a no-op.
    """
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue  # the `if` used to have no body
            line = line.strip()
            if not line:
                continue
            file_name = line.split("/")[-1]
            file_path = "./解密前/"+file_name
            # Tasks used to be created but never collected.
            tasks.append(asyncio.create_task(desc_one(file_path, key)))
    if tasks:  # asyncio.wait() raises ValueError on an empty collection
        await asyncio.wait(tasks)

def merge():
    """Concatenate the decrypted segments into ``春夏秋冬又一春.mp4``.

    Segment order comes from ``second.m3u8``. Merging happens in two passes
    inside ``./解密后``: segments are first joined into numbered
    intermediate files (``1.ts``, ``2.ts``, ...) so that no single command
    line gets too long, then those are joined into the final file.

    Uses the Windows ``copy /b a+b+c out`` command; on Linux/macOS the
    equivalent would be ``cat a b c > out``.

    NOTE(review): file names from the playlist are interpolated into a shell
    command unescaped, and an intermediate ``N.ts`` would clash with a
    segment that has the same name — only use with trusted playlists.
    """
    # Collect the segment file names in playlist order.
    file_list = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue  # the `if` used to have no body
            line = line.strip()
            if not line:
                continue
            file_list.append(line.split("/")[-1])  # names were never stored

    os.chdir("./解密后")  # work inside the segment directory
    try:
        # First pass: merge in batches.
        n = 1
        temp = []
        for i, file_name in enumerate(file_list):
            temp.append(file_name)  # batch was never filled before
            if i != 0 and i % 20 == 0:  # first batch holds 21 files, later ones 20
                cmd = f"copy /b {'+'.join(temp)} {n}.ts"
                # Reading the output waits for the command to finish;
                # popen alone returns immediately.
                with os.popen(cmd) as r:
                    r.read()
                temp = []
                n = n + 1
        # Merge whatever is left over, if anything.
        if temp:
            cmd = f"copy /b {'+'.join(temp)} {n}.ts"
            with os.popen(cmd) as r:
                r.read()
            n = n + 1

        # Second pass: 1.ts + 2.ts + ... => final file.
        last_temp = [f"{i}.ts" for i in range(1, n)]
        if last_temp:
            cmd = f"copy /b {'+'.join(last_temp)} 春夏秋冬又一春.mp4"
            with os.popen(cmd) as r:
                r.read()
    finally:
        # Always restore the working directory, even if a step fails.
        os.chdir("../")

def main():
    """Run the pipeline: locate playlist, download, decrypt, merge."""
    url = "http://www.wbdy.tv/play/63690_1_1.html"
    # 1. Get the iframe's src attribute from the play page.
    src = get_iframe_src(url)
    # 2. Request the iframe page and extract the M3U8 address.
    src = urljoin(url, src)
    m3u8_url = get_m3u8_url(src)
    # 3. Download the playlists (first.m3u8 / second.m3u8).
    download_m3u8(m3u8_url)
    # 4. Download all segments concurrently. asyncio.run() creates and
    #    closes the event loop itself; previously a loop was obtained but
    #    nothing was run on it.
    asyncio.run(download_all_videos())
    # 5. Fetch the key.
    key = get_key()
    # 6. Decrypt all segments.
    asyncio.run(desc_all(key))
    # 7. Merge into the final file.
    merge()

if __name__ == '__main__':
