Multithreading, Thread Pools, Multiprocessing, and Coroutines

Multithreading

from threading import Thread,current_thread
import time

def task(n):
    print("%s is running" %current_thread().name)
    time.sleep(n)
    print("%s is end" %current_thread().name)


if __name__ == '__main__':
    t1 = Thread(target=task,args=(3,))
    t2 = Thread(target=task,args=(5,))
    t3 = Thread(target=task,args=(100,))
    t3.daemon = True  # daemon thread: it is killed as soon as all non-daemon threads have finished

    t1.start()
    t2.start()
    t3.start()
    print("主") # 主线程5秒钟结束****

Multithreading example

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os


def get_html():
    for i in range(1, 4):
        url = f'https://www.woyaogexing.com/tupian/index_{i}.html'
        headers = {
            "Referer": 'https://www.baidu.com/link?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        res = response.text
        tree = etree.HTML(res)
        result = tree.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        url_list = [urljoin(url, i) for i in result]
        with ThreadPoolExecutor(max_workers=10) as t:
            for img_url in url_list:  # separate name, to avoid shadowing the page url above
                t.submit(download_img, img_url)


def download_img(url):
    headers = {
        "Referer": 'https://www.baidu.com/link?',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # file name: last part of the URL
    file_name = url.split('/')[-1]
    directory = 'tutu'
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(f'{directory}/{file_name}', 'wb') as f:
        f.write(response.content)
    print('one image downloaded')


if __name__ == '__main__':
    get_html()
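
If you also want to know whether each download actually succeeded, collect the Future objects returned by submit and iterate them with as_completed; any exception raised inside download_img is re-raised when result() is called. A minimal sketch reusing download_img from above:

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_many(img_urls):
    # img_urls: the list built with urljoin() in get_html()
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(download_img, u) for u in img_urls]
        for fut in as_completed(futures):
            try:
                fut.result()  # re-raises anything download_img raised
            except Exception as e:
                print("download failed:", e)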

Thread pool

# crawl box-office rankings
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
import os
import time


def if_xpath(y):
    s = "".join(y)
    return s.strip()


def get_year(year):
    os.makedirs("nf", exist_ok=True)  # make sure the output folder exists
    f = open(f"nf/{year}.csv", "w", encoding="utf-8")
    url = f"http://www.boxofficecn.com/boxoffice{year}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
    }
    # fetch the page source
    resp = requests.get(url, headers=headers)
    resp_text = resp.text

    # parse
    x_path = etree.HTML(resp_text)  # type: etree._Element
    e_path = x_path.xpath("//table/tbody/tr")[1:]
    for item in e_path:
        num = item.xpath("./td[1]//text()")
        year = item.xpath("./td[2]//text()")
        name = item.xpath("./td[3]//text()")
        money = item.xpath("./td[4]//text()")

        num = if_xpath(num)
        year = if_xpath(year)
        name = if_xpath(name)
        money = if_xpath(money)

        # print(num, year, name, money)
        f.write(f"{num},{year},{name},{money}\n")


if __name__ == '__main__':
    # start = time.time()
    # for i in range(1994, 2023):
    #     get_year(i)
    # stop = time.time()
    # print(stop - start)
    start = time.time()
    with ThreadPoolExecutor(16) as t:
        for i in range(1994, 2023):
            t.submit(get_year, i)
    stop = time.time()
    print(stop - start)
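
Since each task here takes a single argument and the return value is not used, executor.map is an equally idiomatic alternative to submit in a loop; a minimal sketch using the same get_year:

from concurrent.futures import ThreadPoolExecutor

def run_with_map():
    with ThreadPoolExecutor(16) as t:
        # map() schedules get_year(1994) ... get_year(2022); wrapping it in list()
        # forces evaluation, so any exception raised inside a worker surfaces here
        list(t.map(get_year, range(1994, 2023)))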

Thread pool, object-oriented example

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os


class ImgDownloader:
    def __init__(self, baseurl, num_pages=1):
        self.url = baseurl
        self.num_pages = num_pages
        self.headers = {
            "Referer": 'https://www.baidu.com/link?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()  # raise if the request failed
        response.encoding = 'utf-8'
        return response.text

    def parse_img_urls(self, html):
        etree_html = etree.HTML(html)
        img_urls = etree_html.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        return [urljoin(self.url, img_url) for img_url in img_urls]

    def download_img(self, img_url):
        file_name = os.path.basename(img_url)  # use the last part of the URL as the file name
        directory = 'tutu'
        if not os.path.exists(directory):
            os.makedirs(directory)
        response = requests.get(img_url, headers=self.headers)
        response.raise_for_status()  # make sure the request succeeded
        with open(os.path.join(directory, file_name), 'wb') as f:
            f.write(response.content)
        print(f'downloaded: {file_name}')

    def download_all_imgs(self):
        for page in range(2, self.num_pages + 1):
            page_url = f"{self.url}/index_{page}.html"  # assumed pagination URL pattern; page 1 lives at index.html and is skipped here
            print(f"downloading images from page {page} ...")
            html = self.get_html(page_url)
            img_urls = self.parse_img_urls(html)
            with ThreadPoolExecutor(max_workers=10) as executor:
                for img_url in img_urls:
                    executor.submit(self.download_img, img_url)


if __name__ == '__main__':
    downloader = ImgDownloader(baseurl='https://www.woyaogexing.com/tupian', num_pages=10)
    downloader.download_all_imgs()
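
Note that download_all_imgs starts at page 2, because on this kind of site the first listing page usually lives at index.html rather than index_1.html (that is the assumed pagination pattern noted in the comment). A small hypothetical helper that covers both cases:

def page_url(base_url: str, page: int) -> str:
    # hypothetical helper: page 1 -> index.html, page n -> index_{n}.html
    return f"{base_url}/index.html" if page == 1 else f"{base_url}/index_{page}.html"

download_all_imgs could then iterate range(1, self.num_pages + 1) and build each URL with page_url(self.url, page).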

Multiprocessing

import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from multiprocessing import Queue  # inter-process queue
from multiprocessing import Process  # process
import os
import time


def get_img_url(q):
    for item in range(1, 3):

        url = f"https://www.pkdoutu.com/article/list/?page={item}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        resp_text = resp.text
        # parse the list page
        x_path = etree.HTML(resp_text)  # type: etree._Element
        img_urls = x_path.xpath("//div[@class='col-sm-9 center-wrap']/a/@href")
        for img_url in img_urls:
            # fetch the detail page behind each link
            resps = requests.get(img_url, headers=headers)
            resps_text = resps.text
            x_paths = etree.HTML(resps_text)
            img_urlss = x_paths.xpath("//li[@class='list-group-item']//a/img/@src")
            for imgs in img_urlss:
                print(imgs)
                q.put(imgs)  # hand every image url to the queue
    q.put("__DONE__")  # sentinel, sent once after all pages: tells the consumer there is nothing left


# second process: only downloads images
def img_process(q):  # pull urls from the queue and download them
    with ThreadPoolExecutor(10) as t:
        while 1:  # we don't know how many urls are coming, so keep pulling
            imgs = q.get()  # get() blocks until something is available
            if imgs == "__DONE__":  # the producer's sentinel
                break
            # run threads inside this process
            t.submit(download_img, imgs)


def download_img(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    os.makedirs("tu", exist_ok=True)  # make sure the output folder exists
    with open(f"tu/{file_name}", "wb") as w:
        w.write(resp.content)


if __name__ == '__main__':
    # prepare the queue
    s1 = time.time()
    q = Queue()  # shared by both child processes
    p1 = Process(target=get_img_url, args=(q,))  # producer, in its own memory space
    p2 = Process(target=img_process, args=(q,))  # consumer, in its own memory space

    p1.start()
    p2.start()

    p1.join()  # the main process waits for both children to finish
    p2.join()

    s2 = time.time()
    print(s2 - s1)
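
One detail of this queue-plus-sentinel pattern worth spelling out: a single sentinel stops a single consumer, so if several consumer processes share the queue, the usual fix is to enqueue one sentinel per consumer. A generic sketch (producer and consumer here are simplified stand-ins, not the functions above):

from multiprocessing import Process, Queue

SENTINEL = None  # any unique value the workers check for

def producer(q):
    for item in range(10):
        q.put(item)

def consumer(q):
    while True:
        item = q.get()          # blocks until something arrives
        if item is SENTINEL:
            break
        print("handled", item)

if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=consumer, args=(q,)) for _ in range(3)]
    for w in workers:
        w.start()
    producer(q)
    for _ in workers:           # one sentinel per consumer
        q.put(SENTINEL)
    for w in workers:
        w.join()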

Coroutines

Novel download example

import requests
from lxml import etree
import asyncio
import aiohttp
import aiofiles
import os

# 1. fetch the home page source (no async needed for this)
# 2. from the page source, parse out <volume name> and <chapter, href>
# 3. download the chapter txt files with coroutines

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
}


def get_chaptor_info(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = "UTF-8"
    page_source = resp.text
    # parse the page
    tree = etree.HTML(page_source)
    result = []
    divs = tree.xpath("//div[@class='mulu']")  # each div is one volume
    for div in divs:
        trs = div.xpath(".//table/tr")  # a pile of tr rows
        juan_name = trs[0].xpath(".//a/text()")
        juan_name = "".join(juan_name).strip().replace(":", "_")

        for tr in trs[1:]:  # every row except the first

            tds = tr.xpath("./td")
            for td in tds:
                txt = td.xpath(".//text()")
                href = td.xpath(".//@href")

                txt = "".join(txt).replace(" ", "").strip()
                href = "".join(href)
                dic = {
                    "chapter_name": txt,
                    "chapter_url": href,
                    "juan_name": juan_name
                }
                result.append(dic)
    return result


async def download_one(url, file_path):
    print("我要下載文章了")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            page_source = await resp.text(encoding="utf-8")
            # 拿到文章
            tree = etree.HTML(page_source)
            content = tree.xpath("//div[@class='content']//p//text()")
            content = "".join(content).replace("\n", "").replace("\r", "").replace(" ", "").strip()

            # 寫入文件
            async with aiofiles.open(file_path, mode="w", encoding="utf-8") as f:
                await f.write(content)

    print("恭喜你。 下載了一篇文章!", file_path)


async def download_chapter(chaptor_list):
    tasks = []
    for chaptor in chaptor_list:  # {juan_name: xxx, chapter_name: xxx, chapter_url: xxx}
        juan = chaptor['juan_name']  # folder name
        name = chaptor['chapter_name']  # file name, e.g. 前言.txt
        url = chaptor['chapter_url']  # download url -> one async task per chapter

        if not os.path.exists(juan):  # check whether the folder exists
            os.makedirs(juan)  # create it if not

        # the real save path for this chapter
        file_path = f"{juan}/{name}.txt"
        f = download_one(url, file_path)
        t = asyncio.create_task(f)
        tasks.append(t)
        break  # only the first chapter while testing; remove this line to download them all
    await asyncio.wait(tasks)


def main():
    url = "https://www.mingchaonaxieshier.com/"
    chaptor_list = get_chaptor_info(url)
    # print(chaptor_list)
    # fire up the coroutines and download asynchronously
    asyncio.run(download_chapter(chaptor_list))


if __name__ == '__main__':
    main()
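
A common refinement of this pattern is to share one aiohttp.ClientSession across all chapter downloads and wait with asyncio.gather instead of asyncio.wait. A minimal sketch under that assumption, reusing the imports and headers above (fetch_and_save is a hypothetical stand-in for the body of download_one, and the volume folders are assumed to exist already):

async def fetch_and_save(session, url, file_path):
    # hypothetical helper: same work as download_one, but with a shared session
    async with session.get(url) as resp:
        page_source = await resp.text(encoding="utf-8")
    tree = etree.HTML(page_source)
    content = "".join(tree.xpath("//div[@class='content']//p//text()")).strip()
    async with aiofiles.open(file_path, mode="w", encoding="utf-8") as f:
        await f.write(content)


async def download_chapter_shared(chaptor_list):
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [
            asyncio.create_task(
                fetch_and_save(session, c['chapter_url'],
                               f"{c['juan_name']}/{c['chapter_name']}.txt"))
            for c in chaptor_list
        ]
        await asyncio.gather(*tasks)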

Image download example

"""
# 整体步骤 => 秀人集
1.拿到首页html页面的url和目录名称
2.拿到分页列表
3.在拿到分页下的图片url和文件名称
4.追加到一个空字典里 返回函数的值
5.异步中循环取到字典的key 判断取到字典目录的名字是否存在 不存在创建
6.创建单个异步下载任务
7.提交任务执行
"""

import aiohttp
import requests
from lxml import etree
from urllib.parse import urljoin
import os
import aiofiles
import asyncio
import random
import time

ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"
]

header = {
    "User-Agent": random.choice(ua_list)
}

url = "https://www.xiurenb.com/"


def img_url(url):
    # fetch the home page
    result = []
    resp = requests.get(url, headers=header)
    resp.encoding = 'utf-8'

    # parse the home page
    page_source = etree.HTML(resp.text)  # type: etree._Element
    home_url = page_source.xpath("//ul/li[@class='i_list list_n2']/a")
    for u in home_url:
        href = u.xpath("./@href")[0]
        titles = u.xpath("./@title")[0]
        # album url
        new_url = urljoin(url, href)
        # directory name
        folder_names = titles.replace("\r", "").replace("\n", "").replace(" ", "").split("]")[-1]
        dic = {
            "dir_url": new_url,
            "dir_name": folder_names,
        }
        result.append(dic)
    return result


async def download_one(urls, name):
    print("start downloading images")
    for i in range(5):  # retry up to 5 times
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(urls, headers=header) as resp:
                    page_source = await resp.text(encoding="utf-8")
                    num_list = etree.HTML(page_source)
                    num_list_url = num_list.xpath("//div[@class='main_inner']//div[@class='page'][1]/a")[1:-1]
                    for nums in num_list_url:
                        href = nums.xpath("./@href")
                        if not href:
                            continue
                        # pagination url
                        new_href = urljoin(url, href[0])
                        async with aiohttp.ClientSession() as sessions:
                            async with sessions.get(new_href, headers=header) as resps:
                                jpg_source = await resps.text(encoding='utf-8')
                                subpage_page = etree.HTML(jpg_source)  # type: etree._Element
                                subpage_url = subpage_page.xpath("//div[@class='main_left']//p/img")
                                for img_list in subpage_url:
                                    src_url = img_list.xpath("./@src")[0]
                                    # image url
                                    new_src = urljoin(url, src_url)
                                    # file name
                                    file_name = new_src.split("/")[-1]
                                    async with aiohttp.ClientSession() as session_jpg:
                                        async with session_jpg.get(new_src, headers=header) as resp_list:
                                            jpg_bytes = await resp_list.content.read()
                                            async with aiofiles.open(f"{name}/{file_name}", mode="wb") as f:
                                                await f.write(jpg_bytes)
            print("album finished downloading:", name)
            break
        except Exception as e:
            print("request failed, retrying", e)


async def task(char_set):
    tasks = []
    for i in char_set:
        name = i['dir_name']
        urls = i['dir_url']

        # create the directory if it does not exist
        if not os.path.exists(name):
            os.makedirs(name)

        f = download_one(urls, name)
        # wrap the coroutine in a task
        t = asyncio.create_task(f)
        # collect it
        tasks.append(t)
    # run all tasks together
    await asyncio.wait(tasks)


def main():
    char_set = img_url(url)

    # run the coroutine tasks
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(task(char_set))


if __name__ == '__main__':
    start = time.time()
    main()
    stop = time.time()
    print(stop - start)
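
The video example further below throttles its coroutines with an asyncio.Semaphore; the same idea applies here if the site starts rejecting too many parallel album downloads. A minimal sketch reusing download_one from this example:

async def task_limited(char_set, limit=5):
    sem = asyncio.Semaphore(limit)  # at most `limit` albums downloading at once

    async def guarded(urls, name):
        async with sem:
            await download_one(urls, name)

    tasks = []
    for i in char_set:
        name, urls = i['dir_name'], i['dir_url']
        os.makedirs(name, exist_ok=True)
        tasks.append(asyncio.create_task(guarded(urls, name)))
    await asyncio.wait(tasks)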

Video download example

"""
# 整体步骤 => 网吧电影
1. 想办法找到M3U8文件
2. 判别(人工)是否需要下载第二层M3U8
3. 提取ts文件的下载路径
4. 下载
5. 判别是否需要解密
6. 如果需要解密, 拿到秘钥
7. 解密
8. 根据M3U8的正确顺序来合并所有的ts文件 => MP4
"""
import requests
from lxml import etree
import re
from urllib.parse import urljoin
import os  # used to run cmd/console commands

import asyncio
import aiohttp
import aiofiles

from Crypto.Cipher import AES  # pip install pycryptodome

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
}


def get_iframe_src(url):  # get the src of the iframe
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    src = tree.xpath("//iframe/@src")[0]  # raises IndexError if the page has no iframe
    return src


def get_m3u8_url(url):
    resp = requests.get(url, headers=headers)
    obj = re.compile(r'url: "(?P<m3u8>.*?)"', re.S)
    m3u8 = obj.search(resp.text).group("m3u8")
    return m3u8


def download_m3u8(url):  # e.g. https://a.ak-kk.com/20211030/89ZfL7VX/index.m3u8
    resp = requests.get(url, headers=headers)
    with open("first.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.text)

    with open("first.m3u8", mode='r', encoding="utf-8") as f2:
        for line in f2:  # read line by line
            if line.startswith("#"):  # skip the metadata lines
                continue
            # this line is the address of the second-level M3U8
            line = line.strip()  # strip() it, otherwise the trailing newline bites you

            line = urljoin(url, line)  # make it absolute
            # download the second-level M3U8
            resp = requests.get(line, headers=headers)
            with open("second.m3u8", mode="w", encoding="utf-8") as f3:
                f3.write(resp.text)
                break  # optional: stop after the first entry
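
Step 2 of the plan ("decide manually whether there is a second-level M3U8") can also be automated: a master playlist lists variant streams with #EXT-X-STREAM-INF tags, while a media playlist lists segments with #EXTINF tags. A small check along those lines:

def is_master_playlist(m3u8_text: str) -> bool:
    # master playlists point at other playlists via #EXT-X-STREAM-INF;
    # media playlists describe ts segments via #EXTINF
    return "#EXT-X-STREAM-INF" in m3u8_text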


async def download_one(url, sem):
    async with sem:  # the semaphore caps how many downloads run at once
        file_name = url.split("/")[-1]
        file_path = "./解密前/" + file_name
        print(file_name, "started!")
        for i in range(10):  # retry up to 10 times
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=headers) as resp:
                        content = await resp.content.read()
                        # write to file
                        async with aiofiles.open(file_path, mode="wb") as f:
                            await f.write(content)
                print(file_name, "downloaded!")
                break
            except Exception as e:
                print(file_name, "failed, retrying", e)  # show the error so it can be inspected


async def download_all_videos():
    # the semaphore caps how many coroutines run concurrently
    sem = asyncio.Semaphore(100)  # a handful of movies on 网吧电影 need this capped at about 5
    os.makedirs("./解密前", exist_ok=True)  # folder for the encrypted ts segments
    # 1. read the m3u8 file
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()  # forgetting this leaves a trailing newline in the url
            # at this point line is a download url
            # 2. create one task per segment
            t = asyncio.create_task(download_one(line, sem))
            tasks.append(t)
    # 3. wait for all of them
    await asyncio.wait(tasks)


def get_key():
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        file_content = f.read()  # read the whole file
        obj = re.compile(r'URI="(?P<key_url>.*?)"')
        key_url = obj.search(file_content).group("key_url")
        resp = requests.get(key_url, headers=headers)  # request the key
        return resp.content  # return raw bytes so it can go straight into the AES decryptor


async def desc_one(file_path, key):
    file_name = file_path.split("/")[-1]
    new_file_path = "./解密后/" + file_name
    # decrypt
    async with aiofiles.open(file_path, mode="rb") as f1,\
            aiofiles.open(new_file_path, mode="wb") as f2:
        content = await f1.read()
        # fixed recipe: build an AES decryptor
        aes = AES.new(key=key, mode=AES.MODE_CBC, IV=b"0000000000000000")
        new_content = aes.decrypt(content)
        await f2.write(new_content)  # write the decrypted segment
    print(new_file_path, "decrypted")
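
The IV above is hard-coded to sixteen ASCII '0' bytes. Many streams publish the real IV in the #EXT-X-KEY line instead, so a more general version would parse it from the playlist; a hedged sketch (the exact attribute format is an assumption about this particular stream):

import re
from Crypto.Cipher import AES

def build_decryptor(m3u8_text: str, key: bytes):
    # assumption: the playlist carries a line like
    # #EXT-X-KEY:METHOD=AES-128,URI="...",IV=0x0123456789abcdef0123456789abcdef
    m = re.search(r'IV=0x(?P<iv>[0-9a-fA-F]{32})', m3u8_text)
    iv = bytes.fromhex(m.group("iv")) if m else b"0" * 16  # fall back to the hard-coded IV
    return AES.new(key=key, mode=AES.MODE_CBC, iv=iv)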

# decryption with coroutines:
# read the M3U8 file to get each segment's file name and path
# one task per ts file
# inside each task, just decrypt
async def desc_all(key):
    os.makedirs("./解密后", exist_ok=True)  # make sure the output folder exists
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            file_name = line.split("/")[-1]
            file_path = "./解密前/" + file_name
            # create a task to decrypt this segment
            t = asyncio.create_task(desc_one(file_path, key))
            tasks.append(t)
    await asyncio.wait(tasks)


def merge():
    # merge the video segments
    # (this approach does not apply to Bilibili videos)
    # it relies on a shell command:
    # windows: copy /b a.ts+b.ts+c.ts xxx.mp4
    # linux/mac: cat a.ts b.ts c.ts > xxx.mp4
    # common pitfalls:
    # 1. the command line has a length limit, so merge in batches
    # 2. os.system() tends to garble the console output; os.popen() avoids the mojibake
    # 3. in the end, all that matters is whether the merge succeeded
    # os.system("dir")  # may print mojibake
    # r = os.popen("dir")
    # print(r.read())  # avoids the mojibake

    # collect all file names in the correct merge order
    file_list = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            file_name = line.split("/")[-1]
            file_list.append(file_name)

    # change into the output folder
    os.chdir("./解密后")  # switch the working directory
    # file_list holds all segment names

    # merge in batches
    n = 1
    temp = []  # [a.ts, b.ts, c.ts]  =>  a.ts+b.ts+c.ts
    for i in range(len(file_list)):
        # merge every 20 files
        file_name = file_list[i]
        temp.append(file_name)
        if i != 0 and i % 20 == 0:  # every 20th index (so the first batch holds 21 files)
            cmd = f"copy /b {'+'.join(temp)} {n}.ts"
            r = os.popen(cmd)
            print(r.read())
            temp = []  # start a fresh batch
            n = n + 1
    # merge whatever segments are left over
    if temp:
        cmd = f"copy /b {'+'.join(temp)} {n}.ts"
        r = os.popen(cmd)
        print(r.read())
        n = n + 1

    # second pass: 1.ts + 2.ts + 3.ts ... => mp4
    last_temp = []
    for i in range(1, n):
        last_temp.append(f"{i}.ts")
    # final merge
    cmd = f"copy /b {'+'.join(last_temp)} 春夏秋冬又一春.mp4"
    r = os.popen(cmd)
    print(r.read())
    # go back up
    os.chdir("../")  # ../ is the parent folder


def main():
    url = "http://www.wbdy.tv/play/63690_1_1.html"
    # 1. get the src attribute of the iframe
    src = get_iframe_src(url)
    print(src)
    # 2. request the iframe's src page and extract the M3U8 address
    src = urljoin(url, src)
    m3u8_url = get_m3u8_url(src)
    print(m3u8_url)
    # 3. download the m3u8 files
    download_m3u8(m3u8_url)
    # 4. download the video segments with coroutines
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(download_all_videos())
    # 5. fetch the key
    key = get_key()
    # 6. decrypt
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(desc_all(key))
    print("all done")

    # merge
    merge()


if __name__ == '__main__':
    main()