线程、进程、协程

1. 多线程（单线程、多线程）

# 线程、 进程
# 线程是执行单位  进程 -> 公司  线程 -> 员工
# 进程是资源单位 （每一个进程里面至少有一个线程）

# 单线程
def func():
    """Print the label ``func`` with each counter value from 0 through 999."""
    counter = 0
    while counter < 1000:
        print('func', counter)
        counter += 1

# A newly started process always has one main thread by default.
if __name__ == '__main__':
    # Single-threaded: func() runs to completion before the loop below begins.
    func()

    main_count = 0
    while main_count < 1000:
        print('main', main_count)
        main_count += 1

# 多线程  写法一
from threading import Thread

def func():
    """Thread target: print ``func`` plus a running index, 1000 times."""
    for step in range(0, 1000, 1):
        print('func', step)

# A newly started process always has one main thread by default.
if __name__ == '__main__':
    # Hand the task to a second thread; start() only marks it ready to run,
    # the actual moment it executes is decided by the scheduler.
    worker = Thread(target=func)
    worker.start()

    # Meanwhile the main thread keeps printing its own sequence.
    for step in range(1000):
        print('main', step)

from threading import Thread

# 多线程  写法二
class MyThread(Thread):
    """Thread subclass whose work is defined by overriding ``run``."""

    def run(self):
        """Thread body: this is what executes once the thread is started."""
        tick = 0
        while tick < 1000:
            print('子线程', tick)
            tick += 1

if __name__ == '__main__':
    child = MyThread()
    # Calling child.run() directly would be a plain method call on the main
    # thread (i.e. single-threaded); start() is what launches a new thread.
    child.start()

    for step in range(1000):
        print('主线程', step)

# 多线程 传递参数
from threading import Thread

def func(name):
    """Print ``name`` followed by each index from 0 through 999.

    Args:
        name: Label printed in front of every index.
    """
    index = 0
    while index < 1000:
        print(name, index)
        index += 1

if __name__ == '__main__':
    # ``args`` must be a tuple, hence the trailing comma for a single argument.
    for singer in ('周杰伦', '王力宏'):
        Thread(target=func, args=(singer,)).start()

2. 进程

# 多进程
from multiprocessing import Process


def func():
    """Child-process target: print ``子进程`` with each index from 0 to 999."""
    for seq in range(1000):
        print('子进程', seq)


if __name__ == '__main__':
    # Same shape as the Thread example, but the task runs in a child process.
    child_proc = Process(target=func)
    child_proc.start()

    for seq in range(1000):
        print('主进程', seq)

3. 线程池和进程池

# 线程池： 一次性开辟一些线程，用户直接给线程池提交任务，线程任务的调度交给线程池来完成
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def fn(name):
    """Pool task: print ``name`` followed by each index from 0 through 99.

    Args:
        name: Label printed in front of every index.
    """
    index = 0
    while index < 100:
        print(name, index)
        index += 1

if __name__ == '__main__':
    # For a process pool, swap ThreadPoolExecutor(50) for
    # ProcessPoolExecutor(50); nothing else needs to change.
    pool = ThreadPoolExecutor(max_workers=50)
    with pool:
        for idx in range(100):
            pool.submit(fn, name=f'线程{idx}')

    # Leaving the ``with`` block waits for every submitted task to finish,
    # so this line only runs after the pool is done.
    print('线程池执行完毕！！！')

4. 线程池和进程池实战

# 1. 如何提取单个页面的数据
# 2. 使用线程池，多个页面同时抓取
import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor

# Shared CSV output used by down_one_page(). The csv module expects its file
# to be opened with newline='' so that it controls line endings itself;
# without it, every row is followed by a blank line on Windows.
f = open('data.csv', mode='w', encoding='utf-8', newline='')
csvwrite = csv.writer(f)

def down_one_page(url, timeout=10):
    """Download one listing page and append its table rows to the shared CSV.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would occupy a pool worker forever.

    Each row is written through the module-level ``csvwrite`` writer. If the
    expected table is not found, a message is printed and the page is skipped
    instead of raising an IndexError that a thread pool would silently swallow.
    """
    # Fetch the page source.
    resp = requests.get(url, timeout=timeout)
    resp.encoding = 'utf-8'

    html = etree.HTML(resp.text)
    # xpath() returns a list; only its first match is the table we want.
    tables = (
        html.xpath('/html/body/div[1]/div[2]/div[1]/div/div[2]/table[1]')
        if html is not None
        else []
    )
    if not tables:
        print(url, '未找到数据表格，已跳过')
        return

    # Drop the first <tr> (presumably the header row — confirm against the
    # page). Equivalent XPath: './tbody/tr[position()>1]'.
    for tr in tables[0].xpath('./tbody/tr')[1:]:
        # One CSV row per table row: every text node under the row's td/p cells.
        csvwrite.writerow(tr.xpath('./td/p//text()'))
    print(url, '提取完毕')

if __name__ == '__main__':
    # A single-threaded loop over down_one_page() works but is very slow,
    # and gets far worse if the page count grows from 200 to e.g. 14369.
    try:
        # Fan the pages out over a pool of 50 worker threads.
        with ThreadPoolExecutor(50) as t:
            for i in range(1, 200):
                # Submit one download task per page number. The page number
                # must be interpolated, otherwise every task fetches the same
                # page.
                t.submit(down_one_page, f'http://zhongdapeng.com/shucaijiage/{i}.html')
    finally:
        # All workers are finished here; close the shared CSV file so every
        # buffered row is flushed to disk.
        f.close()

    print('全部下载完成！！！')

5. 协程的概念

import time

def func():
    """Print a line, block the thread for 3 seconds, then print a second line."""
    before, after = '我爱黎明', '我真的爱黎明'
    print(before)
    # While sleeping the program is blocked and does no useful work.
    time.sleep(3)
    print(after)

if __name__ == "__main__":
    # Run the blocking demo directly on the main thread.
    func()

# input()  #  程序处于阻塞状态
# requests.get(bilibili) 在网络请求返回数据之前，程序也是处于阻塞状态的

# 一般情况下， 当程序处于 IO操作时，线程都会处于阻塞状态
# IO操作： 输入或输出的时候

# 协程： 当程序遇到 IO 操作的时候，可以选择性的切换到其它任务上

# 在微观上是一个任务一个任务的进行切换，切换条件一般就是IO操作
# 在宏观上，我们能看到的就是多个任务一起执行

# 上方所讲的一切，都是在单线程的情况下

6. 协程程序

# Python 编写协程程序
import asyncio
import time

# 多任务异步
async def func1():
    """Print a greeting, yield to the event loop for 3 seconds, then repeat it."""
    greeting = '你好啊，你叫小张！'
    print(greeting)
    # A synchronous time.sleep(3) here would interrupt the asynchronous flow;
    # the awaitable sleep lets other coroutines run in the meantime.
    await asyncio.sleep(3)
    print(greeting)

async def func2():
    """Print a greeting, yield to the event loop for 2 seconds, then repeat it."""
    greeting = '你好啊，你叫小明！'
    print(greeting)
    # Awaitable sleep instead of the blocking time.sleep(2).
    await asyncio.sleep(2)
    print(greeting)

async def func3():
    """Print a greeting, yield to the event loop for 4 seconds, then repeat it."""
    greeting = '你好啊，你叫小红！'
    print(greeting)
    # Awaitable sleep instead of the blocking time.sleep(4).
    await asyncio.sleep(4)
    print(greeting)

async def main():
    """Schedule func1, func2 and func3 as tasks and wait for all of them."""
    # Alternative (style 1): build one coroutine object and await it directly,
    # e.g. ``await func1()``. ``await`` goes in front of the coroutine object
    # and is only legal inside an ``async`` function.

    # Style 2 (recommended): wrap each coroutine in a Task via
    # asyncio.create_task() so all of them are scheduled together.
    scheduled = []
    for coro_fn in (func1, func2, func3):
        scheduled.append(asyncio.create_task(coro_fn()))

    await asyncio.wait(scheduled)

if __name__ == '__main__':
    started = time.time()

    # Launch all the coroutines in one go.
    asyncio.run(main())

    # Wall-clock time the whole batch took.
    elapsed = time.time() - started
    print(elapsed, '时间差')

7. 协程爬虫领域的应用模板

import asyncio
import time

# 在爬虫领域的应用
async def download(url):
    """Simulate downloading ``url``: announce, wait 2 seconds, announce again.

    Args:
        url: Address to download; not used by this placeholder body.
    """
    start_msg, done_msg = "准备开始下载", "下载完成"
    print(start_msg)
    # Stand-in for the real network request.
    await asyncio.sleep(2)
    print(done_msg)


async def main():
    """Create one download task per URL and wait until all have finished."""
    targets = (
        "http://www.baidu.com",
        "http://www.bilibili.com",
        "http://www.163.com",
    )

    # One Task per URL; an explicit loop with append() works just as well.
    pending = [asyncio.create_task(download(target)) for target in targets]

    # Run every task and wait for all of them.
    await asyncio.wait(pending)

if __name__ == '__main__':
    started = time.time()
    asyncio.run(main())

    # Wall-clock time the whole batch took.
    elapsed = time.time() - started
    print(elapsed, '时间差')

8. aiohttp模块的使用

# requests.get() 同步的操作 -> 异步操作（aiohttp模块）
# pip install aiohttp

import asyncio
import aiohttp
import requests

# Image addresses fetched concurrently by the aiohttp example below.
urls = [
    "https://www.toopic.cn/public/uploads/small/1658043292312165804329268.png",
    "https://www.toopic.cn/public/uploads/image/20200411/20200411153721_42156.jpg",
    "https://www.toopic.cn/public/uploads/small/1638860271747163886027116.jpg",
]

async def aiodownload(url):
    """Download ``url`` with aiohttp and save it under its own file name.

    Args:
        url: Address of the file; the text after the last ``/`` becomes the
            local file name, written to the current working directory.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status, so an error page is never saved as if it were the image.
    """
    # File name = last path segment of the URL.
    name = url.rsplit('/', 1)[1]

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            # Reading the body is asynchronous, so it must be awaited.
            payload = await resp.content.read()

    # Only create the file once the whole body has arrived, so a failed
    # request or read cannot leave an empty file behind.
    with open(name, mode='wb') as f:
        f.write(payload)

    print(name, '搞定')

async def main():
    """Start one aiodownload task per entry in ``urls`` and wait for them all."""
    # Wrap every download coroutine in a Task so they run concurrently.
    pending = [asyncio.create_task(aiodownload(link)) for link in urls]

    # Wait until every task has finished.
    await asyncio.wait(pending)

if __name__ == '__main__':
    # Per the original note, driving the loop by hand works around an error;
    # presumably one seen when asyncio.run() is used here — TODO confirm.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(main())

9. 爬取一部小说百度小说西游记

# 所有章节的内容 (名称、cid)
# https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}

# 章节的具体内容
# https://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":"4306063500","cid":"4306063500|1569782244","need_bookinfo":1}

# 异步文件操作
# pip install aiofiles

import requests
import asyncio
import aiohttp
import aiofiles
import json

"""
1. 同步操作： 访问getCatalog 获取所有(章节、cid)
2. 异步操作： 访问getChapterContent 下载所有的内容
"""

async def aiodownload(cid, b_id, title):
    """Download one chapter and write its text to ``西游记小说/<title>.docx``.

    Args:
        cid: Chapter id taken from the catalogue entry.
        b_id: Book id; combined with ``cid`` as ``"<b_id>|<cid>"`` in the request.
        title: Chapter title, used as the output file name.

    The output directory is created on demand, so a first run no longer fails
    with FileNotFoundError when the folder does not exist yet.
    """
    import os  # local import: only needed to create the output directory

    # The API takes its arguments as a JSON string in the ``data`` parameter.
    data = json.dumps({
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    })
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    os.makedirs('西游记小说', exist_ok=True)

    # Asynchronous request for the chapter content.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()

    # The chapter text sits at data -> novel -> content in the response.
    content = dic['data']['novel']['content']

    # Asynchronous file write. NOTE(review): plain UTF-8 text is stored under
    # a .docx extension; kept as-is so existing output names do not change.
    async with aiofiles.open('西游记小说/' + title + '.docx', mode='w', encoding='utf-8') as f:
        await f.write(content)
    print(f'{title}  已下载完成')


async def getCatalog(url, book_id=None, timeout=10):
    """Fetch the chapter catalogue, then download every chapter concurrently.

    Args:
        url: Catalogue API address.
        book_id: Book id passed on to each chapter download. Defaults to the
            module-level ``b_id`` (set in the ``__main__`` block), which is
            what the original code always used.
        timeout: Seconds to wait for the catalogue request before giving up.

    The catalogue request is deliberately synchronous: it is a single call
    that must finish before any chapter task can be created.
    """
    if book_id is None:
        book_id = b_id

    resp = requests.get(url, timeout=timeout)
    dic = resp.json()

    # Each catalogue item carries a chapter title and cid; every chapter
    # becomes one asynchronous download task.
    tasks = [
        asyncio.create_task(aiodownload(item['cid'], book_id, item['title']))
        for item in dic['data']['novel']['items']
    ]

    # asyncio.wait() raises ValueError on an empty collection, so an empty
    # catalogue simply means there is nothing to do.
    if not tasks:
        return

    await asyncio.wait(tasks)


if __name__ == '__main__':
    # ``b_id`` stays a module-level name: getCatalog() reads it.
    b_id = "4306063500"
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"'+ b_id + '"}'

    # Per the original note, driving the loop by hand works around an error;
    # presumably one seen when asyncio.run() is used here — TODO confirm.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(getCatalog(url))

    print('over_all', '全部下载已完成')

posted @ 2022-11-12 14:48 伴你如风阅读(19) 评论(0) 编辑收藏举报

刷新页面返回顶部

伴你如风

护你如影、伴你如风

线程、进程、协程

1. 多线程（单线程、多线程）

2. 进程

3. 线程池和进程池

4. 线程池和进程池实战

5. 协程的概念

6. 协程程序

7. 协程爬虫领域的应用模板

8. aiohttp模块的使用

9. 爬取一部小说百度小说西游记

公告

伴你如风

护你如影、伴你如风

线程、 进程、 协程

1. 多线程 （单线程、多线程）

2. 进程

3. 线程池 和 进程池

4. 线程池 和 进程池实战

5. 协程的概念

6. 协程程序

7. 协程爬虫领域的应用模板

8. aiohttp模块的使用

9. 爬取一部小说 百度小说 西游记

公告

线程、进程、协程

1. 多线程（单线程、多线程）

3. 线程池和进程池

4. 线程池和进程池实战

9. 爬取一部小说百度小说西游记