- Introduction: neither multithreading nor multiprocessing actually solves one performance problem: IO blocking. Whenever a thread or process hits blocking IO, the operating system forcibly revokes its CPU time (so the CPU can run something else, which may be another part of our program or a different application entirely), and our program's execution efficiency drops as a result.
- The asyncio module (in the standard library since Python 3.4; the async/await syntax arrived in 3.5) can detect IO for us (network IO only; an HTTP connection is a network IO operation) and switch between tasks at the application level (asynchronous IO).
1. Basic coroutine syntax
```python
import asyncio

async def func():
    print("I am a task")

if __name__ == '__main__':
    f = func()                             # calling a coroutine function only builds a coroutine object; nothing runs yet
    event_loop = asyncio.get_event_loop()  # get an event loop (this usage is deprecated since Python 3.10)
    event_loop.run_until_complete(f)       # drive the coroutine to completion
```
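Since Python 3.7, `asyncio.run()` bundles the create-loop / run / close-loop steps into a single call, which is why the later examples use it; the snippet above is equivalent to:

```python
import asyncio

async def func():
    print("I am a task")

if __name__ == '__main__':
    asyncio.run(func())   # creates an event loop, runs the coroutine, then closes the loop
```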
```python
import asyncio

async def faker1():
    print("task 1 started")
    await asyncio.sleep(1)
    print("task 1 done")
    return "task 1 finished"

async def faker2():
    print("task 2 started")
    await asyncio.sleep(2)
    print("task 2 done")
    return "task 2 finished"

async def faker3():
    print("task 3 started")
    await asyncio.sleep(3)
    print("task 3 done")
    return "task 3 finished"

async def main():
    tasks = [
        asyncio.create_task(faker3()),
        asyncio.create_task(faker1()),
        asyncio.create_task(faker2()),
    ]
    # wait() returns two sets: finished tasks and still-pending tasks (empty here)
    done, pending = await asyncio.wait(tasks)
    for t in done:
        print(t.result())       # sets are unordered, so the print order is arbitrary
    # the tasks have already finished, so gather() returns immediately;
    # its results keep the order in which the tasks were passed in
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for r in results:
        print(r)

if __name__ == '__main__':
    asyncio.run(main())
```
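The `(done, pending)` pair from `asyncio.wait` only becomes interesting once a `timeout` is involved; a minimal sketch (the 10-second delay is purely illustrative):

```python
import asyncio

async def slow():
    await asyncio.sleep(10)

async def main():
    task = asyncio.create_task(slow())
    # give up waiting after 1 second; unfinished tasks land in the pending set
    done, pending = await asyncio.wait({task}, timeout=1)
    print(len(done), len(pending))   # prints: 0 1
    for t in pending:
        t.cancel()                   # wait() does not cancel leftovers for you

asyncio.run(main())
```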
```python
import asyncio
import time

async def get_page_info(url):
    print("sending request to", url)
    await asyncio.sleep(3)   # simulate network IO: requests is blocking and cannot be awaited
    print("got the page source")
    return "I am the page source"

async def main():
    urls = [
        "http://www.baidu.com",
        "http://www.taobao.com",
        "http://www.google.com",
    ]
    tasks = []
    for url in urls:
        f = get_page_info(url)        # coroutine object
        t = asyncio.create_task(f)    # schedule it on the running event loop
        tasks.append(t)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())               # the three 3-second "requests" overlap, so this takes ~3s, not ~9s
    end = time.time()
    print(f"total time: {end - start}")
```
- 4.1: Requesting pages with aiohttp (skeleton; as written below it fails at runtime, the fixes follow in 4.2)

```python
import aiohttp   # third-party async HTTP client: pip install aiohttp

async def get_request(url):
    # create a session object
    with aiohttp.ClientSession() as sess:
        # issue the request from the session object
        # get() sends a GET request; common parameters: url, headers, params, proxy
        # post() sends a POST request; common parameters: url, headers, data, proxy
        # note: the proxy parameter differs from requests; here use proxy='http://ip:port'
        with sess.get(url=url) as response:
            page_text = response.text()
            # text(): response body as a string
            # read(): response body as bytes
            return page_text
```
- 4.2: Additional details
- add the `async` keyword before every `with`
- add the `await` keyword before every blocking operation
```python
import aiohttp

async def get_request(url):
    # requests does not support async, so this will NOT work:
    # response = await requests.get(url=url)
    # page_text = response.text
    # create the session object (sess)
    async with aiohttp.ClientSession() as sess:
        # issue the request from the session object
        # get() sends a GET request; common parameters: url, headers, params, proxy
        # post() sends a POST request; common parameters: url, headers, data, proxy
        # note: the proxy parameter differs from requests; here use proxy='http://ip:port'
        async with sess.get(url=url) as response:
            page_text = await response.text()
            # text(): response body as a string
            # read(): response body as bytes
            return page_text
```
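A minimal driver for `get_request`, assuming the example URLs above are reachable; `gather` returns the page texts in the order the coroutines were passed:

```python
import asyncio

async def main():
    urls = [
        "http://www.baidu.com",
        "http://www.taobao.com",
    ]
    # one coroutine per URL; gather schedules them concurrently
    results = await asyncio.gather(*(get_request(url) for url in urls))
    for page_text in results:
        print(len(page_text))

if __name__ == '__main__':
    asyncio.run(main())
```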
"""
需求:爬取明朝那些事,总共7卷
每一卷单独保存一个文件夹,每一章保存为一个文件
分析:
1、首先获取每一卷的名称、每一章的名称,每一章详情的url
2、数据格式为
["juan_name": 万国来朝,chapter_name:第一章, chapter_url:xxx]
["juan_name": 万国来朝,chapter_name:第二章, chapter_url:xxx]
["juan_name": 万国来朝,chapter_name:第三章, chapter_url:xxx]
["juan_name": 最后一章,chapter_name:第一章, chapter_url:xxx]
....
3、使用协程获取每一章的详情数据,并且保存为文件
"""
import requests
from lxml import etree
import asyncio
import aiohttp    # pip install aiohttp
import aiofiles   # pip install aiofiles
import os
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}

def get_chapter_info(url):
    # the table of contents is a single page, so a plain blocking request is fine here
    resp = requests.get(url=url, headers=headers)
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    divs = tree.xpath("//div[@class='mulu']")   # one div per volume
    result = []
    for div in divs:
        trs = div.xpath(".//table/tr")
        # the first row holds the volume title
        juan_name = trs[0].xpath(".//a/text()")
        juan_name = "".join(juan_name).replace(":", "_").strip()
        # the remaining rows hold the chapters, several per row
        for tr in trs[1:]:
            tds = tr.xpath(".//td")
            for td in tds:
                txt = td.xpath(".//a/text()")
                href = td.xpath(".//a/@href")
                txt = "".join(txt).strip().replace(" ", "")
                href = "".join(href).strip()
                result.append({
                    "chapter_name": txt,
                    "chapter_url": href,
                    "juan_name": juan_name,
                })
    return result
async def download_chapter_one(url, file_path):
    print("starting download")
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            page_source = await resp.text(encoding="utf-8")
            tree = etree.HTML(page_source)
            text = tree.xpath("//div[@class='content']//p//text()")
            text = "".join(text).strip().replace("\r", "")
            # aiofiles writes the file without blocking the event loop
            async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
                await f.write(text)

async def download_chapter(chapter_lst):
    tasks = []
    for chapter in chapter_lst:
        juan = chapter["juan_name"]
        name = chapter["chapter_name"]
        url = chapter["chapter_url"]
        os.makedirs(juan, exist_ok=True)    # one folder per volume
        file_path = f"{juan}/{name}.txt"
        f = download_chapter_one(url, file_path)
        t = asyncio.create_task(f)          # one task per chapter
        tasks.append(t)
    await asyncio.wait(tasks)
def main():
    url = 'https://www.mingchaonaxieshier.com/'
    chapter_lst = get_chapter_info(url)            # step 1: blocking fetch of the table of contents
    asyncio.run(download_chapter(chapter_lst))     # step 2: download every chapter concurrently

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(f"download finished, total time: {end - start}")
```