- Introduction: neither multithreading nor multiprocessing actually solves one performance problem: IO blocking. Whenever a thread or process hits blocking IO, the operating system forcibly revokes its CPU time (so the CPU can run something else, which may be another part of our program or a different application entirely), and our program's execution efficiency drops as a result.
- The asyncio module (in the standard library since Python 3.4; the async/await syntax arrived in 3.5) can detect IO for us (network IO only; an HTTP connection is a network IO operation) and switch between tasks at the application level (asynchronous IO).
1. Basic coroutine syntax
```python
import asyncio

async def func():
    print("I am a task")

if __name__ == '__main__':
    f = func()                             # calling a coroutine function only builds a coroutine object; nothing runs yet
    event_loop = asyncio.get_event_loop()  # get an event loop (this usage is deprecated since Python 3.10)
    event_loop.run_until_complete(f)       # drive the coroutine to completion
```
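Since Python 3.7, `asyncio.run()` bundles the create-loop / run / close-loop steps into a single call, which is why the later examples use it; the snippet above is equivalent to:

```python
import asyncio

async def func():
    print("I am a task")

if __name__ == '__main__':
    asyncio.run(func())   # creates an event loop, runs the coroutine, then closes the loop
```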
```python
import asyncio

async def faker1():
    print("task 1 started")
    await asyncio.sleep(1)
    print("task 1 done")
    return "task 1 finished"

async def faker2():
    print("task 2 started")
    await asyncio.sleep(2)
    print("task 2 done")
    return "task 2 finished"

async def faker3():
    print("task 3 started")
    await asyncio.sleep(3)
    print("task 3 done")
    return "task 3 finished"

async def main():
    tasks = [
        asyncio.create_task(faker3()),
        asyncio.create_task(faker1()),
        asyncio.create_task(faker2()),
    ]
    # wait() returns two sets: finished tasks and still-pending tasks (empty here)
    done, pending = await asyncio.wait(tasks)
    for t in done:
        print(t.result())       # sets are unordered, so the print order is arbitrary
    # the tasks have already finished, so gather() returns immediately;
    # its results keep the order in which the tasks were passed in
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for r in results:
        print(r)

if __name__ == '__main__':
    asyncio.run(main())
```
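The `(done, pending)` pair from `asyncio.wait` only becomes interesting once a `timeout` is involved; a minimal sketch (the 10-second delay is purely illustrative):

```python
import asyncio

async def slow():
    await asyncio.sleep(10)

async def main():
    task = asyncio.create_task(slow())
    # give up waiting after 1 second; unfinished tasks land in the pending set
    done, pending = await asyncio.wait({task}, timeout=1)
    print(len(done), len(pending))   # prints: 0 1
    for t in pending:
        t.cancel()                   # wait() does not cancel leftovers for you

asyncio.run(main())
```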
```python
import asyncio
import time

async def get_page_info(url):
    print("sending request to", url)
    await asyncio.sleep(3)   # simulate network IO: requests is blocking and cannot be awaited
    print("got the page source")
    return "I am the page source"

async def main():
    urls = [
        "http://www.baidu.com",
        "http://www.taobao.com",
        "http://www.google.com",
    ]
    tasks = []
    for url in urls:
        f = get_page_info(url)        # coroutine object
        t = asyncio.create_task(f)    # schedule it on the running event loop
        tasks.append(t)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())               # the three 3-second "requests" overlap, so this takes ~3s, not ~9s
    end = time.time()
    print(f"total time: {end - start}")
```
- 4.1: Requesting pages with aiohttp (skeleton; as written below it fails at runtime, the fixes follow in 4.2)

```python
import aiohttp   # third-party async HTTP client: pip install aiohttp

async def get_request(url):
    # create a session object
    with aiohttp.ClientSession() as sess:
        # issue the request from the session object
        # get() sends a GET request; common parameters: url, headers, params, proxy
        # post() sends a POST request; common parameters: url, headers, data, proxy
        # note: the proxy parameter differs from requests; here use proxy='http://ip:port'
        with sess.get(url=url) as response:
            page_text = response.text()
            # text(): response body as a string
            # read(): response body as bytes
            return page_text
```
- 4.2: Additional details
- add the `async` keyword before every `with`
- add the `await` keyword before every blocking operation
```python
import aiohttp

async def get_request(url):
    # requests does not support async, so this will NOT work:
    # response = await requests.get(url=url)
    # page_text = response.text
    # create the session object (sess)
    async with aiohttp.ClientSession() as sess:
        # issue the request from the session object
        # get() sends a GET request; common parameters: url, headers, params, proxy
        # post() sends a POST request; common parameters: url, headers, data, proxy
        # note: the proxy parameter differs from requests; here use proxy='http://ip:port'
        async with sess.get(url=url) as response:
            page_text = await response.text()
            # text(): response body as a string
            # read(): response body as bytes
            return page_text
```
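A minimal driver for `get_request`, assuming the example URLs above are reachable; `gather` returns the page texts in the order the coroutines were passed:

```python
import asyncio

async def main():
    urls = [
        "http://www.baidu.com",
        "http://www.taobao.com",
    ]
    # one coroutine per URL; gather schedules them concurrently
    results = await asyncio.gather(*(get_request(url) for url in urls))
    for page_text in results:
        print(len(page_text))

if __name__ == '__main__':
    asyncio.run(main())
```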
"""
需求:爬取明朝那些事,总共7卷
每一卷单独保存一个文件夹,每一章保存为一个文件
分析:
1、首先获取每一卷的名称、每一章的名称,每一章详情的url
2、数据格式为
["juan_name": 万国来朝,chapter_name:第一章, chapter_url:xxx]
["juan_name": 万国来朝,chapter_name:第二章, chapter_url:xxx]
["juan_name": 万国来朝,chapter_name:第三章, chapter_url:xxx]
["juan_name": 最后一章,chapter_name:第一章, chapter_url:xxx]
....
3、使用协程获取每一章的详情数据,并且保存为文件
"""
import requests
from lxml import etree
import asyncio
import aiohttp    # pip install aiohttp
import aiofiles   # pip install aiofiles
import os
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}

def get_chapter_info(url):
    # the table of contents is a single page, so a plain blocking request is fine here
    resp = requests.get(url=url, headers=headers)
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    divs = tree.xpath("//div[@class='mulu']")   # one div per volume
    result = []
    for div in divs:
        trs = div.xpath(".//table/tr")
        # the first row holds the volume title
        juan_name = trs[0].xpath(".//a/text()")
        juan_name = "".join(juan_name).replace(":", "_").strip()
        # the remaining rows hold the chapters, several per row
        for tr in trs[1:]:
            tds = tr.xpath(".//td")
            for td in tds:
                txt = td.xpath(".//a/text()")
                href = td.xpath(".//a/@href")
                txt = "".join(txt).strip().replace(" ", "")
                href = "".join(href).strip()
                result.append({
                    "chapter_name": txt,
                    "chapter_url": href,
                    "juan_name": juan_name,
                })
    return result
async def download_chapter_one(url, file_path):
    print("starting download")
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            page_source = await resp.text(encoding="utf-8")
            tree = etree.HTML(page_source)
            text = tree.xpath("//div[@class='content']//p//text()")
            text = "".join(text).strip().replace("\r", "")
            # aiofiles writes the file without blocking the event loop
            async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
                await f.write(text)

async def download_chapter(chapter_lst):
    tasks = []
    for chapter in chapter_lst:
        juan = chapter["juan_name"]
        name = chapter["chapter_name"]
        url = chapter["chapter_url"]
        os.makedirs(juan, exist_ok=True)    # one folder per volume
        file_path = f"{juan}/{name}.txt"
        f = download_chapter_one(url, file_path)
        t = asyncio.create_task(f)          # one task per chapter
        tasks.append(t)
    await asyncio.wait(tasks)
def main():
    url = 'https://www.mingchaonaxieshier.com/'
    chapter_lst = get_chapter_info(url)            # step 1: blocking fetch of the table of contents
    asyncio.run(download_chapter(chapter_lst))     # step 2: download every chapter concurrently

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(f"download finished, total time: {end - start}")
```