14、爬虫-异步协程爬取西游记小说-aiohttp-aiofiles
注意:所有 IO 操作(网络请求、文件读写)前都要加上 await 进行挂起
"""Download the chapters of "Journey to the West" concurrently with aiohttp and aiofiles.

Novel page:
    https://dushu.baidu.com/pc/detail?gid=4306063500
Catalog API (each item carries a chapter title and cid; %22 is a URL-encoded double quote):
    https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%224306063500%22}
Chapter API (replace the "cid" value to fetch a different chapter):
    https://dushu.baidu.com/api/pc/getChapterContent?data={%22book_id%22:%224306063500%22,%22cid%22:%224306063500|1569782244%22,%22need_bookinfo%22:1}
"""
import asyncio
import json
from urllib.parse import parse_qs, urlsplit

import aiofiles
import aiohttp
import requests


def _book_id_from_url(url):
    """Return the ``book_id`` stored in the JSON ``data`` query parameter of *url*.

    Raises:
        ValueError: if the URL has no ``data`` parameter or it does not
            contain a ``book_id`` entry.
    """
    try:
        payload = parse_qs(urlsplit(url).query)["data"][0]
        return json.loads(payload)["book_id"]
    except (KeyError, ValueError, TypeError) as err:
        raise ValueError(f"cannot determine book_id from catalog URL: {url!r}") from err


def _fetch_catalog(url):
    """Blocking helper: GET *url* with requests and return the decoded JSON body."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()


async def get_Catalog(url, book_id=None):
    """Fetch the chapter catalog and download every chapter concurrently.

    Args:
        url: Catalog API URL; its ``data`` query parameter holds the book id.
        book_id: Id of the book. When omitted it is parsed from *url*, so the
            function no longer depends on a module-level ``book_id`` global.

    Failures of individual chapters are printed and do not abort the
    remaining downloads.
    """
    if book_id is None:
        book_id = _book_id_from_url(url)

    # requests is synchronous; run it in the default executor so the event
    # loop is not blocked while waiting for the catalog.
    loop = asyncio.get_running_loop()
    dict_text = await loop.run_in_executor(None, _fetch_catalog, url)

    # Each catalog item provides the chapter title and its cid.
    items = dict_text['data']['novel']['items']

    # One shared session for all chapters instead of one session per request.
    async with aiohttp.ClientSession() as session:
        # gather() accepts coroutines directly (asyncio.wait() rejects them
        # since Python 3.11) and also copes with an empty catalog.
        # return_exceptions=True keeps one failed chapter from cancelling
        # the others.
        results = await asyncio.gather(
            *(
                aiodownload(item['cid'], book_id, item['title'], session)
                for item in items
            ),
            return_exceptions=True,
        )

    for item, result in zip(items, results):
        if isinstance(result, BaseException):
            print(f"failed to download {item['title']!r}: {result!r}")


async def aiodownload(cid, book_id, title, session=None):
    """Download one chapter and write its text to a file named after *title*.

    Args:
        cid: Chapter id as listed in the catalog.
        book_id: Id of the book the chapter belongs to.
        title: Chapter title; used as the output file name.
        session: Optional ``aiohttp.ClientSession`` to reuse. When omitted a
            temporary session is opened for this single request.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            await aiodownload(cid, book_id, title, own_session)
        return

    # The chapter API expects a JSON document in the "data" query parameter,
    # e.g. {"book_id": "4306063500", "cid": "4306063500|1569782244", "need_bookinfo": 1}
    data = json.dumps({
        "book_id": book_id,
        "cid": f"{book_id}|{cid}",
        "need_bookinfo": 1,
    })
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    # Both the request and the body read are awaited so other downloads can
    # progress while this one waits on the network.
    async with session.get(url) as response:
        dict_text = await response.json()

    # aiofiles keeps the file write from blocking the event loop.
    async with aiofiles.open(f"E:\\learn\\python\\爬虫学习库\\协程\\西游记\\{title}", "w", encoding="utf-8") as file:
        await file.write(dict_text['data']['novel']['content'])


if __name__ == '__main__':
    book_id = "4306063500"
    # Build the catalog URL from book_id so the id is written only once.
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + book_id + '"}'
    # asyncio.run() creates and closes the event loop itself;
    # asyncio.get_event_loop() outside a running loop is deprecated.
    asyncio.run(get_Catalog(url, book_id))
本文作者:little小新
本文链接:https://www.cnblogs.com/littlecc/p/17978465
版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步