# 14、爬虫-异步协程爬取西游记小说-aiohttp-aiofiles
# 注意:所有的 IO 操作都要加上 await 挂起
"""
https://dushu.baidu.com/pc/detail?gid=4306063500 #小说网站
https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%224306063500%22} #可以拿到每个章节的bookid、cid(每个章节的id)、这里%22相当于 " 号
https://dushu.baidu.com/api/pc/getChapterContent?data={%22book_id%22:%224306063500%22,%22cid%22:%224306063500|1569782244%22,%22need_bookinfo%22:1} #此链接可以获取章节内容,替换链接中 "cid":"4306063500|1569782244" 里的章节 id 即可获取每个章节的内容
"""
import requests
import asyncio
import aiohttp
import aiofiles
import json
async def get_Catalog(url):
    """Fetch the book catalogue and download every listed chapter concurrently.

    Requests ``url``, reads the chapter list found under
    ``data -> novel -> items`` in the JSON reply, and schedules one
    ``aiodownload`` call per item using that item's ``cid`` and ``title``.
    Returns once every scheduled download has finished (successfully or not).

    Args:
        url: Catalogue endpoint URL.

    Note:
        ``book_id`` is not a parameter; it is read from module scope and must
        be defined there before this coroutine runs.
    """
    # Single blocking request made before any download task exists, so it
    # does not stall other work on the event loop.
    response = requests.get(url)
    catalog = response.json()
    items = catalog['data']['novel']['items']

    # Wrap every coroutine in a Task: handing bare coroutines to
    # asyncio.wait() was deprecated in Python 3.8 and raises TypeError
    # from Python 3.11 onwards.
    tasks = [
        asyncio.ensure_future(aiodownload(item['cid'], book_id, item['title']))
        for item in items
    ]

    # asyncio.wait() raises ValueError when given an empty collection.
    if tasks:
        await asyncio.wait(tasks)
async def aiodownload(cid, book_id, title):
    """Download a single chapter and write its text to a file named ``title``.

    Args:
        cid: Chapter id; combined with ``book_id`` as ``"<book_id>|<cid>"``
            in the request.
        book_id: Id of the book the chapter belongs to.
        title: Chapter title, used as the output file name.
    """
    # The endpoint receives its arguments as a JSON object in the ``data``
    # query parameter, for example:
    # {"book_id": "4306063500", "cid": "4306063500|1569782244", "need_bookinfo": 1}
    payload = {"book_id": book_id, "cid": f"{book_id}|{cid}", "need_bookinfo": 1}
    data = json.dumps(payload)
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    async with aiohttp.ClientSession() as session, session.get(url) as response:
        # Reading and decoding the body is I/O, so it must be awaited.
        chapter = await response.json()
        # aiofiles keeps the file write from blocking the event loop.
        async with aiofiles.open(f"E:\\learn\\python\\爬虫学习库\\协程\\西游记\\{title}", "w", encoding="utf-8") as out:
            # The chapter text sits under data -> novel -> content.
            await out.write(chapter['data']['novel']['content'])
if __name__ == '__main__':
    # Module-level on purpose: get_Catalog reads ``book_id`` from module scope.
    book_id = "4306063500"
    # Build the catalogue URL from book_id so the two values cannot drift apart.
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + book_id + '"}'

    # Create the loop explicitly: calling asyncio.get_event_loop() with no
    # current loop is deprecated and raises RuntimeError on Python 3.14+.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(get_Catalog(url))
    # NOTE(review): the loop is left open, exactly as before. asyncio.run()
    # would be the usual entry point, but the original had it commented out,
    # presumably to avoid "Event loop is closed" errors at exit on Windows.
    # Confirm before switching.