14、爬虫-异步协程爬取西游记小说-aiohttp-aiofiles

 

注意:所有 IO 操作前都要加上 await,将其挂起

"""
https://dushu.baidu.com/pc/detail?gid=4306063500   #小说网站
https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%224306063500%22}   #可以拿到每个章节的bookid、cid(每个章节的id)、这里%22相当于 " 号

https://dushu.baidu.com/api/pc/getChapterContent?data={%22book_id%22:%224306063500%22,%22cid%22:%224306063500|1569782244%22,%22need_bookinfo%22:1}   #此链接可以获取章节内容;替换链接中 "cid":"4306063500|1569782244" 里 | 后面的章节 id,即可获取每个章节的内容

"""

import requests
import asyncio
import aiohttp
import aiofiles
import json



#获取小说的标题和各个章节的cid
async def get_Catalog(url):
    """Fetch the book catalog and download every chapter concurrently.

    The catalog response is parsed as JSON and the chapter list is read from
    ``data -> novel -> items``; each item supplies a ``title`` and a ``cid``.
    One ``aiodownload`` coroutine is started per chapter and all of them are
    awaited together.

    NOTE: this function reads the module-level ``book_id`` that the script's
    ``__main__`` block defines; it must be set before the coroutine runs.

    Args:
        url: Catalog API URL of the book.

    Raises:
        requests.HTTPError: If the catalog request returns an error status.
        KeyError: If the catalog JSON does not have the expected layout.
    """
    # Single blocking request, issued before any concurrent work is scheduled.
    response = requests.get(url)
    response.raise_for_status()
    dict_text = response.json()

    items = dict_text['data']['novel']['items']
    titles = [item['title'] for item in items]
    downloads = [
        aiodownload(item['cid'], book_id, item['title']) for item in items
    ]

    # asyncio.wait() refuses bare coroutines on Python 3.11+ and raises on an
    # empty collection; gather() accepts both. return_exceptions=True lets
    # every chapter finish even when some of them fail.
    results = await asyncio.gather(*downloads, return_exceptions=True)

    # Surface failures instead of leaving them unretrieved inside the tasks.
    for title, result in zip(titles, results):
        if isinstance(result, BaseException):
            print(f"download failed: {title}: {result!r}")

async def aiodownload(cid, book_id, title):
    """Download one chapter and write its text to a file named after it.

    The chapter is requested from the ``getChapterContent`` API with a JSON
    ``data`` query of the form
    ``{"book_id":"<book_id>","cid":"<book_id>|<cid>","need_bookinfo":1}``.
    The text found under ``data -> novel -> content`` in the response is
    written, UTF-8 encoded, to ``<output directory>\\<title>``.

    Args:
        cid: Chapter id as listed in the catalog.
        book_id: Id of the book the chapter belongs to.
        title: Chapter title; used verbatim as the output file name.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error status.
        KeyError: If the response JSON does not have the expected layout.
        OSError: If the output directory or file cannot be created.
    """
    import os  # local import: only needed to ensure the output directory exists

    # Compact separators keep whitespace out of the query string, matching
    # the URL shape the site itself uses.
    payload = json.dumps(
        {
            "book_id": book_id,
            "cid": f"{book_id}|{cid}",
            "need_bookinfo": 1,
        },
        separators=(",", ":"),
    )
    url = f"https://dushu.baidu.com/api/pc/getChapterContent?data={payload}"

    output_dir = "E:\\learn\\python\\爬虫学习库\\协程\\西游记"
    # Without this, every write fails when the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Fail loudly on HTTP errors rather than parsing an error page.
            response.raise_for_status()
            dict_text = await response.json()

    # Extract the text before opening the file so a malformed response does
    # not leave an empty file behind.
    content = dict_text['data']['novel']['content']

    async with aiofiles.open(f"{output_dir}\\{title}", "w", encoding="utf-8") as file:
        await file.write(content)



if __name__ == '__main__':
    # Script entry point: download every chapter of the configured book.
    # book_id stays a module-level name because get_Catalog reads it.
    book_id = "4306063500"
    # Build the catalog URL from book_id so the id is defined in one place.
    # Doubled braces are literal braces; the result is
    # ...getCatalog?data={"book_id":"4306063500"}
    url = f'https://dushu.baidu.com/api/pc/getCatalog?data={{"book_id":"{book_id}"}}'

    # asyncio.get_event_loop() with no current loop is deprecated and fails
    # on recent Python versions, so create and install the loop explicitly.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # As in the original script, the loop is not closed explicitly; the
    # process exits right after the run completes.
    loop.run_until_complete(get_Catalog(url))
posted @ 2024-07-01 22:27  little小新  阅读(11)  评论(0编辑  收藏  举报