python爬虫(8)-百度小说西游记

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/23 11:26
@Author  : Andrew
@File    : 协程爬小说.py
"""
import aiofiles
import requests
from lxml import etree
import asyncio
import aiohttp

"""
    1.同步操作:访问getCatelog 拿到所有章节的cid和名称
    2.异步操作:访问getChapterContent 下载文章内容
"""

# # 所有章节的{名称,cid}
# url1 = 'http://dushu.baidu.com/api/pc/getDetail?data={"book_id":"4306063500"}'
# # 小说的一节具体内容
# url2 = "http://dushu.baidu.com/api/pc/getChapterContent?data={'book_id':'4306063500','cid':'4306063500|1569782244'," \
#        "'need_bookinfo':1} "
import json

import time
async def aiodownload(cid, b_id, title):
    """Download one chapter and save its text as ``<title>.txt``.

    Args:
        cid: Chapter id taken from the catalog response.
        b_id: Book id; sent as ``book_id`` and combined with ``cid`` as
            ``"<b_id>|<cid>"`` in the request payload.
        title: Chapter title, used verbatim as the output file name.

    Raises:
        KeyError: If the response JSON has no ``data -> novel -> content``.
    """
    import os  # function-scope import so this block does not depend on a file-level edit

    out_dir = "./西游记小说异步爬取/"
    # The write below fails with FileNotFoundError when the directory does
    # not exist yet, so create it up front (no-op if it is already there).
    os.makedirs(out_dir, exist_ok=True)

    data = json.dumps({
        'book_id': b_id,
        'cid': f'{b_id}|{cid}',
        'need_bookinfo': 1
    })
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:  # async counterpart of requests.get()
            dic = await resp.json()  # suspends until the body has been received

    # Pull the text out before opening the file so that a malformed response
    # does not leave an empty file behind.
    content = dic["data"]["novel"]["content"]

    async with aiofiles.open(out_dir + title + ".txt", mode="w", encoding="utf-8") as f:
        await f.write(content)
    print(title, ":下载结束!")


async def getCatelog(url, b_id):
    """Fetch the chapter catalog, then download all chapters concurrently.

    Args:
        url: Catalog endpoint URL. Its JSON response is read at
            ``data -> novel -> items``; each item supplies ``title`` and ``cid``.
        b_id: Book id forwarded to ``aiodownload`` for every chapter.

    A chapter that fails to download is reported on stdout and does not
    stop the remaining downloads.
    """
    # The catalog is one synchronous request made before any async work starts.
    resp = requests.get(url)
    try:
        dic = resp.json()
    finally:
        # Release the connection even when the body is not valid JSON.
        resp.close()

    items = dic["data"]["novel"]["items"]
    # asyncio.wait() no longer accepts bare coroutine objects (TypeError on
    # Python 3.11+) and raises ValueError on an empty collection; gather()
    # handles both. return_exceptions=True keeps the best-effort behaviour:
    # one failed chapter does not cancel the others.
    results = await asyncio.gather(
        *(aiodownload(item["cid"], b_id, item["title"]) for item in items),
        return_exceptions=True,
    )
    for item, result in zip(items, results):
        if isinstance(result, BaseException):
            print(item["title"], ":下载失败!", repr(result))


if __name__ == "__main__":
    # The book id is kept as a plain string. Concatenating it only inserts
    # the digits 4306063500 (str(4306063500) would do the same), so the
    # double quotes the JSON needs are written in the literal pieces around it.
    b_id = "4306063500"
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'

    # Time the whole crawl: catalog fetch plus all chapter downloads.
    started = time.time()
    asyncio.run(getCatelog(url, b_id))  # drives the coroutine to completion
    print(time.time() - started)

 

posted @ 2022-03-23 15:34  乔十六  阅读(304)  评论(0)  编辑  收藏  举报