用协程扒光百度小说中的《西游记》

# 不用协程
"""
import os
import asyncio
import requests
import aiofiles as aiofiles
from lxml import etree
import aiohttp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57'
}


# chapter_url = 'https://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":'+b_id+',"cid":"'+b_id+'|1569782244","need_bookinfo":1}'


def getnovel(url):
    # 如果文件夹不存在则创建文件夹,用来放小说
    if not os.path.exists("other/novel"):
        os.mkdir("other/novel")

    resp = requests.get(url=url, headers=headers)

    # 取得第一个标题和cid
    # print(resp.json()['data']['novel']['items'][0]['title'])
    # print(resp.json()['data']['novel']['items'][0]['cid'])

    all_chapter = resp.json()['data']['novel']['items']

    for chapter in all_chapter:
        # 每个标题跟cid
        # print(chapter['title'])
        # print(chapter['cid'])
        title = chapter['title']
        cid = chapter['cid']
        # 拼接每个章节的 url
        chapter_url = 'https://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":' + b_id + ',"cid":"' + b_id + '|' + cid + '","need_bookinfo":1}'
        # print(chapter_url)

        chapter_resp = requests.get(url=chapter_url, headers=headers).json()
        # 获得每个章节的内容
        # print(chapter_resp['data']['novel']['content'])

        # 存储文件
        chapter_text = chapter_resp['data']['novel']['content']
        with open('other/novel/' + title, mode='w', encoding='utf-8') as f:
            f.write(chapter_text)

        # 测试用
        break
    # 章节内容
    # print(resp.json()['data']['novel']['content'])
    resp.close()


if __name__ == '__main__':
    b_id = "4306063500"
    # 不能用f'' 因为{"book_id":4306063500}中原本就存在{}

    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":' + b_id + '}'
    getnovel(url)
"""

# 协程操作

import os
import asyncio
import requests
import aiofiles
import aiohttp

# Browser-like request headers sent with every HTTP request made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57'
    ),
}


# chapter_url = 'https://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":'+b_id+',"cid":"'+b_id+'|1569782244","need_bookinfo":1}'

def _safe_filename(title):
    """Return *title* with characters that are unsafe in a file name replaced by '_'."""
    # Path separators would let a remote-supplied title escape the output
    # directory; the remaining characters are rejected by Windows file systems.
    unsafe = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if (ch in unsafe or ord(ch) < 32) else ch
        for ch in title
    )
    return cleaned or 'untitled'


async def aiodownload(chapter_url, title):
    """Download one chapter and save it as ``other/novel/<title>.txt``.

    Args:
        chapter_url: Full URL of the chapter-content API endpoint.
        title: Chapter title; used (after sanitising) as the file name.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        KeyError: If the JSON payload lacks ``data -> novel -> content``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url=chapter_url, headers=headers) as resp:
            # Fail with a clear HTTP error instead of an obscure JSON/KeyError later.
            resp.raise_for_status()
            # Reading the body is itself a wait, hence the await.
            chapter_resp = await resp.json()

    # The connection is released before touching the disk.
    chapter_text = chapter_resp['data']['novel']['content']

    async with aiofiles.open('other/novel/' + _safe_filename(title) + '.txt', mode='w', encoding='utf-8') as f:
        # chapter_text is a plain str and must not be awaited itself;
        # only the write call is awaitable.
        await f.write(chapter_text)
    print(title, "下载成功!!!")


async def getnovel(url, book_id=None):
    """Fetch the book catalogue at *url* and download every chapter concurrently.

    Args:
        url: Catalogue API URL for the book.
        book_id: Book id as a string, used to build each chapter URL.
            When omitted, the module-level ``b_id`` set by the script
            entry point is used.

    Raises:
        requests.HTTPError: If the catalogue request returns an HTTP error.
        KeyError: If the catalogue JSON lacks ``data -> novel -> items``.
    """
    if book_id is None:
        book_id = b_id

    # makedirs also creates the missing parent "other/" and tolerates an
    # already-existing directory, which a bare mkdir does not.
    os.makedirs("other/novel", exist_ok=True)

    # The catalogue is fetched synchronously; no download task has been
    # scheduled yet, so blocking here delays nothing. The context manager
    # closes the response even if parsing fails.
    with requests.get(url=url, headers=headers) as resp:
        resp.raise_for_status()
        # Each item carries a chapter title and its cid.
        all_chapter = resp.json()['data']['novel']['items']

    titles = []
    tasks = []
    for chapter in all_chapter:
        title = chapter['title']
        cid = chapter['cid']
        # The query embeds literal JSON, so it is assembled by concatenation.
        chapter_url = 'https://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":' + book_id + ',"cid":"' + book_id + '|' + cid + '","need_bookinfo":1}'

        titles.append(title)
        tasks.append(asyncio.create_task(aiodownload(chapter_url, title)))

    # gather accepts an empty task list (asyncio.wait raises ValueError) and,
    # with return_exceptions=True, hands back each failure instead of
    # dropping it, so one bad chapter neither aborts the rest nor goes unseen.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for title, result in zip(titles, results):
        if isinstance(result, BaseException):
            print(title, "下载失败:", repr(result))


if __name__ == '__main__':
    # Book id of the novel to download; getnovel() reads it as a module-level name.
    b_id = "4306063500"

    # An f-string is avoided on purpose: the query value is literal JSON and
    # already contains braces.
    url = ''.join(('https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":', b_id, '}'))

    asyncio.run(getnovel(url))
posted @ 2023-03-04 00:56  0x1e61  阅读(60)  评论(0)  编辑  收藏  举报