connie_tong

导航

 
"""
模拟浏览器登录-处理cookie
防盗链处理-抓取梨视频
代理-防止被封IP
"""

# http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}
#
# http://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":"4306063500","cid":"4306063500|1569782244","need_bookinfo":1}
# Host: dushu.baidu.com
# User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0
import requests
import asyncio
import aiohttp
import aiofiles
import json

# 定义一个下载小说内容的方法
async def aiodownload(cid, b_id, title):
    """Download one chapter and save its text to ``./novel/<title>``.

    Args:
        cid: Chapter id as listed in the catalog; it is combined with the
            book id into the ``"<b_id>|<cid>"`` value the request sends.
        b_id: Book id.
        title: Chapter title, used verbatim as the output file name.

    The response is parsed as JSON and the text found under
    ``data -> novel -> content`` is written to the file as UTF-8.
    """
    # Local import so this function does not depend on the file-level
    # import block being changed.
    import os

    # The endpoint takes its arguments as a JSON string in the ``data``
    # query parameter, so the dict has to be serialized first.
    query = json.dumps({
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1,
    })
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={query}"

    # Make sure the output directory exists; otherwise opening the file for
    # writing fails with FileNotFoundError on a fresh checkout.
    os.makedirs('./novel', exist_ok=True)

    # ClientSession must be instantiated: the class object itself is not an
    # async context manager, so ``async with aiohttp.ClientSession`` fails.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            # aiofiles keeps the file write asynchronous as well, so other
            # downloads can progress while this chapter is flushed to disk.
            async with aiofiles.open('.//novel//' + title, mode="w", encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])


# 定义一个获取目录方法
async def get_catalog(main_links):
    """Fetch the book catalog and download every chapter concurrently.

    Args:
        main_links: Full URL of the catalog endpoint for the book.

    The catalog is fetched with a blocking ``requests.get`` call; this
    happens before any download task exists, so nothing else is stalled by
    it. One ``aiodownload`` task is then scheduled per entry in
    ``data -> novel -> items`` and the function waits for all of them.

    NOTE: the book id is read from the module-level name ``b_id``, which is
    only assigned when this file is run as a script.
    """
    resp = requests.get(main_links)
    dic = resp.json()

    # Each item carries the chapter's title and cid. Wrap every coroutine in
    # a Task explicitly: passing bare coroutines to asyncio.wait() was
    # deprecated in Python 3.8 and raises TypeError from 3.11 on.
    tasks = [
        asyncio.create_task(aiodownload(item['cid'], b_id, item['title']))
        for item in dic['data']['novel']['items']
    ]

    # asyncio.wait() raises ValueError on an empty collection.
    if not tasks:
        return

    await asyncio.wait(tasks)



if __name__ == '__main__':
    # Book id; kept as a module-level name because get_catalog reads it as
    # a global.
    b_id = '4306063500'
    # Catalog endpoint: the query carries a JSON object with the book id.
    main_links = f'http://dushu.baidu.com/api/pc/getCatalog?data={{"book_id":"{b_id}"}}'
    # get_catalog is a coroutine function, so calling it directly would only
    # create a coroutine object; asyncio.run drives it on an event loop.
    asyncio.run(get_catalog(main_links))

 

posted on 2022-04-09 14:02  connie_tong  阅读(71)  评论(0)  编辑  收藏  举报