python爬虫(8)-百度小说西游记
# -*- coding: utf-8 -*-
"""
@Time   : 2022/3/23 11:26
@Author : Andrew
@File   : 协程爬小说.py

Scrape the novel "Journey to the West" from the Baidu Dushu API.

1. Synchronous step:  getCatalog       -> fetch every chapter's cid and title.
2. Asynchronous step: getChapterContent -> download each chapter's body concurrently.

Endpoints (for reference):
  catalog : http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}
  chapter : http://dushu.baidu.com/api/pc/getChapterContent?data=
            {'book_id':..., 'cid':'<book_id>|<cid>', 'need_bookinfo':1}
"""
import asyncio
import json
import os
import time

import aiofiles
import aiohttp
import requests


async def aiodownload(cid, b_id, title):
    """Download one chapter and write it to ./西游记小说异步爬取/<title>.txt.

    :param cid:   chapter id returned by the catalog endpoint
    :param b_id:  book id
    :param title: chapter title, used as the output file name
                  (assumed not to contain characters illegal in file names
                  — TODO confirm against the actual catalog data)
    """
    data = {
        'book_id': b_id,
        'cid': f'{b_id}|{cid}',  # the API expects "book_id|cid"
        'need_bookinfo': 1
    }
    data = json.dumps(data)
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:  # async counterpart of requests.get()
            # Suspend until the JSON body has arrived.
            dic = await resp.json()
            # Async file write; suspends while data is flushed to disk.
            async with aiofiles.open("./西游记小说异步爬取/" + title + ".txt",
                                     mode="w", encoding="utf-8") as f:
                await f.write(dic["data"]["novel"]["content"])

    print(title, ":下载结束!")


async def getCatelog(url, b_id):
    """Fetch the chapter catalog, then download all chapters concurrently.

    :param url:  getCatalog endpoint URL (already includes the book_id)
    :param b_id: book id, forwarded to each chapter download task
    """
    # A single blocking request is acceptable here: no other coroutine is
    # running yet, so nothing is starved while we wait.
    resp = requests.get(url)
    dic = resp.json()
    resp.close()

    # BUGFIX: the output directory was never created, so the very first
    # aiofiles.open(..., "w") raised FileNotFoundError on a fresh run.
    os.makedirs("./西游记小说异步爬取", exist_ok=True)

    tasks = []
    for item in dic["data"]["novel"]["items"]:
        title = item["title"]
        cid = item["cid"]
        # Everything above is synchronous; from here on we only *prepare*
        # the coroutines — nothing runs until they are awaited below.
        tasks.append(aiodownload(cid, b_id, title))

    # BUGFIX: asyncio.wait() with bare coroutines was deprecated in 3.8 and
    # raises TypeError on Python 3.11+. gather() schedules the coroutines
    # itself and also propagates the first exception instead of hiding it.
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    b_id = "4306063500"
    # NOTE: str(4306063500) yields 4306063500 *without* surrounding quotes,
    # so b_id must be kept as a string literal for the JSON in the URL to be
    # well-formed ("book_id":"4306063500").
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    time1 = time.time()
    asyncio.run(getCatelog(url, b_id))  # start the async pipeline
    time2 = time.time()
    print(time2 - time1)