用python 协程 爬百度小说西游记
前言
方法不止一种,如有不同见解,欢迎一起讨论。
"""Scrape the entire novel "Journey to the West" from a Baidu novel site using coroutines.

The table of contents is fetched synchronously with ``requests``; each
chapter page is then downloaded concurrently with ``aiohttp`` and written
to its own text file under the ``西游记/`` directory via ``aiofiles``.
"""
import asyncio
import os

import aiohttp
import aiofiles
import requests
from lxml import etree


async def async_download(title, url):
    """Download one chapter and save it as ``西游记/<title>.txt``.

    :param title: chapter title, used as the output file name
    :param url: URL of the chapter page
    """
    file_name = "西游记/%s.txt" % title
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            tree = etree.HTML(await resp.text())
            # Chapter body lives in <dd id="contents">; its text nodes
            # include bare "\r\n" separators that we drop.
            contents = tree.xpath("//dd[@id='contents']/text()")
            # str.join avoids the quadratic `temp += content` pattern.
            text = ''.join(c for c in contents if c != '\r\n')
    async with aiofiles.open(file_name, mode='w', encoding='utf-8') as f:
        await f.write(text)
    print("%s ...... 下载完成!" % title)


async def main(td_as):
    """Create one download task per chapter link and wait for all of them.

    :param td_as: list of <a> elements, each carrying a chapter href and title
    """
    tasks = []
    for td in td_as:
        url_c = td.xpath("./@href")[0]
        title = td.xpath("./text()")[0]
        tasks.append(asyncio.create_task(async_download(title, url_c)))
    # asyncio.wait() raises ValueError on an empty task set and is
    # deprecated for this use; gather() is the safe modern equivalent
    # and also propagates task exceptions to the caller.
    if tasks:
        await asyncio.gather(*tasks)


if __name__ == '__main__':
    # Program entry point: fetch the table of contents synchronously,
    # then hand the chapter links to the coroutine downloader.
    url = 'http://www.wibaidu.com/modules/article/reader.php?aid=24537'
    resp = requests.get(url)
    # The site declares no charset reliably; let requests sniff it.
    resp.encoding = resp.apparent_encoding
    tree = etree.HTML(resp.text)
    td_as = tree.xpath("//td[@class='L']/a")
    # The output directory must exist before any chapter file is opened,
    # otherwise every aiofiles.open(..., 'w') raises FileNotFoundError.
    os.makedirs("西游记", exist_ok=True)
    # asyncio.run() supersedes the deprecated get_event_loop() /
    # run_until_complete() pattern (the file already uses the 3.7+
    # asyncio.create_task API, so this is compatible).
    asyncio.run(main(td_as))