复制代码

用 Python 协程爬取百度小说《西游记》

前言

方法不止一种,如有不同见解,欢迎一起讨论。

"""
使用协程爬取百度小说中的西游记整部小说
"""
import asyncio
import aiohttp
import aiofiles
import requests
from lxml import etree


async def async_download(title, url):
    """
    Download one chapter page and save its text to ``西游记/<title>.txt``.

    The page at ``url`` is fetched with its own aiohttp session, the text
    nodes directly under ``<dd id="contents">`` are concatenated (nodes that
    are exactly a CRLF are dropped) and the result is written as UTF-8.

    :param title: chapter title, used as the output file name (no extension)
    :param url: URL of the chapter page
    :return: None
    """
    import os  # local import so this block does not depend on file-level edits

    out_dir = "西游记"
    # Create the output directory on first use so the write below cannot
    # fail with FileNotFoundError when the folder does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    file_name = "%s/%s.txt" % (out_dir, title)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # The body must be read while the response is still open.
            html = await resp.text()

    # Parsing and writing do not need the connection, so it is released first.
    tree = etree.HTML(html)
    contents = tree.xpath("//dd[@id='contents']/text()")
    text = "".join(part for part in contents if part != '\r\n')

    async with aiofiles.open(file_name, mode='w', encoding='utf-8') as f:
        await f.write(text)
    print("%s ...... 下载完成!" % title)


async def main(td_as):
    """
    Schedule one download task per chapter link and wait for all of them.

    Each element is queried with ``./@href`` for the chapter URL and
    ``./text()`` for the chapter title; elements lacking either are skipped.
    A failed download does not stop the others: every failure is printed
    once all tasks have finished.

    :param td_as: iterable of link elements supporting ``.xpath()``
    :return: None
    """
    titles = []
    tasks = []
    for td in td_as:
        hrefs = td.xpath("./@href")
        texts = td.xpath("./text()")
        if not hrefs or not texts:
            # Anchor without a target or a visible title: nothing to download.
            continue
        titles.append(texts[0])
        tasks.append(asyncio.create_task(async_download(texts[0], hrefs[0])))

    # gather() accepts an empty task list (asyncio.wait() raises ValueError
    # on one), and return_exceptions=True lets every chapter run to the end
    # so failures can be reported instead of being silently dropped.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for title, result in zip(titles, results):
        if isinstance(result, BaseException):
            print("%s ...... 下载失败: %r" % (title, result))


if __name__ == '__main__':
    # Script entry point: fetch the table-of-contents page, collect every
    # chapter link and download all chapters concurrently.
    url = 'http://www.wibaidu.com/modules/article/reader.php?aid=24537'
    # A timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(url, timeout=30)
    # Fail early with a clear HTTP error instead of parsing an error page.
    resp.raise_for_status()
    # Use the charset detected from the body rather than the header default.
    resp.encoding = resp.apparent_encoding
    tree = etree.HTML(resp.text)
    td_as = tree.xpath("//td[@class='L']/a")
    # asyncio.run() creates and closes its own event loop; calling
    # get_event_loop() with no running loop is deprecated.
    asyncio.run(main(td_as))

 

posted @ 2022-04-19 11:46  怪~咖  阅读(199)  评论(0)  编辑  收藏  举报
复制代码