异步爬虫例子之asyncio

异步爬虫例子：

例子1

主要使用场景：爬多个站，或者批量下载视频图片

import time

import aiohttp  # 3.8.4
import asyncio  # 3.4.3
import re
import os

# os.environ['NO_PROXY'] = 'www.baidu.com'





class Asyn():
    """Concurrent scraper for the DrugBank "indications" search result pages.

    ``main()`` asks for a page count on stdin, downloads every page through
    one shared ``aiohttp.ClientSession`` and, for each page, prints the
    extracted rows and appends them to ``异步采集.txt``.
    """

    def __init__(self):
        # Browser-like request headers sent with every page request.
        # NOTE(review): the cookie is a hard-coded session value copied from a
        # browser; it will presumably expire and should come from
        # configuration rather than source code - confirm with the author.
        self.__headers = {
            'authority': 'go.drugbank.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'cookie': '_omx_drug_bank_session=ZiQulkyeNWQN4VxewoUdpvpjebs36v3xv0adS0ywsWTDX9cXowdJblYyKE0AHmpM9gOmiAlfQyAy2SR0tOFqctQ0APTCNNYzM2Q7hFst70od%2FfCQ7oGSC46qq7bjvdbzwMbrpxtBM0%2BYNK6r3iNyzZHw1d5n%2BvBvkVNDJBaRzB404tsju33MhMrhBJv6lNBjGxH84Dqgu8Ma7oWGC8dWeeZNl9gkZ0yHsMj2F04xDOxu0VLo4Q3LOCjHSALrqoHZrk55%2FGSxYwtfTsHXF%2FUzJ6phGpkHRq519NWQH%2FnsDL9Rc%2FLX8JHYq3LjKdGV%2B8GjdTP%2Bchgn2lv%2BUDFyqIbrnp5NP0cJFLvDB2FSfW4TCItVAsKaI7dIqWC0xcNRbsfQAnRQ0Ix5Jp%2Ft1Za11jirI5OZoEGbDZ9n9dfbBfxFO5aIECa7pOj%2BO668OqXq%2Fd8kEw0YNLw07NFkHDXT3iCMgTiMjyg77aMTFUI7HXF0h2elSRSL8S1oh9e9YcGtAA%3D%3D--0sEdP1iupQN5%2BS4%2B--IpfLjfkZLpD7tOPGb8MAPQ%3D%3D; path=/; HttpOnly; SameSite=Lax',
            'referer': 'https://go.drugbank.com/unearth/q?query=*&button=&searcher=drugs',
            'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }

    async def __fetch(self, session, url):
        """Download one result page, print its rows and append them to the log file.

        Args:
            session: Open ``aiohttp.ClientSession`` used to issue the request.
            url: Address of the result page to download.

        Returns:
            None. A failed request or a page without any indication link is
            reported on stdout and skipped.
        """
        print("发送请求：", url)
        try:
            # ``ssl=False`` is the supported spelling of the deprecated
            # ``verify_ssl=False``; certificate verification stays disabled.
            async with session.get(url, ssl=False, headers=self.__headers) as response:
                content = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # Report the failure here so one bad page cannot abort the others.
            print(f"请求失败，url={url}，error={e}")
            return

        # First column: the link text of every "/indications/..." anchor.
        info1 = re.findall(r'href="/indications/.*?">(.*?)</a', content)
        if not info1:
            # No rows on this page: presumably past the last result page.
            print("最大限度页")
            return

        # Second column: strip the anchor markup so each entry reads
        # "<drug id>:<drug name>", and mark the " / " separator as "【/】".
        info2 = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', content)
        info2_new = [
            i.replace('href="/drugs/', '')
            .replace('">', ':')
            .replace('</a>', '')
            .replace('<a', '')
            .replace(' / ', '【/】')
            for i in info2
        ]

        for yaoming, chenfen in zip(info1, info2_new):
            dic = {
                "药名": yaoming,
                "成分": chenfen
            }
            print(dic)

        # Small synchronous append; it briefly blocks the event loop.
        with open('异步采集.txt', 'a', encoding='utf-8') as f:
            f.write(f'{len(info1), info1}\n{len(info2_new), info2_new}\n')

    async def main(self):
        """Prompt for a page count and fetch pages 1..page concurrently.

        Raises:
            ValueError: If the text typed at the prompt is not an integer.
        """
        page = int(input("输入页数："))
        async with aiohttp.ClientSession() as session:
            url_list = [f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
                        for i in range(1, page + 1)]
            # gather() accepts an empty argument list, whereas asyncio.wait()
            # raises ValueError when the page count is zero or negative.
            await asyncio.gather(*(self.__fetch(session, url) for url in url_list))

    @staticmethod
    def run():
        """Print the ASCII-art start-up banner."""
        # Raw string: the art is full of backslashes that must not be read as
        # escape sequences (the printed text is unchanged).
        print(r"""
        _____ _  Author: 十架bgm         __
        _________   ___ ___    _____________________________________________
        \_   ___ \ /   |   \  /  _  \__    ___/  _____/\______   \__    ___/
        /    \  \//    ~    \/  /_\  \|    | /   \  ___ |     ___/ |    |
        \     \___\    Y    /    |    \    | \    \_\  \|    |     |    |
         \______  /\___|_  /\____|__  /____|  \______  /|____|     |____|
                \/       \/         \/               \/        version=1.1
    
        """)



if __name__ == '__main__':
    # Show the banner first, then run the crawl on a fresh scraper instance.
    Asyn.run()
    asyncio.run(Asyn().main())

除了get请求，aiohttp还支持其它请求类型，如POST、PUT、DELETE等，和requests使用方式类似。

可获取：

# Illustrative fragment: it has to be placed inside an ``async def`` function,
# and ``data`` (the POST payload) is not defined in this snippet.
async with aiohttp.ClientSession() as session:
        async with session.post('https://www.httpbin.org/post', data=data) as response:
            print('status:', response.status)  # status code
            print('headers:', response.headers)  # response headers
            print('body:', await response.text())  # response body as text
            print('bytes:', await response.read())  # response body as raw bytes
            print('json:', await response.json())  # response body parsed as JSON

可设置超时，例如设置 1 秒的总超时：

async def main():
    """Fetch result pages 1-199 concurrently with a 1-second timeout.

    Every request goes through one shared session. Requests that fail or
    exceed the timeout are reported on stdout instead of being left as
    unretrieved task exceptions.
    """
    # Session-wide timeout: one second in total for each request.
    timeout = aiohttp.ClientTimeout(total=1)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        url_list = [f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
                    for i in range(1, 200)]
        # return_exceptions=True keeps one failing page from cancelling the
        # others and hands every error back for reporting below.
        results = await asyncio.gather(
            *(fetch(session, url) for url in url_list),
            return_exceptions=True,
        )
    for url, result in zip(url_list, results):
        if isinstance(result, BaseException):
            # repr() because asyncio.TimeoutError has an empty message.
            print(f"请求失败，url={url}，error={result!r}")

补充
回调函数：

import asyncio
import aiohttp

async def fetch(session, url, callback=None):
    """GET ``url`` and pass the response text to ``callback``.

    Args:
        session: Open ``aiohttp.ClientSession`` (or compatible object) used
            to issue the request.
        url: Address to download.
        callback: Optional callable that receives the response text. It may
            be a plain function or an ``async def`` coroutine function.

    Returns:
        None.
    """
    import inspect

    async with session.get(url) as response:
        data = await response.text()
        if callback:
            outcome = callback(data)
            # An ``async def`` callback only returns a coroutine object; it
            # does not run until awaited, so await it here.
            if inspect.isawaitable(outcome):
                await outcome

async def save_to_file(data):
    """Append ``data`` plus a trailing newline to ``output.txt``.

    Args:
        data: Text to append.
    """
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII page text and would raise UnicodeEncodeError.
    with open('output.txt', 'a', encoding='utf-8') as f:
        f.write(data + '\n')

async def main():
    """Download the example URLs concurrently, saving each body via callback.

    The requests run at the same time, so the order in which the callback
    receives the pages depends on which response arrives first.
    """
    urls = ['https://www.example.com', 'https://www.google.com', 'https://www.yahoo.com']
    async with aiohttp.ClientSession() as session:
        # Awaiting each fetch inside a for-loop would run the requests one
        # after another; gather() schedules them together.
        await asyncio.gather(
            *(fetch(session, url, callback=save_to_file) for url in urls)
        )

if __name__ == "__main__":
    # asyncio.run() creates and closes the event loop itself; calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    asyncio.run(main())

在这个示例中，我们定义了请求函数fetch()和回调函数save_to_file()。fetch()函数执行HTTP请求，并将响应数据传递给回调函数save_to_file()进行处理；save_to_file()函数将数据追加写入文件output.txt。main()函数通过ClientSession创建异步HTTP客户端，遍历URL列表发起请求，并把save_to_file作为回调传给fetch()。注意：save_to_file()是用async def定义的协程函数，fetch()中必须对其调用结果使用await，否则回调不会真正执行。

注意，我们在open()函数中使用'a'参数打开文件，这意味着我们将向文件追加数据，而不是覆盖现有数据。这是因为爬虫可能会在不同的时间点运行，我们希望能够保留之前的数据。

该示例演示了如何使用回调函数将爬虫数据存储到文件中，您可以根据自己的需求修改回调函数来处理数据，例如将数据写入数据库、发送电子邮件等。

回调函数在爬虫中扮演了重要的角色，具有以下几个意义：

异步处理：在爬虫中，由于网络请求可能会花费较长的时间来完成，为了避免爬虫在等待响应时浪费时间，我们通常使用异步方式来进行爬取。而回调函数就是在异步请求完成后被调用的，可以在异步环境下执行额外的处理逻辑。

分离任务：爬虫往往需要同时处理多个任务，例如从不同的网站获取数据。为了简化代码和提高可维护性，我们可以将每个任务的回调函数分别定义，这样可以让代码更加清晰。

处理响应：爬虫的核心任务是从目标网站中提取所需的信息。回调函数可以在获取到响应后执行数据解析和处理操作，将爬虫所需的数据提取出来，并将其存储到数据库、文件或其他数据存储系统中。

错误处理：在爬虫中，网络请求时常会出现错误，例如服务器错误、网络连接中断等。使用回调函数可以让我们在出现错误时执行特定的错误处理逻辑，例如记录错误信息、重新发起请求等，提高爬虫的稳定性和鲁棒性。

因此，回调函数对于构建高效、可维护、稳定的爬虫系统非常重要。

例子2

import asyncio
import aiohttp
from lxml import etree
from bili import headers, connect
from loguru import logger


async def fetch(url, session):
    """Return the body text of a GET request to ``url``.

    The request is sent through ``session`` with the ``headers`` imported
    from the ``bili`` module.
    """
    async with session.get(url, headers=headers) as resp:
        body = await resp.text()
    return body


async def parse(url, session):
    """Download one list page and log a dict for every post row found.

    Each logged dict has the keys ``a`` (view count), ``p`` (comment count),
    ``t`` (title) and ``u`` (update time).
    """
    html = await fetch(url, session)
    tree = etree.HTML(html)

    # The span-text columns drop their first match (presumably the table
    # header row - verify against the page); the title column is kept whole.
    views = tree.xpath('//div[@id="articlelistnew"]/div/span[1]/text()')[1:]
    comments = tree.xpath('//div[@id="articlelistnew"]/div/span[2]/text()')[1:]
    titles = tree.xpath('//div[@id="articlelistnew"]/div/span[3]/a/text()')
    updated = tree.xpath('//div[@id="articlelistnew"]/div/span[5]/text()')[1:]

    field_names = ("a", "p", "t", "u")
    # zip() stops at the shortest column, so incomplete rows are dropped.
    for row in zip(views, comments, titles, updated):
        logger.info(dict(zip(field_names, row)))


async def main():
    """Prompt for a page count, then parse list pages 1..count concurrently."""
    page_count = int(input("页数"))

    url_list = []
    for page_no in range(1, page_count + 1):
        url_list.append(f'http://guba.eastmoney.com/o/list,meigu_{page_no}.html')

    # One shared session serves every page request.
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(parse(page_url, session) for page_url in url_list))


# Script entry point: run the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())

这个代码使用了asyncio和aiohttp库来进行异步HTTP请求和解析页面内容。fetch函数负责发送异步HTTP请求，而parse函数则负责解析页面内容并打印信息。main函数是程序的入口点，它创建一个异步会话并发起多个解析任务。最后，通过调用asyncio.run(main())来运行异步程序。

posted @ 2023-04-29 19:26 __username 阅读(95) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

飞翔的企鹅

日日行，不怕千万里；常常做，不怕千万事

异步爬虫例子之asyncio

异步爬虫例子：

例子1

例子2

公告