asyncio + httpx async request boilerplate

An AI-written asyncio + httpx async request boilerplate.
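Besides the standard library, the script relies on two third-party packages, httpx and aiofiles, which can be installed with pip install httpx aiofiles.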

import asyncio
import httpx
import json
import aiofiles
from pathlib import Path
project_dir = Path(__file__).resolve().parent

# Proxy to route requests through
proxy = "http://username:password@ip:port"
# Maximum number of concurrent requests
max_concurrency = 5
input_file_name = '任务名单'    # task list: one task per line
exit_file_name = '已采集名单'   # records already collected (one JSON object per line)
output_file_name = '已采集名单'  # output file; same as the collected list, so finished tasks are skipped on re-run


already_collected = set()
# Skip tasks that were already collected on a previous run
collected_path = project_dir.joinpath(exit_file_name)
if collected_path.exists():
    with open(collected_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                doc = json.loads(line.strip())
                already_collected.add(doc.get("_id"))
            except json.JSONDecodeError:
                pass

tasks = []
with open(project_dir.joinpath(input_file_name), 'r', encoding='utf-8') as f:
    for line in f:
        task = line.strip()
        if not task or task in already_collected:
            continue
        tasks.append(task)


async def fetch_url(session, task, headers, cookies, retries=3, backoff_factor=0.5):
    """
    异步获取指定 URL 的内容,最多重试指定次数。
    """

    # Build the request URL from the task string (placeholder URL pattern)
    url = f"https://{task}.com"

    for attempt in range(retries):
        try:
            response = await session.get(url, headers=headers, cookies=cookies)
            if response.status_code == 200:
                doc = response.json()
                # One kind of application-level error response: log it and give up on this task
                if doc.get('code') == 1000000:
                    print(f"[error] {task}")
                    return {}

                # Normal response: return the parsed document
                else:
                    return doc
            else:
                print(f"Attempt {attempt + 1}: Unexpected status code {response.status_code} for {url}")
        except httpx.RequestError as exc:
            print(f"Attempt {attempt + 1}: Request failed for {task}: {exc}")

        # If this was not the last attempt, wait a bit before retrying
        if attempt < retries - 1:
            await asyncio.sleep(backoff_factor * (2 ** attempt))  # exponential backoff

    # All retries failed; return an empty dict
    print(f"Failed to fetch {url} after {retries} attempts.")
    return {}

async def write_json_to_file(data):
    """
    异步写入 JSON 数据到文件。
    """
    async with aiofiles.open(project_dir.joinpath(output_file_name), 'a+', encoding='utf-8') as file:
        await file.write(json.dumps(data, ensure_ascii=False) + '\n')

async def main(urls, max_concurrency=5, proxy=None):
    """
    异步执行多个 URL 的请求。
    """
    headers = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "pragma": "no-cache",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }
    cookies = {}
    # Note: newer httpx releases (>= 0.26) prefer `proxy=` over the deprecated `proxies=` argument
    async with httpx.AsyncClient(proxies=proxy) as client:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def fetch_with_semaphore(url):
            async with semaphore:
                result = await fetch_url(client, url, headers, cookies)
                # A result worth saving; plug your own parser in here (a sketch follows the script)
                if result and result.get("message", "") == "Success" and result.get("data", {}):
                    ret = result
                    await write_json_to_file(ret)
                elif result:
                    print(result)
                return result

        tasks = [fetch_with_semaphore(url) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(main(tasks, max_concurrency, proxy=proxy))
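
The parser on the save path is left as a placeholder above. A minimal sketch of what it could look like, assuming the useful fields sit under result["data"] (the field names below are assumptions, not from the original API); keying the saved record by the task string also keeps the dedup check at the top of the script working across runs:

# Hypothetical parser sketch -- the fields under "data" are assumptions; adjust to the real schema.
def parse_result(task, result):
    data = result.get("data", {})
    return {
        "_id": task,                   # match the task line so the dedup set recognises it next run
        "name": data.get("name"),      # assumed field
        "status": data.get("status"),  # assumed field
    }

# Inside fetch_with_semaphore, the save path would then become:
#     ret = parse_result(url, result)
#     await write_json_to_file(ret)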
posted @ 2024-08-27 14:19  anyiya