asyncio + httpx async request boilerplate
An asyncio + httpx async request template, originally written by AI.
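The script below deduplicates the task list against a previously collected JSONL file, fans the remaining tasks out through a semaphore-bounded httpx.AsyncClient with per-task retries and exponential backoff, and appends each successful JSON response to the output file, one record per line.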
import asyncio
import httpx
import json
import aiofiles
from pathlib import Path
project_dir = Path(__file__).resolve().parent
# Route requests through a proxy
proxy = "http://username:password@ip:port"
# Maximum number of concurrent requests
max_concurrency = 5
input_file_name = '任务名单'    # task list: one task ID per line
exit_file_name = '已采集名单'   # already-collected results (JSONL), used for dedup
output_file_name = '已采集名单' # new results are appended to the same JSONL file
seen = set()
# Skip tasks that have already been collected
exist_path = project_dir.joinpath(exit_file_name)
if exist_path.exists():
    with open(exist_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                doc = json.loads(line.strip())
                seen.add(doc.get("_id"))
            except json.JSONDecodeError:
                continue
tasks = []
with open(project_dir.joinpath(input_file_name), 'r', encoding='utf-8') as f:
    for line in f:
        task = line.strip()
        if not task or task in seen:
            continue
        tasks.append(task)
async def fetch_url(session, task, headers, cookies, retries=3, backoff_factor=0.5):
    """
    Fetch the URL for a task asynchronously, retrying up to `retries` times.
    """
    # Build the URL from the task ID
    url = f"https://{task}.com"
    for attempt in range(retries):
        try:
            response = await session.get(url, headers=headers, cookies=cookies)
            if response.status_code == 200:
                doc = response.json()
                # One kind of error response the API can return
                if doc.get('code') == 1000000:
                    print(f"[error] {task}")
                    return {}
                # Normal response
                return doc
            else:
                print(f"Attempt {attempt + 1}: Unexpected status code {response.status_code} for {url}")
        except httpx.RequestError as exc:
            print(f"Attempt {attempt + 1}: Request failed for {task}: {exc}")
        # If this was not the last attempt, wait before retrying
        if attempt < retries - 1:
            await asyncio.sleep(backoff_factor * (2 ** attempt))  # exponential backoff
    # All retries failed: return an empty dict
    print(f"Failed to fetch {url} after {retries} attempts.")
    return {}
async def write_json_to_file(data):
    """
    Append one JSON record to the output file asynchronously.
    """
    async with aiofiles.open(project_dir.joinpath(output_file_name), 'a', encoding='utf-8') as file:
        await file.write(json.dumps(data, ensure_ascii=False) + '\n')
async def main(urls, max_concurrency=5, proxy=None):
    """
    Run the requests for all tasks concurrently.
    """
    headers = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "pragma": "no-cache",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }
    cookies = {}
    # httpx >= 0.26 accepts proxy=; older versions used the now-removed proxies=
    async with httpx.AsyncClient(proxy=proxy) as client:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def fetch_with_semaphore(url):
            async with semaphore:
                result = await fetch_url(client, url, headers, cookies)
                # A result worth saving: put your parsing logic here
                if result and result.get("message", "") == "Success" and result.get("data", {}):
                    await write_json_to_file(result)
                elif result:
                    print(result)
                return result

        tasks = [fetch_with_semaphore(url) for url in urls]
        await asyncio.gather(*tasks)
if __name__ == '__main__':
    asyncio.run(main(tasks, max_concurrency, proxy=proxy))
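Dependencies: pip install httpx aiofiles.

File layout, as the code above assumes it: 任务名单 holds one task ID per line; 已采集名单 serves both as the dedup source and as the output sink, one JSON object per line carrying an _id field. A minimal sketch of the two formats, with made-up IDs and a response shape inferred from the filter in fetch_with_semaphore (not real API output):

# 任务名单 (one task ID per line)
aaa111
bbb222

# 已采集名单 / output (JSON Lines; "message"/"data" keys assumed from the parser)
{"_id": "aaa111", "message": "Success", "data": {"k": "v"}}

On the first run 已采集名单 may not exist; the dedup step is then skipped, and the file is created once the first result is appended.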