异步爬取爱卡汽车论坛信息
一、获取论坛对应的汽车fid
import asyncio
import json
import re
import time
from urllib.parse import urljoin

import aiohttp
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent to www.xcar.com.cn.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}

# Step 1: download the forum index page and build a {fid: forum name} map.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page and
# silently writing an empty car.json.
res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers, timeout=30)
res.raise_for_status()

# apparent_encoding is only a guess (and may be None); fall back to UTF-8 and
# replace undecodable bytes instead of aborting on a single bad character.
html = res.content.decode(res.apparent_encoding or "utf-8", errors="replace")
soup = BeautifulSoup(html, 'lxml')
span_list = soup.find_all("span", id="w959")

car_ids = {}
for span in span_list:
    link = span.find("a")
    if link is None:
        # Span without an anchor: nothing to extract, skip it.
        continue
    # Everything after "fid=" in the link target is taken as the forum id.
    fid_match = re.search("fid=(.*)", link.attrs.get('href') or "")
    if fid_match is None:
        # Link that does not point at a forum (no fid in its href).
        continue
    car_ids[fid_match.group(1)] = link.text

# Save the mapping as a local JSON file so later steps can reuse it.
car_ids_str = json.dumps(car_ids, ensure_ascii=False)
with open("car.json", "w", encoding="utf-8") as f:
    f.write(car_ids_str)
二、asyncio爬取论坛信息
# Browser-like request headers sent with every forum API call.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}

# Load the {fid: forum name} mapping from car.json.
with open("car.json", "rb") as f:
    car_ids = json.loads(f.read())


async def more_details(fid, page, session=None):
    """Fetch one page of a forum's thread list and return the decoded JSON.

    Args:
        fid: Forum id, sent as the ``fid`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        session: Optional ``aiohttp.ClientSession`` to reuse. When omitted,
            a temporary session is created and closed for this one request,
            which is what the function always did before this parameter
            existed.

    Returns:
        The response body parsed as JSON.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await more_details(fid, page, session=own_session)

    url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
    params = {
        "fid": fid,
        "orderby": "lastpost",
        "filter": "",
        "ondigest": "0",
        "page": page,
        # Current time in milliseconds, sent as the "_" query parameter.
        "_": str(time.time() * 1000),
    }
    async with session.get(url=url, params=params, headers=headers) as resp:
        # The response is parsed with content_type='text/html', so tell
        # aiohttp to accept that content type when decoding the body as JSON.
        return await resp.json(content_type='text/html', encoding='utf-8')


async def _fetch_pages(fid, pages):
    """Fetch all *pages* of forum *fid* concurrently over one shared session.

    Results are returned as a list in the same order as *pages*.
    """
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(more_details(fid, page, session=session) for page in pages)
        )


def run(car_name):
    """Print the thread-list JSON of pages 1-49 for the forum matching *car_name*.

    Args:
        car_name: Text that must occur in a forum name stored in ``car_ids``.

    Raises:
        ValueError: If no forum name contains *car_name*.
    """
    matching_fids = [fid for fid, name in car_ids.items() if car_name in name]
    if not matching_fids:
        raise ValueError(
            "no forum name in car.json contains {!r}".format(car_name)
        )
    # When several forum names match, the last one wins (unchanged behaviour).
    car_id = matching_fids[-1]

    # asyncio.run creates and tears down its own event loop, so run() can be
    # called more than once; gather keeps the results in page order.
    for res in asyncio.run(_fetch_pages(car_id, range(1, 50))):
        print(res)


run("奥迪A4L")