异步爬取爱卡汽车论坛信息

一、获取论坛对应的汽车fid

import asyncio
import time
import aiohttp
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import re

# Step 1: scrape the forum index page and build a {fid: forum name} mapping,
# then persist it to car.json so later steps do not need to re-fetch it.

# Browser-like headers sent with the request to www.xcar.com.cn.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
}

# A timeout is required: without one, requests may wait on the server forever.
res = requests.get(r"https://www.xcar.com.cn/bbs/", headers=headers, timeout=10)
# Decode with the detected charset; fall back to UTF-8 if detection yields nothing.
html = res.content.decode(res.apparent_encoding or "utf-8")
soup = BeautifulSoup(html, 'lxml')
span_list = soup.find_all("span", id="w959")

car_ids = {}
for span in span_list:
    link = span.find("a")
    if link is None:
        # A matching <span> without an anchor carries no forum link; skip it.
        continue
    match = re.search("fid=(.*)", link.attrs.get('href') or "")
    if match is None:
        # The href has no "fid=" component, so there is no id to record.
        continue
    fid = match.group(1)
    name = link.text
    car_ids[fid] = name

# Save the mapping as a local JSON file for later use.
car_ids_str = json.dumps(car_ids, ensure_ascii=False)
with open("car.json", "w", encoding="utf-8") as f:
    f.write(car_ids_str)

二、asyncio爬取论坛信息

# Browser-like headers attached to every request made to www.xcar.com.cn.
headers = {
    "Host": "www.xcar.com.cn",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
}

# Load the {fid: forum name} mapping from car.json. The file is opened in
# binary mode and the JSON parser works out the text encoding from the bytes.
with open("car.json", "rb") as f:
    car_ids = json.load(f)

# Asynchronous crawl helpers.
async def more_details(fid, page, session=None):
    """Fetch one page of a forum's thread list and return the parsed JSON.

    Args:
        fid: Forum id, sent as the ``fid`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        session: Optional ``aiohttp.ClientSession`` to reuse. When omitted, a
            temporary session is opened and closed for this single request.

    Returns:
        The response body decoded as JSON.
    """
    # Millisecond timestamp sent as the "_" query parameter.
    _time = str(time.time() * 1000)
    url = r"https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php"
    params = {
        "fid": fid,
        "orderby": "lastpost",
        "filter": "",
        "ondigest": "0",
        "page": page,
        "_": _time
    }
    if session is not None:
        return await _get_json(session, url, params)
    async with aiohttp.ClientSession() as own_session:
        return await _get_json(own_session, url, params)


async def _get_json(session, url, params):
    """Issue a GET on *session* and decode the response body as JSON."""
    async with session.get(url=url, params=params, headers=headers) as resp:
        # The expected content type is overridden to 'text/html' -- presumably
        # the endpoint serves its JSON with that header; confirm against the API.
        return await resp.json(content_type='text/html', encoding='utf-8')


async def _fetch_pages(fid, pages):
    """Fetch all *pages* of forum *fid* concurrently over one shared session.

    Results are returned in the same order as *pages*.
    """
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(more_details(fid, page, session) for page in pages)
        )


def run(car_name):
    """Print the thread-list JSON of pages 1-49 for the forum matching *car_name*.

    Args:
        car_name: Text to look for inside the forum names stored in ``car_ids``.
            When several names contain it, the last one in ``car_ids`` is used.

    Raises:
        ValueError: If no forum name contains *car_name*.
    """
    car_id = None
    for k, v in car_ids.items():
        if car_name in v:
            car_id = k
    if car_id is None:
        raise ValueError("no forum name contains %r" % (car_name,))

    # Use a fresh event loop per call so run() can be invoked more than once;
    # reusing and closing the global loop breaks every later call.
    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(_fetch_pages(car_id, range(1, 50)))
    finally:
        loop.close()

    for res in results:
        print(res)


run("奥迪A4L")

 

posted @ 2020-12-31 14:34  Maple_feng  阅读(205)  评论(0)  编辑  收藏  举报