爬取汽车之家车型配置信息

一、需求

获取指定品牌的所有车型配置信息,并保存到excel中。

流程大致思路:

1.获取品牌id:brand_id

2.通过品牌id获取车型id:series_id

3.获取车型配置页面

4.解析配置页面内容(这步最复杂,使用了之前一些大神的代码)

二、代码

测试完美运行

 

 

 

import json
import os
import re
from urllib import parse

import requests
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver


class Car_home_config(object):
    """Export vehicle configuration tables from car.autohome.com.cn to .xls files.

    Workflow (driven by ``check``): list all brands, list the series of the
    requested brand(s), download each series' configuration page, resolve the
    obfuscated ``<span class='...'></span>`` placeholders by running the page's
    scripts in PhantomJS, and write one workbook per series.

    NOTE(review): ``webdriver.PhantomJS`` and ``find_element_by_tag_name`` are
    presumably only available in Selenium releases older than 4.x - confirm the
    pinned Selenium version before upgrading.
    """

    LEFT_MENU_URL = r"https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"
    CONFIG_URL = r"https://car.autohome.com.cn/config/series/{}.html"
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # First run of digits in an href; used as the brand / series id.
    _ID_PATTERN = re.compile(r"[0-9]\d*")
    # Inline scripts of the configuration page that register the CSS rules.
    _SCRIPT_PATTERN = re.compile(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)')

    # Headers shared by every request.
    # NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
    # pseudo-headers copied from browser dev tools; they are sent here as
    # ordinary headers exactly as before - confirm whether the site needs them.
    _BASE_HEADERS = {
        "authority": "car.autohome.com.cn",
        "method": "GET",
        "scheme": "https",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "cache-control": "no-cache",
        "sec-ch-ua": "Google Chrome;v=87,Not;A Brand;v=99,Chromium;v=87",
        "sec-ch-ua-mobile": "?0",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
    }

    # Minimal fake DOM: the page scripts call document.createElement(...) and
    # sheet.insertRule(...); every inserted rule is appended to `rules`,
    # separated by '#', so it can be printed with document.write afterwards.
    _DOM_STUB = ("var rules = '2';"
                 "var document = {};"
                 "function getRules(){return rules}"
                 "document.createElement = function() {"
                 "      return {"
                 "              sheet: {"
                 "                      insertRule: function(rule, i) {"
                 "                              if (rules.length == 0) {"
                 "                                      rules = rule;"
                 "                              } else {"
                 "                                      rules = rules + '#' + rule;"
                 "                              }"
                 "                      }"
                 "              }"
                 "      }"
                 "};"
                 "document.querySelectorAll = function() {"
                 "      return {};"           "};"
                 "document.head = {};"
                 "document.head.appendChild = function() {};"

                 "var window = {};"
                 "window.decodeURIComponent = decodeURIComponent;")

    def __init__(self):
        self.session = requests.Session()
        self.params = None
        # Built by get_header(); initialised here so get_AsLeftMenu() never
        # hits a missing attribute when it is called first.
        self.headers = {}
        self.brand_dict = {}   # brand id -> brand link text
        self.series_dict = {}  # series id -> series link text
        self.brand_name = None

    def get_header(self):
        """Rebuild ``self.headers`` for a left-menu request using ``self.params``."""
        # urlencode(None) raises TypeError, so treat "no params yet" as empty.
        query = parse.urlencode(self.params or {})
        headers = dict(self._BASE_HEADERS)
        headers.update({
            "path": "/AsLeftMenu/As_LeftListNew.ashx?%s" % query,
            "accept": "*/*",
            "pragma": "no-cache",
            "sec-fetch-site": "none",
        })
        self.headers = headers

    @staticmethod
    def _menu_params(brand_id):
        """Return the left-menu query parameters for *brand_id* ("0" = all brands)."""
        return {
            "typeId": "1",
            "brandId": brand_id,
            "fctId": "0",
            "seriesId": "0"
        }

    @classmethod
    def _extract_id(cls, href):
        """Return the first run of digits in *href*, or None if there is none."""
        if not href:
            return None
        found = cls._ID_PATTERN.search(href)
        return found.group(0) if found else None

    @classmethod
    def _record_link(cls, anchor, target):
        """Store ``id -> link text`` for *anchor* in *target*.

        Anchors that are missing, have no href, or whose href contains no
        digits are skipped instead of raising.
        """
        if anchor is None:
            return
        link_id = cls._extract_id(anchor.attrs.get('href'))
        if link_id is not None:
            target[link_id] = anchor.text

    def _fetch_left_menu(self):
        """Download the left menu for ``self.params`` and return it parsed."""
        res = self.session.get(url=self.LEFT_MENU_URL, headers=self.headers,
                               params=self.params, timeout=self.REQUEST_TIMEOUT)
        res.encoding = res.apparent_encoding
        return BeautifulSoup(res.text, 'lxml')

    def get_brand_id(self):
        """Fetch all brands and return ``self.brand_dict`` (brand id -> name)."""
        self.params = self._menu_params("0")
        self.get_header()
        soup = self._fetch_left_menu()
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                self._record_link(li.find("a"), self.brand_dict)
        return self.brand_dict

    def get_AsLeftMenu(self):
        """Fetch the menu for ``self.params`` and add its series to ``self.series_dict``."""
        soup = self._fetch_left_menu()
        for dd in soup.find_all("dd"):
            for a in dd.find_all("a"):
                self._record_link(a, self.series_dict)

    def _load_series(self, brand_id):
        """Collect the series of one brand into ``self.series_dict``."""
        self.params = self._menu_params(brand_id)
        self.get_header()
        self.get_AsLeftMenu()

    def get_series_id(self):
        """Return the series (id -> name) of ``self.brand_name``, or of all brands.

        When ``self.brand_name`` is set, only the first brand whose name
        contains it is queried. The result is rebuilt on every call so that
        repeated calls do not mix series of different brands, and a dict is
        always returned, even when no brand matches.
        """
        self.get_brand_id()
        self.series_dict = {}
        if self.brand_name:
            for brand_id, name in self.brand_dict.items():
                if self.brand_name in name:
                    self._load_series(brand_id)
                    break
        else:
            for brand_id in self.brand_dict:
                self._load_series(brand_id)
        return self.series_dict

    def get_config_content(self, series_id):
        """Download the configuration page of *series_id* and return it as text.

        TLS certificate verification is left enabled, matching the left-menu
        requests that go to the same host.
        """
        headers = dict(self._BASE_HEADERS)
        headers.update({
            "path": "/config/series/{}.html".format(series_id),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "referer": "https://www.autohome.com.cn/",
            "sec-fetch-site": "same-site",
        })
        res = self.session.get(self.CONFIG_URL.format(series_id), headers=headers,
                               timeout=self.REQUEST_TIMEOUT)
        return res.content.decode("utf-8")

    def car_info(self, html):
        """Return the ``var config``/``var option``/``var bag`` statements of *html*.

        The three statements are concatenated in that order. An empty string
        is returned unless all three are present.
        """
        statements = []
        for name in ("config", "option", "bag"):
            found = re.search("var %s = (.*?)};" % name, html)
            if not found:
                return ""
            statements.append(found.group(0))
        return "".join(statements)

    @staticmethod
    def _replace_spans(car_info, rules_text):
        """Replace obfuscated ``<span class='X'></span>`` placeholders with text.

        For every placeholder the rule ``X::before { content:"..." }`` is looked
        up in *rules_text* and the quoted content substituted. Placeholders
        without a usable rule are left unchanged.
        """
        for attrs in re.findall("<span(.*?)></span>", car_info):
            quoted = re.search("'(.*?)'", attrs)
            if not quoted:
                continue
            class_name = quoted.group(1)
            # The class name is escaped so it cannot act as regex syntax.
            rule = re.search(re.escape(class_name) + r"::before \{ content:(.*?)\}", rules_text)
            if not rule:
                continue
            literal = re.search("\"(.*?)\"", rule.group(1))
            if not literal:
                continue
            car_info = car_info.replace("<span class='" + class_name + "'></span>",
                                        literal.group(1))
        return car_info

    def write_html(self, js_list, car_info):
        """Run the page scripts in PhantomJS and de-obfuscate *car_info*.

        The scripts in *js_list* are embedded behind the fake DOM in
        ``original.html`` (overwritten on every call), the page is rendered
        and the printed CSS rules are used to fill the placeholders.

        Returns the de-obfuscated string, or None when the browser could not
        be run or produced no text.
        """
        html_type = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body>    <script type='text/javascript'>"
        page = html_type + self._DOM_STUB + "".join(js_list) + " document.write(rules)</script></body></html>"
        with open("original.html", "w", encoding="utf-8") as f:
            f.write(page)

        driver = None
        text = None
        try:
            driver = webdriver.PhantomJS(
                executable_path=r"phantomjs.exe")
            driver.get("original.html")
            text = driver.find_element_by_tag_name('body').text
        except Exception as e:
            # Browser problems are reported and make this series be skipped.
            print(e)
        finally:
            # Only shut down a driver that actually started; quit() also ends
            # the PhantomJS process instead of just closing the window.
            if driver is not None:
                driver.quit()
        if not text:
            return None
        return self._replace_spans(car_info, text)

    @staticmethod
    def _load_js_object(source, var_name):
        """Decode the JSON value assigned by ``var <var_name> = ...`` in *source*.

        The value is parsed as one complete JSON document, so a ``;`` inside a
        string value does not cut it short.

        Raises:
            ValueError: if the assignment is missing or its value is not JSON.
        """
        marker = "var {} = ".format(var_name)
        start = source.find(marker)
        if start < 0:
            raise ValueError("'{}' not found in car_info".format(marker))
        start += len(marker)
        # raw_decode does not skip leading whitespace itself.
        while start < len(source) and source[start].isspace():
            start += 1
        value, _ = json.JSONDecoder().raw_decode(source, start)
        return value

    @classmethod
    def _parse_items(cls, car_info):
        """Return ``{item name: [value per model, ...]}`` for config and option items."""
        config = cls._load_js_object(car_info, "config")
        option = cls._load_js_object(car_info, "option")
        items = []
        for group in config['result']['paramtypeitems']:
            items += group['paramitems']
        for group in option['result']['configtypeitems']:
            items += group['configitems']
        car_item = {}
        for item in items:
            car_item[item['name']] = [entry['value'] for entry in item['valueitems']]
        return car_item

    def save(self, car_info, car_name, save_path):
        """Write the configuration in *car_info* to ``<save_path>/<car_name>.xls``.

        Row 0 holds the item names; each following row holds one model.

        Raises:
            ValueError: if *car_info* lacks a parsable config or option section.
        """
        car_item = self._parse_items(car_info)

        # NOTE(review): xlwt's ``encoding`` presumably only matters for byte
        # strings; every cell written below is a ``str`` - confirm.
        workbook = xlwt.Workbook(encoding='ascii')
        worksheet = workbook.add_sheet('汽车之家')
        for col, title in enumerate(car_item):
            worksheet.write(0, col, title)

        # One row per model: normally the length of the model-name column,
        # otherwise the longest column available.
        if '车型名称' in car_item:
            model_count = len(car_item['车型名称'])
        else:
            model_count = max((len(values) for values in car_item.values()), default=0)
        for row in range(model_count):
            for col, values in enumerate(car_item.values()):
                # Columns shorter than the model list are padded with blanks.
                cell = str(values[row]) if row < len(values) else ""
                worksheet.write(row + 1, col, cell)

        # A path separator inside the series name would point into a
        # (usually missing) sub-directory.
        file_name = re.sub(r"[\\/]", "_", car_name) + ".xls"
        if save_path:
            os.makedirs(save_path, exist_ok=True)
        workbook.save(os.path.join(save_path, file_name))

    def check(self, brand_name=None, save_path="./"):
        """Export the configuration of every series of *brand_name*.

        Args:
            brand_name: text contained in the wanted brand's name; a falsy
                value exports all brands.
            save_path: directory that receives one ``.xls`` file per series.
        """
        self.brand_name = brand_name
        self.get_series_id()
        for series_id, car_name in self.series_dict.items():
            print(series_id, car_name)
            html = self.get_config_content(series_id)
            car_info = self.car_info(html)
            if not car_info:
                # No configuration data on the page: nothing to render or save.
                continue
            js_list = self._SCRIPT_PATTERN.findall(html)
            car_info = self.write_html(js_list, car_info)
            if car_info:
                self.save(car_info, car_name, save_path)


if __name__ == "__main__":
    # Guarded so that importing this module does not start scraping.
    # Exports every series of the brand whose name contains "奥迪" (Audi)
    # into the current directory.
    car = Car_home_config()
    car.check("奥迪")

phantomjs.exe下载地址:https://phantomjs.org/download.html

感谢以下作者:
https://www.cnblogs.com/kangz/p/10011348.html
https://www.cnblogs.com/pontoon/p/10459471.html

posted @ 2021-01-21 10:37  Maple_feng  阅读(1816)  评论(2)  编辑  收藏  举报