Scraping used-car listings from the Guazi used-car site with Python Scrapy (Part 2)

Continuing from the previous post: https://www.cnblogs.com/achangblog/p/13956140.html
Step 5: Edit the middleware file middlewares.py and enable it
After cracking the JS code in the previous step and getting a valid response, the cracking routine can be copied into middlewares.py with only minor changes:

import re
import execjs
import datetime

class GuaziDownloaderMiddleware:

    def __init__(self):
        # Load the cracked anti-scraping JS once; it is reused for every blocked response.
        self.f = open('guazi.js', 'r', encoding='utf-8')
        self.content = self.f.read()

    def process_response(self, request, response, spider):
        html = response.text
        if '正在打开中,请稍后' in html:
            # The anti-scraping page embeds value=anti('<string>','<key>'); pull out both
            # arguments and run them through the cracked JS to compute the antipas value.
            pattern = re.compile(r'value=anti\(\'(.*?)\',\'(.*?)\'\)')
            string_ = pattern.search(html).group(1)
            key = pattern.search(html).group(2)
            js_compile = execjs.compile(self.content)
            value = js_compile.call('anti', string_, key)
            expire_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=2592000)
            expires = "; expires=" + expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
            # Attach the cookie and return the request so that Scrapy retries it.
            cookie = {'antipas': value, 'expires': expires, 'path': '/'}
            request.cookies = cookie
            return request
        elif response.status == 200:
            return response
        # elif '客官请求太频繁啦,请1分钟后重试' in html:
        #     print('!!! response.status是:{}'.format(response.status))
        #     print('request.url: {}'.format(request.url))
        #     print('请求频繁')
        #     return request

    def close_spider(self):
        self.f.close()

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Then enable this middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
   'guazi.middlewares.GuaziDownloaderMiddleware': 543,
}

Step 6: Analyze the page structure and write a first version of the spider file guazi_spider.py
①. Looking at the structure of the Guazi site, we can start crawling from https://www.guazi.com/www/buy and iterate over every city's used-car listings by collecting the per-city URLs; the city URL data is hidden inside a JS function and can be extracted with a regular expression.
②. Each city's listing page shows 40 cars per page with at most 50 pages, i.e. at most 2000 listings, which cannot cover all cars in a city, so we need to narrow the URL down by brand. Taking the city Anji as an example, clicking "大众" (Volkswagen) in the brand bar gives the URL https://www.guazi.com/anji/dazhong/ and the page reports "为您找到7802辆好车" (7802 cars found), still too many to display in full, so we subdivide further by series: clicking "速腾" (Sagitar) in the series bar gives https://www.guazi.com/anji/suteng/ with "为您找到890辆好车" (890 cars found), which is below 2000 and can be shown completely. We iterate over every brand this way, and whenever a brand has more than 2000 cars we drill down into its series; at that point the URL subdivision is done.
③. Next comes pagination: the page structure shows that every page except the last has a "next page" button, so following it in a loop yields every listing-page URL.
④. With each listing-page URL in hand, we can extract the detail-page URL of every car on that page and scrape the car's data from its detail page.
Based on this analysis, the first version of guazi_spider.py looks like this:

# -*- coding: utf-8 -*-
import scrapy
import re
import json
from guazi.items import GuaziItem
import pytesseract
import requests
from PIL import Image
from ..settings import DEFAULT_REQUEST_HEADERS


class GuaziSpiderSpider(scrapy.Spider):
    name = 'guazi_spider'
    allowed_domains = ['www.guazi.com']
    start_urls = ['https://www.guazi.com/www/buy']

    def parse(self, response):
        pattern = re.compile(r'cityLeft = (.*?);')
        city_left_dict = json.loads(pattern.search(response.text).group(1))
        pattern = re.compile(r'cityRight = (.*?);')
        city_right_dict = json.loads(pattern.search(response.text).group(1))
        city_domain_and_name_list = []
        for key, value in {**city_left_dict, **city_right_dict}.items():
            for city_detail in value:
                one_couple = {}
                if 'domain' in city_detail:
                    one_couple['domain'] = city_detail['domain']
                if 'name' in city_detail:
                    one_couple['name'] = city_detail['name']
                city_domain_and_name_list.append(one_couple)
        for city_domain_and_name in city_domain_and_name_list:
            city_name = city_domain_and_name['name']
            city_url = 'https://www.guazi.com/' + city_domain_and_name['domain'] + '/buy'
            yield scrapy.Request(url=city_url, callback=self.parse_city_detail, meta={'city': city_name},
                                 dont_filter=True)

    # For each city, collect the brand URLs
    def parse_city_detail(self, response):
        brand_list = response.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]//ul//li/p//a')
        for brand in brand_list:
            brand_url = response.urljoin(brand.xpath('./@href').get())
            yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta, dont_filter=True)

    # Handle a brand URL and decide whether to subdivide it by series
    def parse_brand_detail(self, response):
        car_count_pattern = re.compile(r'为您找到(.*)辆好车')
        car_count = int(car_count_pattern.search(response.xpath('//p[@class="result-p3"]/text()').get()).group(1))
        if 0 < car_count <= 2000:
            yield scrapy.Request(url=response.url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)
        # elif car_count > 2000:
        #     brand_list = response.xpath('//div[@class="screen"]/dl[position()=2]//ul//li/p//a')
        #     for brand in brand_list:
        #         brand_url = response.urljoin(brand.xpath('./@href').get())
        #         print('brand_url: ' + brand_url)
        #         yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta,
        #                              dont_filter=True)
        # else:
        #     pass

    # Handle the listing-page data and pagination
    def parse_car_list(self, response):
        car_list = response.xpath('//ul[@class="carlist clearfix js-top"]//li/a')
        for car in car_list:
            car_url = response.urljoin(car.xpath('./@href').get())
            yield scrapy.Request(car_url, callback=self.parse_car_detail, meta=response.meta, dont_filter=True)
        # next_page_a_label = response.xpath('//ul[@class="pageLink clearfix"]//li[last()]/a')
        # next_page_button = next_page_a_label.xpath('./span/text()').get()
        # if next_page_button == '下一页':
        #     next_url = response.urljoin(next_page_a_label.xpath('./@href').get())
        #     yield scrapy.Request(next_url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)

    # Parse the used-car detail page
    def parse_car_detail(self, response):
        item = GuaziItem()
        item['city'] = response.meta['city']
        item['title'] = ''.join(response.xpath('//h1[@class="titlebox"]/text()').getall()).strip()
        source_id_pattern = re.compile(r'车源号:(.*)')
        source_id_str = response.xpath('//div[@class="right-carnumber"]/text()').get()
        item['source_id'] = source_id_pattern.search(source_id_str).group(1).strip()
        license_time_img_url = response.xpath('//ul[@class="assort clearfix"]/li[@class="one"]/span/img/@src').get()
        # item['license_time'] = self.image_to_str(license_time_img_url)[0:7]
        item['license_time'] = license_time_img_url
        item['mileage'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="two"]/span/text()').get()
        item['displacement'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="three"]/span/text()').get()
        item['gearbox_type'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="last"]/span/text()').get()
        item['full_price'] = response.xpath('//span[@class="price-num"]/text()').get()
        yield item
    
    # OCR the image into text
    # def image_to_str(self, img_url):
    #     response = requests.get(url=img_url, headers=DEFAULT_REQUEST_HEADERS)
    #     with open('image.jpg', 'wb') as f:
    #         f.write(response.content)
    #     image = Image.open('image.jpg')
    #     return pytesseract.image_to_string(image)

PS: the "license registration time" field is rendered as a small image rather than as text; the pytesseract module can be used to OCR the text on the image and recover the actual time, or you can simply store the URL of the image.
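As a reference, here is a minimal standalone sketch of that OCR approach, assuming pytesseract, Pillow and requests are installed and the Tesseract binary is available; the image URL and headers passed in are placeholders for the values scraped from the page:

# Minimal OCR sketch (assumes pytesseract, Pillow and requests are installed).
import io

import requests
import pytesseract
from PIL import Image


def ocr_license_time(img_url, headers=None):
    # Download the small "license time" image and run it through Tesseract in memory.
    resp = requests.get(img_url, headers=headers, timeout=10)
    image = Image.open(io.BytesIO(resp.content))
    return pytesseract.image_to_string(image).strip()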

Step 7: Optimize the spider file guazi_spider.py
Since the code above re-parses the city URL data on every run, and that data never changes, we can parse the city URLs once, store them in MongoDB, and have the spider read them from the database directly.

First, create guazi_mongo.py in the same directory as items.py:

import pymongo


class GuaziMongo:

    def __init__(self):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.guazi_db = self.client['guazi']
        # Collection holding the pre-parsed city URLs and names.
        self.guazi_uncol = self.guazi_db['guazi_url_and_name']
        # Collection holding the scraped car items.
        self.guazi_item_col = self.guazi_db['guazi_item']


guaziMongo = GuaziMongo()

Then create guazi_get_city_url.py, also in the same directory as items.py:

# Fetch the URL and name of every city on Guazi and store them in the database as dicts like {'name': '北京', 'url': 'https://www.guazi.com/bj/buy'}
import requests
import re
import execjs
import datetime
import json
from guazi.guazi_mongo import guaziMongo


def get_city_name_and_url(response):
    pattern = re.compile(r'cityLeft = (.*?);')
    city_left_dict = json.loads(pattern.search(response.text).group(1))
    pattern = re.compile(r'cityRight = (.*?);')
    city_right_dict = json.loads(pattern.search(response.text).group(1))
    city_domain_and_name_list = []
    for key, value in {**city_left_dict, **city_right_dict}.items():
        for city_detail in value:
            one_couple = {}
            if 'domain' in city_detail:
                one_couple['domain'] = city_detail['domain']
            if 'name' in city_detail:
                one_couple['name'] = city_detail['name']
            city_domain_and_name_list.append(one_couple)
    city_name_and_url_list = []
    for city_domain_and_name in city_domain_and_name_list:
        city_name = city_domain_and_name['name']
        city_url = 'https://www.guazi.com/' + city_domain_and_name['domain'] + '/buy'
        city_name_and_url_list.append({'name': city_name, 'url': city_url})
    return city_name_and_url_list


headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

url = 'https://www.guazi.com/www/buy'
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = response.text
if '正在打开中,请稍后' in html:
    pattern = re.compile(r'value=anti\(\'(.*?)\',\'(.*?)\'\)')
    string_ = pattern.search(html).group(1)
    key = pattern.search(html).group(2)
    print('string_: {}, key: {}'.format(string_, key))
    with open('guazi.js', 'r', encoding='utf-8') as f:
        js_compile = execjs.compile(f.read())
        value = js_compile.call('anti', string_, key)
    name = 'antipas'
    expire_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=2592000)
    expires = "; expires=" + expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
    cookie = name + "=" + value + expires + "; path=/"
    headers['Cookie'] = cookie
    response = requests.get(url, headers=headers)
    data_col = guaziMongo.guazi_uncol
    data = get_city_name_and_url(response)
    print(data)
    # data_col.delete_many({})
    data_col.insert_many(data)
    for data_ in data:
        print(data_)

Running guazi_get_city_url.py once inserts all the city URL data into the database.
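To double-check that the city URLs actually landed in MongoDB, a quick query sketch like the following can be used (assuming the same local MongoDB instance and collection names as in guazi_mongo.py, run from a location where the guazi package is importable):

# Quick sanity check of the stored city URLs.
from guazi.guazi_mongo import guaziMongo

count = guaziMongo.guazi_uncol.count_documents({})
print('stored city entries: {}'.format(count))
for doc in guaziMongo.guazi_uncol.find({}, {'_id': 0}).limit(5):
    print(doc)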

The spider file guazi_spider.py can then be modified as follows:

# -*- coding: utf-8 -*-
import scrapy
import re
import json
from guazi.items import GuaziItem
from guazi.guazi_mongo import guaziMongo
import pytesseract
import requests
from PIL import Image
from ..settings import DEFAULT_REQUEST_HEADERS


class GuaziSpiderSpider(scrapy.Spider):
    name = 'guazi_spider'
    allowed_domains = ['www.guazi.com']
    # start_urls = ['https://www.guazi.com/www/buy']

    def start_requests(self):
        # Pull city entries from MongoDB one at a time; find_one_and_delete consumes
        # the stored queue, so already-dispatched cities are not requested again.
        while True:
            city_url_and_name = guaziMongo.guazi_uncol.find_one_and_delete({})
            if city_url_and_name:
                city_url = city_url_and_name['url']
                city_name = city_url_and_name['name']
                yield scrapy.Request(url=city_url, callback=self.parse_city_detail, meta={'city': city_name},
                                     dont_filter=True)
            else:
                break

    # For each city, collect the brand URLs
    def parse_city_detail(self, response):
        brand_list = response.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]//ul//li/p//a')
        for brand in brand_list:
            brand_url = response.urljoin(brand.xpath('./@href').get())
            yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta, dont_filter=True)

    # Handle a brand URL and decide whether to subdivide it by series
    def parse_brand_detail(self, response):
        car_count_pattern = re.compile(r'为您找到(.*)辆好车')
        car_count = int(car_count_pattern.search(response.xpath('//p[@class="result-p3"]/text()').get()).group(1))
        if 0 < car_count <= 2000:
            yield scrapy.Request(url=response.url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)
        # elif car_count > 2000:
        #     brand_list = response.xpath('//div[@class="screen"]/dl[position()=2]//ul//li/p//a')
        #     for brand in brand_list:
        #         brand_url = response.urljoin(brand.xpath('./@href').get())
        #         print('brand_url: ' + brand_url)
        #         yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta,
        #                              dont_filter=True)
        # else:
        #     pass

    # Handle the listing-page data and pagination
    def parse_car_list(self, response):
        car_list = response.xpath('//ul[@class="carlist clearfix js-top"]//li/a')
        for car in car_list:
            car_url = response.urljoin(car.xpath('./@href').get())
            yield scrapy.Request(car_url, callback=self.parse_car_detail, meta=response.meta, dont_filter=True)
        # next_page_a_label = response.xpath('//ul[@class="pageLink clearfix"]//li[last()]/a')
        # next_page_button = next_page_a_label.xpath('./span/text()').get()
        # if next_page_button == '下一页':
        #     next_url = response.urljoin(next_page_a_label.xpath('./@href').get())
        #     yield scrapy.Request(next_url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)

    # Parse the used-car detail page
    def parse_car_detail(self, response):
        item = GuaziItem()
        item['city'] = response.meta['city']
        item['title'] = ''.join(response.xpath('//h1[@class="titlebox"]/text()').getall()).strip()
        source_id_pattern = re.compile(r'车源号:(.*)')
        source_id_str = response.xpath('//div[@class="right-carnumber"]/text()').get()
        item['source_id'] = source_id_pattern.search(source_id_str).group(1).strip()
        license_time_img_url = response.xpath('//ul[@class="assort clearfix"]/li[@class="one"]/span/img/@src').get()
        # item['license_time'] = self.image_to_str(license_time_img_url)[0:7]
        item['license_time'] = license_time_img_url
        item['mileage'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="two"]/span/text()').get()
        item['displacement'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="three"]/span/text()').get()
        item['gearbox_type'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="last"]/span/text()').get()
        item['full_price'] = response.xpath('//span[@class="price-num"]/text()').get()
        yield item

    # def image_to_str(self, img_url):
    #     response = requests.get(url=img_url, headers=DEFAULT_REQUEST_HEADERS)
    #     with open('image.jpg', 'wb') as f:
    #         f.write(response.content)
    #     image = Image.open('image.jpg')
    #     return pytesseract.image_to_string(image)

Step 8: Configure pipelines.py to write the data into MongoDB

from guazi.guazi_mongo import guaziMongo

class GuaziPipeline:
    def process_item(self, item, spider):
        item = dict(item)
        source_id = item['source_id']
        # Deduplicate on the source_id: upsert so that an existing record with this
        # source_id is updated and a new record is inserted otherwise.
        guaziMongo.guazi_item_col.update_one({'source_id': source_id}, {'$set': item}, upsert=True)
        return item

Enable the pipeline in settings.py:

ITEM_PIPELINES = {
   'guazi.pipelines.GuaziPipeline': 300,
}

Step 9: Set up an IP proxy
Because all requests come from a single IP, the site may return a 203 "requests too frequent" page before much real data has been scraped, so a (paid) IP proxy can be configured in middlewares.py.
There are many proxy providers to choose from; here "亿牛云" is used as an example. Reference code for a Scrapy project is available at https://www.16yun.cn/help/ss_demo/#1python — copy that code into middlewares.py and enable the middleware in settings.py, as shown after the sketch below.
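The provider's actual code is not reproduced here; as a rough, generic sketch built only on Scrapy's standard request.meta['proxy'] and Proxy-Authorization mechanism, such a middleware might look like the following (the host, port, username and password are placeholders that would come from the provider):

# Generic proxy middleware sketch; PROXY_HOST/PORT/USER/PASS are placeholders
# to be replaced with the values supplied by the proxy provider.
import base64

PROXY_HOST = 'proxy.example.com'
PROXY_PORT = '31111'
PROXY_USER = 'username'
PROXY_PASS = 'password'


class ProxyMiddleware:

    def process_request(self, request, spider):
        # Route every request through the authenticated HTTP proxy.
        request.meta['proxy'] = 'http://{}:{}'.format(PROXY_HOST, PROXY_PORT)
        auth = base64.b64encode('{}:{}'.format(PROXY_USER, PROXY_PASS).encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + auth

With both middlewares in place, settings.py enables them like this: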

DOWNLOADER_MIDDLEWARES = {
   'guazi.middlewares.GuaziDownloaderMiddleware': 543,
   'guazi.middlewares.ProxyMiddleware': 500
}

Once that is configured, settings.py still needs a few common settings:

ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 5
DOWNLOAD_DELAY = 0.2
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

Step 10: Create the main file and run the spider project
Create guazi_main.py in the same directory as items.py:

from scrapy import cmdline

cmdline.execute('scrapy crawl guazi_spider'.split(' '))

The final project directory structure is shown in the figure below:

Run guazi_main.py and check the console output and the database contents (since I have no paid IP proxy and wanted the test to run quickly, the result below was obtained with the paid proxy disabled and with the brand drill-down and pagination code commented out):


As you can see, the data was scraped successfully and stored in MongoDB.

That concludes the project.

Possible follow-up optimizations:
1. Guazi hosts a lot of used-car data, so a distributed crawler could be deployed to speed up scraping (no results yet: after several days of debugging, the distributed setup still failed, possibly because of Python version differences; I will retry when I get the chance).
2. Deduplication when writing into MongoDB currently relies on the upsert behaviour of the update statement. That is fine for small data sets, but with a large amount of data the lookups become expensive and seriously slow down the crawl. A possible fix is to integrate a Bloom filter into scrapy-redis to make deduplication much cheaper (I have not done this yet and still need to learn it); a minimal illustrative sketch follows this list.
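As a purely illustrative sketch of the deduplication idea (not the actual scrapy-redis/Bloom-filter integration), an in-memory Bloom filter keyed on source_id could look like this; the size and hash count are arbitrary example values:

# Illustrative in-memory Bloom filter for source_id deduplication; not the
# scrapy-redis integration itself, just the underlying idea.
import hashlib


class SimpleBloomFilter:

    def __init__(self, size=1 << 20, hash_count=5):
        self.size = size
        self.hash_count = hash_count
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, value):
        # Derive hash_count bit positions from md5 digests of the salted value.
        for i in range(self.hash_count):
            digest = hashlib.md5('{}:{}'.format(i, value).encode()).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, value):
        for pos in self._positions(value):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, value):
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(value))

An item's source_id would then be checked with `if item['source_id'] not in bloom_filter` before querying the database, at the cost of a small false-positive rate.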

posted @ 2020-11-11 12:07  脱下长日的假面