Scraping used-car listings from the Guazi used-car site with Python Scrapy (Part 2)

Continuing from the previous post: https://www.cnblogs.com/achangblog/p/13956140.html
Step 5: Edit the middleware file middlewares.py and enable it
After cracking the JS code in the previous step and getting a valid response, the cracking routine can be copied into middlewares.py with only minor changes:

import re
import execjs
import datetime

class GuaziDownloaderMiddleware:

    def __init__(self):
        # Load the cracked anti-scraping JS once; it is reused for every blocked response.
        self.f = open('guazi.js', 'r', encoding='utf-8')
        self.content = self.f.read()

    def process_response(self, request, response, spider):
        html = response.text
        if '正在打开中,请稍后' in html:
            # The anti-scraping page embeds value=anti('<string>','<key>'); pull out both
            # arguments and run them through the cracked JS to compute the antipas value.
            pattern = re.compile(r'value=anti\(\'(.*?)\',\'(.*?)\'\)')
            string_ = pattern.search(html).group(1)
            key = pattern.search(html).group(2)
            js_compile = execjs.compile(self.content)
            value = js_compile.call('anti', string_, key)
            expire_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=2592000)
            expires = "; expires=" + expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
            # Attach the cookie and return the request so that Scrapy retries it.
            cookie = {'antipas': value, 'expires': expires, 'path': '/'}
            request.cookies = cookie
            return request
        elif response.status == 200:
            return response
        # elif '客官请求太频繁啦,请1分钟后重试' in html:
        #     print('!!! response.status是:{}'.format(response.status))
        #     print('request.url: {}'.format(request.url))
        #     print('请求频繁')
        #     return request

    def close_spider(self):
        self.f.close()

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Then enable this middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
   'guazi.middlewares.GuaziDownloaderMiddleware': 543,
}

Step 6: Analyze the page structure and write a first version of the spider file guazi_spider.py
①. Looking at the structure of the Guazi site, we can start crawling from https://www.guazi.com/www/buy and iterate over every city's used-car listings by collecting the per-city URLs; the city URL data is hidden inside a JS function and can be extracted with a regular expression.
②. Each city's listing page shows 40 cars per page with at most 50 pages, i.e. at most 2000 listings, which cannot cover all cars in a city, so we need to narrow the URL down by brand. Taking the city Anji as an example, clicking "大众" (Volkswagen) in the brand bar gives the URL https://www.guazi.com/anji/dazhong/ and the page reports "为您找到7802辆好车" (7802 cars found), still too many to display in full, so we subdivide further by series: clicking "速腾" (Sagitar) in the series bar gives https://www.guazi.com/anji/suteng/ with "为您找到890辆好车" (890 cars found), which is below 2000 and can be shown completely. We iterate over every brand this way, and whenever a brand has more than 2000 cars we drill down into its series; at that point the URL subdivision is done.
③. Next comes pagination: the page structure shows that every page except the last has a "next page" button, so following it in a loop yields every listing-page URL.
④. With each listing-page URL in hand, we can extract the detail-page URL of every car on that page and scrape the car's data from its detail page.
Based on this analysis, the first version of guazi_spider.py looks like this:

# -*- coding: utf-8 -*-
import scrapy
import re
import json
from guazi.items import GuaziItem
import pytesseract
import requests
from PIL import Image
from ..settings import DEFAULT_REQUEST_HEADERS


class GuaziSpiderSpider(scrapy.Spider):
    name = 'guazi_spider'
    allowed_domains = ['www.guazi.com']
    start_urls = ['https://www.guazi.com/www/buy']

    def parse(self, response):
        pattern = re.compile(r'cityLeft = (.*?);')
        city_left_dict = json.loads(pattern.search(response.text).group(1))
        pattern = re.compile(r'cityRight = (.*?);')
        city_right_dict = json.loads(pattern.search(response.text).group(1))
        city_domain_and_name_list = []
        for key, value in {**city_left_dict, **city_right_dict}.items():
            for city_detail in value:
                one_couple = {}
                if 'domain' in city_detail:
                    one_couple['domain'] = city_detail['domain']
                if 'name' in city_detail:
                    one_couple['name'] = city_detail['name']
                city_domain_and_name_list.append(one_couple)
        for city_domain_and_name in city_domain_and_name_list:
            city_name = city_domain_and_name['name']
            city_url = 'https://www.guazi.com/' + city_domain_and_name['domain'] + '/buy'
            yield scrapy.Request(url=city_url, callback=self.parse_city_detail, meta={'city': city_name},
                                 dont_filter=True)

    # For each city, collect the brand URLs
    def parse_city_detail(self, response):
        brand_list = response.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]//ul//li/p//a')
        for brand in brand_list:
            brand_url = response.urljoin(brand.xpath('./@href').get())
            yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta, dont_filter=True)

    # Handle a brand URL and decide whether to subdivide it by series
    def parse_brand_detail(self, response):
        car_count_pattern = re.compile(r'为您找到(.*)辆好车')
        car_count = int(car_count_pattern.search(response.xpath('//p[@class="result-p3"]/text()').get()).group(1))
        if 0 < car_count <= 2000:
            yield scrapy.Request(url=response.url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)
        # elif car_count > 2000:
        #     brand_list = response.xpath('//div[@class="screen"]/dl[position()=2]//ul//li/p//a')
        #     for brand in brand_list:
        #         brand_url = response.urljoin(brand.xpath('./@href').get())
        #         print('brand_url: ' + brand_url)
        #         yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta,
        #                              dont_filter=True)
        # else:
        #     pass

    # Handle the listing-page data and pagination
    def parse_car_list(self, response):
        car_list = response.xpath('//ul[@class="carlist clearfix js-top"]//li/a')
        for car in car_list:
            car_url = response.urljoin(car.xpath('./@href').get())
            yield scrapy.Request(car_url, callback=self.parse_car_detail, meta=response.meta, dont_filter=True)
        # next_page_a_label = response.xpath('//ul[@class="pageLink clearfix"]//li[last()]/a')
        # next_page_button = next_page_a_label.xpath('./span/text()').get()
        # if next_page_button == '下一页':
        #     next_url = response.urljoin(next_page_a_label.xpath('./@href').get())
        #     yield scrapy.Request(next_url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)

    # Parse the used-car detail page
    def parse_car_detail(self, response):
        item = GuaziItem()
        item['city'] = response.meta['city']
        item['title'] = ''.join(response.xpath('//h1[@class="titlebox"]/text()').getall()).strip()
        source_id_pattern = re.compile(r'车源号:(.*)')
        source_id_str = response.xpath('//div[@class="right-carnumber"]/text()').get()
        item['source_id'] = source_id_pattern.search(source_id_str).group(1).strip()
        license_time_img_url = response.xpath('//ul[@class="assort clearfix"]/li[@class="one"]/span/img/@src').get()
        # item['license_time'] = self.image_to_str(license_time_img_url)[0:7]
        item['license_time'] = license_time_img_url
        item['mileage'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="two"]/span/text()').get()
        item['displacement'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="three"]/span/text()').get()
        item['gearbox_type'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="last"]/span/text()').get()
        item['full_price'] = response.xpath('//span[@class="price-num"]/text()').get()
        yield item
    
    # OCR the image into text
    # def image_to_str(self, img_url):
    #     response = requests.get(url=img_url, headers=DEFAULT_REQUEST_HEADERS)
    #     with open('image.jpg', 'wb') as f:
    #         f.write(response.content)
    #     image = Image.open('image.jpg')
    #     return pytesseract.image_to_string(image)

PS: the "license registration time" field is rendered as a small image rather than as text; the pytesseract module can be used to OCR the text on the image and recover the actual time, or you can simply store the URL of the image.
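As a reference, here is a minimal standalone sketch of that OCR approach, assuming pytesseract, Pillow and requests are installed and the Tesseract binary is available; the image URL and headers passed in are placeholders for the values scraped from the page:

# Minimal OCR sketch (assumes pytesseract, Pillow and requests are installed).
import io

import requests
import pytesseract
from PIL import Image


def ocr_license_time(img_url, headers=None):
    # Download the small "license time" image and run it through Tesseract in memory.
    resp = requests.get(img_url, headers=headers, timeout=10)
    image = Image.open(io.BytesIO(resp.content))
    return pytesseract.image_to_string(image).strip()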

Step 7: Optimize the spider file guazi_spider.py
Since the code above re-parses the city URL data on every run, and that data never changes, we can parse the city URLs once, store them in MongoDB, and have the spider read them from the database directly.

First, create guazi_mongo.py in the same directory as items.py:

import pymongo


class GuaziMongo:

    def __init__(self):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.guazi_db = self.client['guazi']
        # Collection holding the pre-parsed city URLs and names.
        self.guazi_uncol = self.guazi_db['guazi_url_and_name']
        # Collection holding the scraped car items.
        self.guazi_item_col = self.guazi_db['guazi_item']


guaziMongo = GuaziMongo()

Then create guazi_get_city_url.py, also in the same directory as items.py:

# Fetch the URL and name of every city on Guazi and store them in the database as dicts like {'name': '北京', 'url': 'https://www.guazi.com/bj/buy'}
import requests
import re
import execjs
import datetime
import json
from guazi.guazi_mongo import guaziMongo


def get_city_name_and_url(response):
    pattern = re.compile(r'cityLeft = (.*?);')
    city_left_dict = json.loads(pattern.search(response.text).group(1))
    pattern = re.compile(r'cityRight = (.*?);')
    city_right_dict = json.loads(pattern.search(response.text).group(1))
    city_domain_and_name_list = []
    for key, value in {**city_left_dict, **city_right_dict}.items():
        for city_detail in value:
            one_couple = {}
            if 'domain' in city_detail:
                one_couple['domain'] = city_detail['domain']
            if 'name' in city_detail:
                one_couple['name'] = city_detail['name']
            city_domain_and_name_list.append(one_couple)
    city_name_and_url_list = []
    for city_domain_and_name in city_domain_and_name_list:
        city_name = city_domain_and_name['name']
        city_url = 'https://www.guazi.com/' + city_domain_and_name['domain'] + '/buy'
        city_name_and_url_list.append({'name': city_name, 'url': city_url})
    return city_name_and_url_list


headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

url = 'https://www.guazi.com/www/buy'
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = response.text
if '正在打开中,请稍后' in html:
    pattern = re.compile(r'value=anti\(\'(.*?)\',\'(.*?)\'\)')
    string_ = pattern.search(html).group(1)
    key = pattern.search(html).group(2)
    print('string_: {}, key: {}'.format(string_, key))
    with open('guazi.js', 'r', encoding='utf-8') as f:
        js_compile = execjs.compile(f.read())
        value = js_compile.call('anti', string_, key)
    name = 'antipas'
    expire_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=2592000)
    expires = "; expires=" + expire_time.strftime('%a, %d %b %Y %H:%M:%S GMT')
    cookie = name + "=" + value + expires + "; path=/"
    headers['Cookie'] = cookie
    response = requests.get(url, headers=headers)
    data_col = guaziMongo.guazi_uncol
    data = get_city_name_and_url(response)
    print(data)
    # data_col.delete_many({})
    data_col.insert_many(data)
    for data_ in data:
        print(data_)

Running guazi_get_city_url.py once inserts all the city URL data into the database.
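To double-check that the city URLs actually landed in MongoDB, a quick query sketch like the following can be used (assuming the same local MongoDB instance and collection names as in guazi_mongo.py, run from a location where the guazi package is importable):

# Quick sanity check of the stored city URLs.
from guazi.guazi_mongo import guaziMongo

count = guaziMongo.guazi_uncol.count_documents({})
print('stored city entries: {}'.format(count))
for doc in guaziMongo.guazi_uncol.find({}, {'_id': 0}).limit(5):
    print(doc)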

The spider file guazi_spider.py can then be modified as follows:

# -*- coding: utf-8 -*-
import scrapy
import re
import json
from guazi.items import GuaziItem
from guazi.guazi_mongo import guaziMongo
import pytesseract
import requests
from PIL import Image
from ..settings import DEFAULT_REQUEST_HEADERS


class GuaziSpiderSpider(scrapy.Spider):
    name = 'guazi_spider'
    allowed_domains = ['www.guazi.com']
    # start_urls = ['https://www.guazi.com/www/buy']

    def start_requests(self):
        # Pull city entries from MongoDB one at a time; find_one_and_delete consumes
        # the stored queue, so already-dispatched cities are not requested again.
        while True:
            city_url_and_name = guaziMongo.guazi_uncol.find_one_and_delete({})
            if city_url_and_name:
                city_url = city_url_and_name['url']
                city_name = city_url_and_name['name']
                yield scrapy.Request(url=city_url, callback=self.parse_city_detail, meta={'city': city_name},
                                     dont_filter=True)
            else:
                break

    # For each city, collect the brand URLs
    def parse_city_detail(self, response):
        brand_list = response.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]//ul//li/p//a')
        for brand in brand_list:
            brand_url = response.urljoin(brand.xpath('./@href').get())
            yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta, dont_filter=True)

    # Handle a brand URL and decide whether to subdivide it by series
    def parse_brand_detail(self, response):
        car_count_pattern = re.compile(r'为您找到(.*)辆好车')
        car_count = int(car_count_pattern.search(response.xpath('//p[@class="result-p3"]/text()').get()).group(1))
        if 0 < car_count <= 2000:
            yield scrapy.Request(url=response.url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)
        # elif car_count > 2000:
        #     brand_list = response.xpath('//div[@class="screen"]/dl[position()=2]//ul//li/p//a')
        #     for brand in brand_list:
        #         brand_url = response.urljoin(brand.xpath('./@href').get())
        #         print('brand_url: ' + brand_url)
        #         yield scrapy.Request(url=brand_url, callback=self.parse_brand_detail, meta=response.meta,
        #                              dont_filter=True)
        # else:
        #     pass

    # Handle the listing-page data and pagination
    def parse_car_list(self, response):
        car_list = response.xpath('//ul[@class="carlist clearfix js-top"]//li/a')
        for car in car_list:
            car_url = response.urljoin(car.xpath('./@href').get())
            yield scrapy.Request(car_url, callback=self.parse_car_detail, meta=response.meta, dont_filter=True)
        # next_page_a_label = response.xpath('//ul[@class="pageLink clearfix"]//li[last()]/a')
        # next_page_button = next_page_a_label.xpath('./span/text()').get()
        # if next_page_button == '下一页':
        #     next_url = response.urljoin(next_page_a_label.xpath('./@href').get())
        #     yield scrapy.Request(next_url, callback=self.parse_car_list, meta=response.meta, dont_filter=True)

    # Parse the used-car detail page
    def parse_car_detail(self, response):
        item = GuaziItem()
        item['city'] = response.meta['city']
        item['title'] = ''.join(response.xpath('//h1[@class="titlebox"]/text()').getall()).strip()
        source_id_pattern = re.compile(r'车源号:(.*)')
        source_id_str = response.xpath('//div[@class="right-carnumber"]/text()').get()
        item['source_id'] = source_id_pattern.search(source_id_str).group(1).strip()
        license_time_img_url = response.xpath('//ul[@class="assort clearfix"]/li[@class="one"]/span/img/@src').get()
        # item['license_time'] = self.image_to_str(license_time_img_url)[0:7]
        item['license_time'] = license_time_img_url
        item['mileage'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="two"]/span/text()').get()
        item['displacement'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="three"]/span/text()').get()
        item['gearbox_type'] = response.xpath('//ul[@class="assort clearfix"]/li[@class="last"]/span/text()').get()
        item['full_price'] = response.xpath('//span[@class="price-num"]/text()').get()
        yield item

    # def image_to_str(self, img_url):
    #     response = requests.get(url=img_url, headers=DEFAULT_REQUEST_HEADERS)
    #     with open('image.jpg', 'wb') as f:
    #         f.write(response.content)
    #     image = Image.open('image.jpg')
    #     return pytesseract.image_to_string(image)

Step 8: Configure pipelines.py to write the data into MongoDB

from guazi.guazi_mongo import guaziMongo

class GuaziPipeline:
    def process_item(self, item, spider):
        item = dict(item)
        source_id = item['source_id']
        # Deduplicate on the source_id: upsert so that an existing record with this
        # source_id is updated and a new record is inserted otherwise.
        guaziMongo.guazi_item_col.update_one({'source_id': source_id}, {'$set': item}, upsert=True)
        return item

Enable the pipeline in settings.py:

ITEM_PIPELINES = {
   'guazi.pipelines.GuaziPipeline': 300,
}

Step 9: Set up an IP proxy
Because all requests come from a single IP, the site may return a 203 "requests too frequent" page before much real data has been scraped, so a (paid) IP proxy can be configured in middlewares.py.
There are many proxy providers to choose from; here "亿牛云" is used as an example. Reference code for a Scrapy project is available at https://www.16yun.cn/help/ss_demo/#1python — copy that code into middlewares.py and enable the middleware in settings.py, as shown after the sketch below.
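The provider's actual code is not reproduced here; as a rough, generic sketch built only on Scrapy's standard request.meta['proxy'] and Proxy-Authorization mechanism, such a middleware might look like the following (the host, port, username and password are placeholders that would come from the provider):

# Generic proxy middleware sketch; PROXY_HOST/PORT/USER/PASS are placeholders
# to be replaced with the values supplied by the proxy provider.
import base64

PROXY_HOST = 'proxy.example.com'
PROXY_PORT = '31111'
PROXY_USER = 'username'
PROXY_PASS = 'password'


class ProxyMiddleware:

    def process_request(self, request, spider):
        # Route every request through the authenticated HTTP proxy.
        request.meta['proxy'] = 'http://{}:{}'.format(PROXY_HOST, PROXY_PORT)
        auth = base64.b64encode('{}:{}'.format(PROXY_USER, PROXY_PASS).encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + auth

With both middlewares in place, settings.py enables them like this: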

DOWNLOADER_MIDDLEWARES = {
   'guazi.middlewares.GuaziDownloaderMiddleware': 543,
   'guazi.middlewares.ProxyMiddleware': 500
}

Once that is configured, settings.py still needs a few common settings:

ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 5
DOWNLOAD_DELAY = 0.2
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

Step 10: Create the main file and run the spider project
Create guazi_main.py in the same directory as items.py:

from scrapy import cmdline

cmdline.execute('scrapy crawl guazi_spider'.split(' '))

The final project directory structure is shown in the figure below:

Run guazi_main.py and check the console output and the database contents (since I have no paid IP proxy and wanted the test to run quickly, the result below was obtained with the paid proxy disabled and with the brand drill-down and pagination code commented out):


As you can see, the data was scraped successfully and stored in MongoDB.

That concludes the project.

Possible follow-up optimizations:
1. Guazi hosts a lot of used-car data, so a distributed crawler could be deployed to speed up scraping (no results yet: after several days of debugging, the distributed setup still failed, possibly because of Python version differences; I will retry when I get the chance).
2. Deduplication when writing into MongoDB currently relies on the upsert behaviour of the update statement. That is fine for small data sets, but with a large amount of data the lookups become expensive and seriously slow down the crawl. A possible fix is to integrate a Bloom filter into scrapy-redis to make deduplication much cheaper (I have not done this yet and still need to learn it); a minimal illustrative sketch follows this list.
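As a purely illustrative sketch of the deduplication idea (not the actual scrapy-redis/Bloom-filter integration), an in-memory Bloom filter keyed on source_id could look like this; the size and hash count are arbitrary example values:

# Illustrative in-memory Bloom filter for source_id deduplication; not the
# scrapy-redis integration itself, just the underlying idea.
import hashlib


class SimpleBloomFilter:

    def __init__(self, size=1 << 20, hash_count=5):
        self.size = size
        self.hash_count = hash_count
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, value):
        # Derive hash_count bit positions from md5 digests of the salted value.
        for i in range(self.hash_count):
            digest = hashlib.md5('{}:{}'.format(i, value).encode()).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, value):
        for pos in self._positions(value):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, value):
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(value))

An item's source_id would then be checked with `if item['source_id'] not in bloom_filter` before querying the database, at the cost of a small false-positive rate.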

posted @ 2020-11-11 12:07  脱下长日的假面