爬取瓜子二手车代码
settings.py 中
ROBOTSTXT_OBEY = False
guazispider.py
import json
from ..items import CarItem
import scrapy
from fake_headers import Headers

# Module-level header generator: ``header.generate()`` returns a fresh,
# randomized Chrome-on-Windows request-header dict for every request,
# reducing the chance of being blocked for a static User-Agent.
header = Headers(
    browser='chrome',
    os='win',
    headers=True
)
class guaziSpider(scrapy.Spider):
    """Crawl used-car listings from guazi.com's mobile JSON API.

    The API obfuscates price, down payment and mileage with one of five
    custom web fonts.  Each ``fontN_num_map`` maps a font glyph to the
    ASCII digit it displays.  Because the font actually in use is not
    known, every table is tried and the plausible decoding is chosen by
    a sanity check (down payment below the full price and roughly at
    most 30% of it).

    NOTE(review): the glyph keys in the maps below render as empty
    strings in this copy -- presumably private-use-area font characters
    stripped when the code was pasted.  Restore the real glyphs from the
    original source before running, otherwise each dict literal
    collapses to a single entry.
    """

    name = 'guazi'
    allowed_domains = ['guazi.com']
    # WAP listing endpoint; ``{}`` receives the page number.  The
    # city_filter/city/guazi_city=12 parameters pin the crawl to one city.
    url_format = 'https://mapi.guazi.com/car-source/carList/pcList?page={}&pageSize=12&city_filter=12&city=12&guazi_city=12&tag_types=18&versionId=0.0.0.0&osv=Unknown&platfromSource=wap'
    # Glyph -> digit table for candidate obfuscation font #0.
    font0_num_map = {
        "": "0",
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "": "6",
        "": "7",
        "": "8",
        "": "9"
    }
    # Glyph -> digit table for candidate obfuscation font #1.
    font1_num_map = {
        "": "0",
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "": "6",
        "": "7",
        "": "8",
        "": "9"
    }
    # Glyph -> digit table for candidate obfuscation font #2.
    font2_num_map = {
        "": "0",
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "": "6",
        "": "7",
        "": "8",
        "": "9"
    }
    # Glyph -> digit table for candidate obfuscation font #3.
    font3_num_map = {
        '': '0',
        '': '1',
        '': '2',
        '': '3',
        '': '4',
        '': '5',
        '': '6',
        '': '7',
        '': '8',
        '': '9'
    }
    # Glyph -> digit table for candidate obfuscation font #4.
    font4_num_map = {
        "": "0",
        "": "1",
        "": "2",
        "": "3",
        "": "4",
        "": "5",
        "": "6",
        "": "7",
        "": "8",
        "": "9"
    }
    # It is unknown which of the five encodings a response uses, so all
    # five are applied and validated via: down payment < full price and
    # down payment / full price ~ 30%.
    # Earlier single-font version, kept for reference:
    # def decode_num(self, text: str):
    #     result = text
    #     for key in self.font1_num_map:
    #         result = result.replace(key, self.font1_num_map.get(key))
    #     return result
    # All five glyph tables, index-aligned with the candidate decodings.
    font_num_map_list = [font0_num_map, font1_num_map, font2_num_map, font3_num_map, font4_num_map]

    def decode_num_with_font_list(self, text):
        """Decode *text* with each glyph table; return the candidate
        strings as a list index-aligned with ``font_num_map_list``."""
        value_list = []
        for l in self.font_num_map_list:
            value_list.append(self.decode_num(text, l))
        return value_list

    def decode_num(self, text: str, l) -> str:
        """Replace every glyph key of table *l* found in *text* with the
        ASCII digit it maps to."""
        result = text
        for key in l:
            result = result.replace(key, l.get(key))
        return result

    def predict_best_index(self, price_list, first_pay_list):
        """Return the index of the most plausible of the five decodings.

        *price_list* / *first_pay_list* hold five candidate strings such
        as ``'12.5万'``.  A candidate is rejected when its down payment is
        not below the full price, or exceeds ~30% of it.  Returns the
        first surviving index, or 0 when every candidate was rejected.
        """
        list1 = [0, 1, 2, 3, 4]  # candidate indices still considered plausible
        for i in range(5):
            # "万" = 10,000 CNY; the numeric part precedes the unit.
            price = float(price_list[i].split("万")[0])
            first_pay = 0
            if first_pay_list[i] != '':
                first_pay = float(first_pay_list[i].split("万")[0])
            # Down payment expressed in tenths of the full price.
            # NOTE(review): raises ZeroDivisionError if a candidate price
            # decodes to 0 -- confirm upstream data can never yield that.
            percent = int(first_pay * 10 / price)
            if first_pay >= price or percent > 3:
                list1.remove(i)
        if len(list1) != 0:
            return list1[0]
        return 0

    def start_requests(self):
        # Pages 1..19; widen or narrow the range to control how many
        # listing pages are crawled.
        for i in range(1, 20):
            url = self.url_format.format(i)
            yield scrapy.Request(url=url, headers=header.generate(), callback=self.parse)

    def parse(self, response):
        """Parse one JSON listing page; yield a ``CarItem`` per car."""
        jsonObj = json.loads(response.text)
        data = jsonObj.get('data', None)
        if data is not None:
            postList = data.get('postList', None)
            if postList is not None:
                for car in postList:
                    title = car['title']
                    # Some listings lack buyOutPrice, so it must be read
                    # with ``get`` and defaulted to '0万'.
                    buyOutPrice = car.get('buyOutPrice', '0万')
                    # These three fields are font-obfuscated and need decoding.
                    price = car['price']
                    first_pay = car['first_pay']
                    road_haul = car['road_haul']
                    decode_price_list = self.decode_num_with_font_list(price)
                    decode_first_pay_list = self.decode_num_with_font_list(first_pay)
                    decode_road_haul_list = self.decode_num_with_font_list(road_haul)
                    print(title, price, first_pay, road_haul, buyOutPrice)
                    print(title, decode_price_list, decode_first_pay_list, decode_road_haul_list, buyOutPrice)
                    index = self.predict_best_index(decode_price_list, decode_first_pay_list)
                    print("预测后的最佳价格为:", decode_price_list[index], decode_first_pay_list[index],
                          decode_road_haul_list[index])
                    # Hand the decoded fields to the item.
                    # NOTE(review): rebinding ``car`` shadows the loop dict;
                    # safe only because the dict is not read after this point.
                    car = CarItem()
                    car['title'] = title
                    car['buyOutPrice'] = buyOutPrice
                    car['price'] = decode_price_list[index]
                    car['first_pay'] = decode_first_pay_list[index]
                    car['road_haul'] = decode_road_haul_list[index]
                    yield car
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class SpiderdemoItem(scrapy.Item):
    """Unused scaffold item left over from ``scrapy startproject``."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class CarItem(scrapy.Item):
    """One used-car listing with font-deobfuscated numeric fields."""
    title = scrapy.Field()        # listing title
    price = scrapy.Field()        # decoded full price string, e.g. '12.5万'
    first_pay = scrapy.Field()    # decoded down-payment string
    road_haul = scrapy.Field()    # decoded mileage string
    buyOutPrice = scrapy.Field()  # buy-out price; defaults to '0万' when absent
pipelines.py
class SpiderdemoPipeline:
    """Default no-op pipeline: every item is forwarded unchanged."""

    def process_item(self, item, spider):
        """Return *item* as-is so later pipeline stages and the feed
        exporter receive it untouched."""
        return item
init.py
from scrapy import cmdline

# Plain run without export, kept for reference:
# cmdline.execute("scrapy crawl guazi".split())
# Run the spider and export the scraped items to cars.csv
# (-O overwrites the file on each run).
# NOTE(review): this executes at import time; if this module is ever
# imported rather than run directly, guard it with
# ``if __name__ == "__main__":``.
cmdline.execute("scrapy crawl guazi -O cars.csv".split())