数学建模用/Python爬虫实战——爬取Kelley Blue Book(KBB二手车交易网站)的交易信息
首先这是本次建模的题目:
于是我们先用 Chrome 查看网页源码,还好比较简单,是静态页面:
在 Cars for Sale 这边选择 Used,然后右键选择“查看网页源代码”:
如图,黄色荧光笔标出的分别是二手出售价格、品牌、里程和型号,这些是我们需要且可以找到的数据。可以看到这些信息包含在 script 标签中,其 type 属性为 application/ld+json(截图中这部分似乎被遮住了,不过可以由结尾的 </script> 标签推断出这是 script 标签)。
以下给出代码,一共两个文件:
from bs4 import BeautifulSoup
import os
import requests
import sys
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message is written both to the stream that was ``sys.stdout`` when
    the instance was created and to a log file opened in append mode.
    Assigning an instance to ``sys.stdout`` therefore makes ``print`` output
    appear on the terminal and in the file.
    """

    def __init__(self, fileN="Default.log"):
        """Capture the current ``sys.stdout`` and open the log file.

        Args:
            fileN: Path of the log file; opened for appending as UTF-8.
        """
        self.terminal = sys.stdout
        self.log = open(fileN, "a", encoding="utf-8")

    def write(self, message):
        """Write *message* to the captured terminal stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams.

        The interpreter calls ``sys.stdout.flush()`` at shutdown, so this must
        really flush: otherwise buffered text can be missing from the log file.
        """
        self.terminal.flush()
        # The log may already have been closed via close(); flushing a closed
        # file would raise ValueError.
        if not self.log.closed:
            self.log.flush()

    def close(self):
        """Close the log file; the terminal stream is left untouched."""
        if not self.log.closed:
            self.log.close()
def main():
    """Download the KBB used-car listing page and save its prettified HTML.

    The page is fetched, parsed with the lxml parser, pretty-printed and then
    printed through a ``Logger``, so the text appears on the terminal and is
    appended to ``E:\\pretty.txt``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    os.chdir("E:\\")
    # Without a timeout requests waits indefinitely on an unresponsive server.
    result = requests.get(
        "https://www.kbb.com/cars-for-sale/cars/used-cars/?distance=none",
        timeout=30,
    )
    # Fail loudly instead of saving an error page as if it were the listing.
    result.raise_for_status()
    soup = BeautifulSoup(result.text, 'lxml')
    pretty = soup.prettify()

    logger = Logger("E:\\pretty.txt")
    sys.stdout = logger
    try:
        print(pretty)
    finally:
        # Restore the real stdout and close the file so that all text is on
        # disk before a second script reads it.
        sys.stdout = logger.terminal
        logger.log.close()


if __name__ == '__main__':
    main()
from bs4 import BeautifulSoup
import re
import pandas as pd
# Extract price / value / brand / model strings from the saved page and write
# them to result.csv.

# Compiled digit matcher; not referenced by the rest of this script.
pattern = re.compile(r'\d+')

p = []      # raw strings captured after  price": "
v = []      # raw strings captured after  value": "
brand = []  # strings captured after  brand": "
model = []  # strings captured after  model": "

# BeautifulSoup reads the whole file while constructing the tree, so the
# handle can be closed as soon as parsing is done.
with open("E:\\pretty.txt", 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'lxml')

for s in soup.body.find_all('script', type="application/ld+json"):
    text = s.string
    # .string is None when the tag does not hold exactly one text node;
    # re.findall would raise TypeError on it, so skip such tags.
    if text is None:
        continue
    p += re.findall('.*price": "(.*)",', text)
    v += re.findall('.*value": "(.*)"', text)
    brand += re.findall('.*brand": "(.*)"', text)
    model += re.findall('.*model": "(.*)"', text)

# Convert the captured numeric strings; float() raises ValueError on any
# capture that is not a valid number.
price = [float(n) for n in p]
value = [float(n) for n in v]

# NOTE(review): the four lists are matched independently, so they are only
# aligned row-by-row if every listing contains all four fields. pandas raises
# ValueError when the lists differ in length.
dataframe = pd.DataFrame({'price': price, 'value': value, 'brand': brand, 'model': model})
dataframe.to_csv('result.csv', index=False, sep=',')

# Read the file back to show what was actually written.
data = pd.read_csv('result.csv')
print(data)
END