- Car_Name 车辆型号
- Selling_Price(lacs) 车主想要出售汽车的价格
- Present_Price(lacs) 这是该车当前的出厂价
- Kms_Driven 汽车行驶的公里数
- Fuel_Type 汽车燃料类型(汽油/柴油/CNG/LPG/电动)
- Seller_Type 个人还是经销商
- Transmission 汽车齿轮传动(自动/手动)
- Past_Owners 汽车的前车主数量
- Year 购车年份
1 #import libraries 2 import pandas as pd 3 import numpy as np 4 import matplotlib.pyplot as plt 5 import seaborn as sns 6 import os 7 import warnings 8 %matplotlib inline 9 pd.set_option("display.max_rows", None,"display.max_columns", None) 10 warnings.simplefilter(action='ignore') 11 plt.style.use('seaborn') 12 #load dataset 13 df_main = pd.read_csv('./car data.csv') 14 df_main.head()

数据集共有 301 条记录、9 个特征。
# Summary statistics of the numeric columns
df_main.describe()

# Number of missing values in each column
df_main.isnull().sum()
# 3x2 grid: average selling price per category, plus price against distance driven.
fig, axes = plt.subplots(nrows=3, ncols=2)
fig.set_size_inches(25, 13)

price = df_main['Selling_Price']
bar_columns = ['Year', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
# axes.flat walks the grid row by row, so the bar plots fill the first five
# panels in the listed order.
for column, panel in zip(bar_columns, axes.flat):
    sns.barplot(x=df_main[column], y=price, ax=panel)

# The last panel (bottom right) is a scatter plot rather than a bar plot.
sns.scatterplot(x=df_main['Kms_Driven'], y=price, ax=axes[2][1])

- 随着车龄增加，价格开始下降；年份图看起来不成比例，是因为各年份的数据量并不均衡，否则应能看到近似线性的趋势。
- 与其他汽车相比,柴油车的价格最高,这很奇怪,因为柴油车有特定的使用寿命。
- 经销商出售的自动挡汽车价格较高，这是有道理的，因为很少有人喜欢在驾驶时手动换挡。
- 我们可以清楚地看到，并不是所有属性的数据都是均匀分布的。
- 例如：2003 年至 2011 年的数据很少，2018 年的数据也很少；与汽油车相比，CNG 车的数据也非常少。
# Top panel: selling price per year, split by transmission.
# Bottom panel: selling price against the present price.
fig, (ax1, ax2) = plt.subplots(nrows=2)
fig.set_size_inches(22, 15)
sns.barplot(data=df_main, x='Year', y='Selling_Price', hue='Transmission', ax=ax1)
sns.scatterplot(data=df_main, x='Present_Price', y='Selling_Price', ax=ax2)
# Convert the purchase year into the vehicle's age in years. The original
# author notes that the dataset dates from 2020, hence age = 2020 - Year.
# The Year column is dropped afterwards.
df_main['Age'] = 2020 - df_main['Year']
df_main.drop(columns='Year', inplace=True)

# Put the unit (lacs) into the price column names and rename Owner.
renamed_columns = {
    'Selling_Price': 'Selling_Price(lacs)',
    'Present_Price': 'Present_Price(lacs)',
    'Owner': 'Past_Owners',
}
df_main.rename(columns=renamed_columns, inplace=True)
cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Past_Owners']

# Draw the count plots two per figure, side by side. Iterating over slices of
# the list (instead of a hand-maintained index compared against a hard-coded 4)
# keeps this correct when columns are added or removed, including an odd count.
for start in range(0, len(cat_cols), 2):
    fig = plt.figure(figsize=[10, 4])
    for position, column in enumerate(cat_cols[start:start + 2], start=1):
        plt.subplot(1, 2, position)
        sns.countplot(x=column, data=df_main)
    plt.show()
num_cols = ['Selling_Price(lacs)', 'Present_Price(lacs)', 'Kms_Driven', 'Age']

# Draw the box plots two per figure, side by side. Iterating over slices of
# the list (instead of a hand-maintained index compared against a hard-coded 4)
# keeps this correct when columns are added or removed, including an odd count.
for start in range(0, len(num_cols), 2):
    fig = plt.figure(figsize=[13, 3])
    for position, column in enumerate(num_cols[start:start + 2], start=1):
        plt.subplot(1, 2, position)
        sns.boxplot(x=column, data=df_main)
    plt.show()
def num_summary(dataframe, numerical_col):
    """Print descriptive statistics for one column of *dataframe*.

    The report is ``describe()`` output evaluated at the 5th percentile and
    at every decile from the 10th to the 90th.

    Args:
        dataframe: Table holding the column.
        numerical_col: Label of the column to summarise.
    """
    percentile_points = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90]
    column_stats = dataframe[numerical_col].describe(percentiles=percentile_points)
    print(column_stats.T)


# Summarise the two price columns and the distance driven.
for num_col in ('Present_Price(lacs)', 'Selling_Price(lacs)', 'Kms_Driven'):
    num_summary(df_main, num_col)
# Correlate only numeric/boolean columns. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string columns,
# so the selection is made explicitly here (this also works on older pandas).
numeric_corr = df_main.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True, cmap="RdBu")
plt.show()

# Correlation of every numeric feature with the target
numeric_corr['Selling_Price(lacs)']
# Mean selling price for each seller type / fuel type combination
df_main.pivot_table(index='Seller_Type', columns='Fuel_Type',
                    values='Selling_Price(lacs)', aggfunc='mean')

# Mean selling price for each seller type / transmission combination
df_main.pivot_table(index='Seller_Type', columns='Transmission',
                    values='Selling_Price(lacs)', aggfunc='mean')
from sklearn.model_selection import train_test_split

# Target: the asking price.
y = df_main['Selling_Price(lacs)']

# Features: everything except the target. The scikit-learn regressors used
# later need numeric input, so the free-text model name is dropped and the
# remaining categorical columns are one-hot encoded. If the frame was already
# encoded earlier, both steps are no-ops (errors='ignore' tolerates a missing
# Car_Name and get_dummies leaves numeric columns untouched).
X = df_main.drop(columns=['Selling_Price(lacs)', 'Car_Name'], errors='ignore')
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print("x train: ", X_train.shape)
print("x test: ", X_test.shape)
print("y train: ", y_train.shape)
print("y test: ", y_test.shape)
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Per-model scores, appended by car_pred_model in call order.
CV = []
R2_train = []
R2_test = []


def car_pred_model(model, model_name):
    """Fit *model*, record its R2 scores and draw two diagnostic plots.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and appends one rounded value each to the module-level ``R2_train``,
    ``R2_test`` and ``CV`` lists.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        model_name: Not used by this function; kept so existing calls keep working.
    """
    # Model training
    model.fit(X_train, y_train)

    # R2 score on the training set
    y_pred_train = model.predict(X_train)
    R2_train_model = r2_score(y_train, y_pred_train)
    R2_train.append(round(R2_train_model, 2))

    # R2 score on the test set
    y_pred_test = model.predict(X_test)
    R2_test_model = r2_score(y_test, y_pred_test)
    R2_test.append(round(R2_test_model, 2))

    # 5-fold cross-validated score on the training set
    cross_val = cross_val_score(model, X_train, y_train, cv=5)
    cv_mean = cross_val.mean()
    CV.append(round(cv_mean, 2))

    # Report the results
    print("Train R2-score :", round(R2_train_model, 2))
    print("Test R2-score :", round(R2_test_model, 2))
    print("Train CV scores :", cross_val)
    print("Train CV mean :", round(cv_mean, 2))

    fig, ax = plt.subplots(1, 2, figsize=(10, 4))

    # Left: density of the training residuals. sns.distplot is deprecated and
    # dropped from newer seaborn releases; kdeplot draws the same KDE curve
    # that distplot(hist=False) produced.
    ax[0].set_title('Residual Plot of Train samples')
    sns.kdeplot(y_train - y_pred_train, ax=ax[0])
    ax[0].set_xlabel('y_train - y_pred_train')

    # Right: actual vs. predicted values on the test set
    ax[1].set_title('y_test vs y_pred_test')
    ax[1].scatter(x=y_test, y=y_pred_test)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test')

    plt.show()
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression with default settings
lr = LinearRegression()
car_pred_model(model=lr, model_name="Linear_regressor.pkl")
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

ls = Lasso()
# Candidate regularisation strengths: 14 values log-spaced from 1e-3 to 1e3
alpha = np.logspace(-3, 3, num=14)
lasso_search_space = {"alpha": alpha}

# Randomised search over alpha, evaluated like any other model
ls_rs = RandomizedSearchCV(estimator=ls, param_distributions=lasso_search_space)
car_pred_model(model=ls_rs, model_name="lasso.pkl")
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

# Number of base learners (trees)
n_estimators = list(range(500, 1000, 100))
# Maximum tree depth
max_depth = list(range(4, 9, 4))
# Minimum number of samples required to split a node
min_samples_split = list(range(4, 9, 2))
# Minimum number of samples required at a leaf
min_samples_leaf = [1, 2, 5, 7]
# Number of features considered at each split. 'auto' was removed in
# scikit-learn 1.3; for a regressor it meant "all features", which 1.0
# expresses in every version.
max_features = [1.0, 'sqrt']

# Search space for the randomised search
param_grid = {"n_estimators": n_estimators,
              "max_depth": max_depth,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "max_features": max_features}

rf_rs = RandomizedSearchCV(estimator=rf, param_distributions=param_grid)

# Fit and score the search; without this call the forest is never evaluated
# and contributes nothing to the recorded scores.
car_pred_model(rf_rs, "random_forest.pkl")
准确度相当高，残差基本上都分布在 0 的周围；相比较而言，这说明随机森林的拟合能力很强。
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

gb = GradientBoostingRegressor()

# Learning rate
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of base learners (boosting stages)
n_estimators = list(range(500, 1000, 100))
# Maximum tree depth
max_depth = list(range(4, 9, 4))
# Minimum number of samples required to split a node
min_samples_split = list(range(4, 9, 2))
# Minimum number of samples required at a leaf
min_samples_leaf = [1, 2, 5, 7]
# Number of features considered at each split. 'auto' was removed in
# scikit-learn 1.3; it meant "all features", which 1.0 expresses in every
# version.
max_features = [1.0, 'sqrt']

# Search space for the randomised search
param_grid = {"learning_rate": learning_rate,
              "n_estimators": n_estimators,
              "max_depth": max_depth,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "max_features": max_features}

gb_rs = RandomizedSearchCV(estimator=gb, param_distributions=param_grid)
car_pred_model(gb_rs, "gradient_boosting.pkl")
# Display names of the evaluated models. Each entry needs a matching
# car_pred_model call earlier in the file, in the same order, because the
# score lists are filled in call order.
# NOTE(review): no Ridge run is visible in this part of the file; confirm it
# is executed elsewhere or remove the entry.
Technique = ["LinearRegression", "Ridge", "Lasso", "RandomForestRegressor", "GradientBoostingRegressor"]

score_columns = {
    'R Squared(Train)': R2_train,
    'R Squared(Test)': R2_test,
    'CV score mean(Train)': CV,
}

# Fail with an explicit message when the names and the recorded scores are
# out of step, instead of pandas' generic "arrays must be of the same length".
mismatched = {
    label: len(scores)
    for label, scores in score_columns.items()
    if len(scores) != len(Technique)
}
if mismatched:
    raise ValueError(
        "Technique lists {} models but these score lists have a different "
        "length: {}. Run car_pred_model once per listed model, in order.".format(
            len(Technique), mismatched
        )
    )

results = pd.DataFrame({'Model': Technique, **score_columns})
display(results)
print(len(Technique), len(R2_train), len(R2_test), len(CV))
