House Price Prediction
https://www.kaggle.com/c/california-house-prices/data?select=train.csv
https://cloud.tencent.com/developer/article/1466590
1. Load the data and understand what each feature means
2. Plot the correlation heatmap; for each correlated feature, find the outliers and remove them
3. Process the remaining (non-numeric) features: e.g. fields that mix numbers and text are converted to numbers
4. Fill missing values; four strategies are used: fill with 0, fill with 'None', fill with the max/min/mean, fill with the mode (see the sketch after this list)
5. Select features
6. Train
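To make step 4 concrete, here is a tiny self-contained sketch of the four fill strategies on a toy frame (the real imputation for this dataset is in section 1.6.1 below):
import pandas as pd

# Toy illustration of the four missing-value strategies from step 4
toy = pd.DataFrame({'a': [1.0, None, 3.0], 'b': ['x', None, 'y']})
toy['a_zero'] = toy['a'].fillna(0)                   # fill with 0
toy['b_none'] = toy['b'].fillna('None')              # fill with the string 'None'
toy['a_mean'] = toy['a'].fillna(toy['a'].mean())     # fill with the mean (max/min work the same way)
toy['b_mode'] = toy['b'].fillna(toy['b'].mode()[0])  # fill with the mode
print(toy)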
import pandas as pd

# Load the Kaggle train/test splits and the sample submission
train_df = pd.read_csv("./0.hourse_prices_data/train.csv")
test_df = pd.read_csv("./0.hourse_prices_data/test.csv")
test_id = test_df["Id"]   # kept for building the submission file later
sample_submission_df = pd.read_csv("./0.hourse_prices_data/sample_submission.csv")
print(train_df.shape)
print(test_df.shape)
print(sample_submission_df.shape)
1. EDA
1.1 Check missing values
print("***************** train data missing ")
print(train_df.isnull().sum())
print("* test data missing ****************")
print(test_df.isnull().sum())
1.2 Correlation analysis of numeric features
1.2.1 Correlation analysis
Features showing a [linear correlation] with the target:
- Bathrooms / Full bathrooms
- the 3 school scores (it seems American buyers also care about the schools nearby)
- the 2 tax features
- Listed Price (the listing price)
- Last Sold Price (the previous sale price)
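The same conclusion can be read off numerically by ranking the correlations with the target; a quick check (numeric_only assumes pandas >= 1.5):
# Rank the numeric features by their correlation with Sold Price
corr_with_target = (train_df.drop(['Id'], axis=1)
                            .corr(numeric_only=True)['Sold Price']
                            .sort_values(ascending=False))
print(corr_with_target.head(12))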
import matplotlib.pyplot as plt
import seaborn as sns

colormap = plt.cm.viridis
# Heatmap over all numeric features (Id dropped), then the same heatmap with Zip dropped
# as well, since Zip is an identifier rather than a quantity. Each heatmap gets its own figure.
# numeric_only=True (pandas >= 1.5) restricts .corr() to numeric columns.
plt.figure(figsize = (20, 20))
sns.heatmap(train_df.drop(['Id'], axis=1).corr(numeric_only=True), square = True, cmap = colormap, linecolor = 'white', annot = True)
plt.show()
plt.figure(figsize = (20, 20))
sns.heatmap(train_df.drop(['Id', 'Zip'], axis=1).corr(numeric_only=True), square = True, cmap = colormap, linecolor = 'white', annot = True)
plt.show()
1.2.2 Cleaning the correlated features
Remove the outlier points found in the correlated features identified above.
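Before the scatterplots, it can also help to list the candidate rows directly; a minimal sketch using the same 50,000,000 Sold Price threshold as the annotation loops below:
# Rows with an extreme Sold Price, listed with their Id so they can be cross-checked
# against the points highlighted in the scatterplots that follow
suspects = train_df.loc[train_df['Sold Price'] > 50_000_000, ['Id', 'Sold Price', 'Listed Price']]
print(suspects.sort_values('Sold Price', ascending=False))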
########## Bathrooms
train_df.columns
all_df = train_df.loc[:, ["Sold Price", "Bathrooms"]]
plt.figure(figsize = (20, 6))
sns.scatterplot(data=all_df, x = 'Sold Price', y = 'Bathrooms')
plt.show()
######### Tax assessed value ######## 3674 43398
all_df = train_df.loc[:, ["Sold Price", "Tax assessed value"]]
plt.figure(figsize = (20, 6))
sns.scatterplot(data=all_df, x = 'Sold Price', y = 'Tax assessed value')
for i in range(train_df.shape[0]):
    if train_df.iloc[i].at['Sold Price'] and int(train_df.iloc[i].at['Sold Price']) > 50000000:
        plt.text(train_df.iloc[i].at['Sold Price']+200, train_df.iloc[i].at['Tax assessed value'], train_df.iloc[i].at['Id'])
plt.show()
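For reference, the annotation loop above can also be written with a boolean mask instead of indexing row by row; a sketch that would replace the for loop before plt.show():
# Label every point whose Sold Price exceeds the 50,000,000 threshold
outliers = train_df[train_df['Sold Price'] > 50_000_000]
for _, row in outliers.iterrows():
    plt.text(row['Sold Price'] + 200, row['Tax assessed value'], row['Id'])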
######### Annual tax amount ######## 3674 43398
all_df = train_df.loc[:, ["Sold Price", "Annual tax amount"]]
plt.figure(figsize = (20, 6))
sns.scatterplot(data=all_df, x = 'Sold Price', y = 'Annual tax amount')
for i in range(train_df.shape[0]):
    if train_df.iloc[i].at['Sold Price'] and int(train_df.iloc[i].at['Sold Price']) > 50000000:
        plt.text(train_df.iloc[i].at['Sold Price']+200, train_df.iloc[i].at['Annual tax amount'], train_df.iloc[i].at['Id'])
plt.show()
######### Listed Price ######## 3674 43398 44633
all_df = train_df.loc[:, ["Sold Price", "Listed Price"]]
plt.figure(figsize = (20, 6))
sns.scatterplot(data=all_df, x = 'Sold Price', y = 'Listed Price')
for i in range(train_df.shape[0]):
    if (train_df.iloc[i].at['Sold Price'] and int(train_df.iloc[i].at['Sold Price']) > 50000000) or int(train_df.iloc[i].at['Listed Price']) > 100000000:
        plt.text(train_df.iloc[i].at['Sold Price']+200, train_df.iloc[i].at['Listed Price'], train_df.iloc[i].at['Id'])
plt.show()
######### Last Sold Price ######## 3674 43398 44633
all_df = train_df.loc[:, ["Sold Price", "Last Sold Price"]]
plt.figure(figsize = (20, 6))
sns.scatterplot(data=all_df, x = 'Sold Price', y = 'Last Sold Price')
for i in range(train_df.shape[0]):
    # pd.notna() is used instead of a bare truthiness test: NaN is truthy in Python,
    # so int(NaN) would raise for the many missing Last Sold Price values.
    if pd.notna(train_df.iloc[i].at['Sold Price']):
        if int(train_df.iloc[i].at['Sold Price']) > 50000000:
            plt.text(train_df.iloc[i].at['Sold Price']+200, train_df.iloc[i].at['Last Sold Price'], train_df.iloc[i].at['Id'])
    if pd.notna(train_df.iloc[i].at['Last Sold Price']):
        if int(train_df.iloc[i].at['Last Sold Price']) > 28000000:
            plt.text(train_df.iloc[i].at['Sold Price']+200, train_df.iloc[i].at['Last Sold Price'], train_df.iloc[i].at['Id'])
plt.show()
train = train_df.drop([3674,6055,32867,34876,43398,44091,44633]).reset_index(drop=True)
label = train["Sold Price"]
feature_train = train.drop(["Sold Price"], axis = 1)
features = pd.concat([feature_train, test_df]).reset_index(drop=True)  # reset so train and test rows do not share index labels
label, feature_train.shape, test_df.shape, features.shape
1.3 Text feature processing
1.3.1 Bedrooms
def proc_bedrooms(x):
    # Some Bedrooms entries are text such as "Walk-in Closet, Dining Room, ...":
    # count the comma-separated rooms, not counting "Walk-in Closet" as a bedroom.
    if not pd.isna(x) and not x.isdigit():
        rooms = x.split(",")
        rooms_len = len(rooms)
        if "Walk-in Closet" in rooms:
            rooms_len -= 1
        return rooms_len
    return x

features["Bedrooms"] = features["Bedrooms"].apply(proc_bedrooms)
features["Bedrooms"] = pd.to_numeric(features["Bedrooms"])
features["Sold Price"] = label
features[['Bedrooms', 'Sold Price']].corr()
features.drop(["Sold Price"], axis = 1)
1.3.2 City
# Label-encode City: pd.factorize maps each distinct city name to an integer code
features['City'], _ = pd.factorize(features['City'])
features['City'].value_counts()[:20], features['City'][:20]
1.3.3 Parking
park_garage = ['Garage']
park_attached = ['Garage - Attached','Attached','Attached Carport']
park_detached = ['Garage - Detached']
park_driveway = ['Driveway']
park_coverd = ['Covered']
no_park = ['0 spaces','No Garage']

features['Parking'] = features['Parking'].astype('str')

# Binary flags: does the comma-separated Parking string contain any keyword from each group?
# Tokens are stripped of surrounding whitespace so "Garage, Driveway" matches both groups.
def parking_flag(x, keywords):
    return 1 if {t.strip() for t in x.split(',')} & set(keywords) else 0

features['park_garage'] = features['Parking'].apply(lambda x: parking_flag(x, park_garage))
features['park_attached'] = features['Parking'].apply(lambda x: parking_flag(x, park_attached))
features['park_detached'] = features['Parking'].apply(lambda x: parking_flag(x, park_detached))
features['park_driveway'] = features['Parking'].apply(lambda x: parking_flag(x, park_driveway))
features['park_coverd'] = features['Parking'].apply(lambda x: parking_flag(x, park_coverd))
features['no_park'] = features['Parking'].apply(lambda x: parking_flag(x, no_park))
1.3.4 Type
============= new type =============
Build the type mapping
type_map = {
'Condo': 'Condo',
'Townhouse': 'Townhouse',
'Unknown': 'Unknown',
'MultiFamily': 'MultiFamily',
'MobileManufactured': 'MobileManufactured',
'VacantLand': 'VacantLand'
}
type_map.update(dict.fromkeys(['SingleFamily','Single Family'],'SingleFamily'))
Apply the mapping
features['new type'] = features['Type'].map(type_map)
new_types_set = ['SingleFamily','Condo','Townhouse','Unknown','MultiFamily','MobileManufactured','VacantLand']
features['new type'] = features['new type'].apply(lambda x: 'Other' if x not in new_types_set else x)
get dummy
type_dummy_df = pd.get_dummies(features['new type'], prefix = 'type')
features = pd.concat([features, type_dummy_df], axis = 1)
features = features.drop(['Type','new type'], axis = 1)
# Keep only the type_SingleFamily indicator; the other dummy columns are dropped again
features = features.drop(['type_Condo','type_Townhouse','type_Unknown','type_MultiFamily','type_MobileManufactured','type_VacantLand','type_Other'], axis = 1)
print('Process : type')
1.4 Missing value overview and target transform
null_data = pd.DataFrame(features.isnull().sum().sort_values(ascending=False), columns=["Total"])
null_data["percent %"] = (null_data["Total"] / features.shape[0]) * 100
null_data
import numpy as np
I want to thank @masumrumi for sharing this amazing plot!
def plotting_3_chart(df, feature):
    ## Importing seaborn, matplotlib and scipy modules.
    import seaborn as sns
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    from scipy import stats
    import matplotlib.style as style
    style.use('fivethirtyeight')

    ## Creating a customized chart and giving in figsize and everything.
    fig = plt.figure(constrained_layout=True, figsize=(12,8))
    ## Creating a grid of 3 cols and 3 rows.
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)

    ## Customizing the histogram grid.
    ax1 = fig.add_subplot(grid[0, :2])
    ## Set the title.
    ax1.set_title('Histogram')
    ## Plot the histogram (sns.distplot is deprecated in recent seaborn; histplot replaces it).
    sns.histplot(df.loc[:, feature], kde=True, stat='density', ax=ax1)

    ## Customizing the QQ plot.
    ax2 = fig.add_subplot(grid[1, :2])
    ## Set the title.
    ax2.set_title('QQ_plot')
    ## Plotting the QQ plot.
    stats.probplot(df.loc[:, feature], plot=ax2)

    ## Customizing the box plot.
    ax3 = fig.add_subplot(grid[:, 2])
    ## Set title.
    ax3.set_title('Box Plot')
    ## Plotting the box plot.
    sns.boxplot(y=df.loc[:, feature], ax=ax3)

    plt.show()
Fix the target variable: Sold Price is heavily right-skewed, so apply log1p before modeling.
y = np.log1p(label)
plotting_3_chart(pd.DataFrame(y), 'Sold Price')
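A quick numeric check of why the log transform helps, using scipy.stats.skew on the label defined earlier:
from scipy.stats import skew

# Skewness of the raw vs. log1p-transformed target: log1p pulls in the long right tail
print('skew(raw target):  ', skew(label.dropna()))
print('skew(log1p target):', skew(np.log1p(label.dropna())))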
1.5 Constructing new features
price per bedroom
features['price per bedroom'] = features['Listed Price'] / (features['Bedrooms'] + 1)
price per bathroom
features['price per bathroom'] = features['Listed Price'] / (features['Bathrooms'] + 1)
price per full bathrooms
features['price per fullbath'] = features['Listed Price'] / (features['Full bathrooms'] + 1)
total room
features['total room'] = features['Bedrooms'] + features['Bathrooms']
price per room
features['price per room'] = features['Listed Price'] / (features['total room'] + 1)
Check the correlation coefficients
features[['Sold Price','price per bedroom','price per bathroom','price per fullbath','total room','price per room']].corr()
features['listed on year'] = features['Listed On'].str.extract(r'(\d{4})')
features['listed on year']
1.5.1 score sum
From the earlier correlation heatmap, the three school scores are strongly correlated with one another; putting all three into the model could introduce collinearity.
So the three school scores are combined into a single feature, score sum.
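A quick check of the pairwise correlations of the three raw score columns backs this up:
# Pairwise correlation of the three school scores (.corr() handles missing values pairwise)
features[['Elementary School Score', 'Middle School Score', 'High School Score']].corr()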
Correlation before imputation
features['score sum'] = features['Elementary School Score'] + features['Middle School Score'] + features['High School Score']
Use a random forest model to fill the missing score sum values
from sklearn.ensemble import RandomForestRegressor

score_df = features[['score sum','Elementary School Score','Middle School Score','High School Score']]
# Rows where score sum is present are used for training; rows with a missing sum are imputed.
score_notnull_df = score_df.loc[score_df['score sum'].notnull()]
score_isnull_df = score_df.loc[score_df['score sum'].isnull()].copy()
score_notnull_df, score_isnull_df, "***"
score_X = score_notnull_df.values[:, 1:]
score_Y = score_notnull_df.values[:, 0]
# The predictor columns may themselves be missing; fill them with their median before predicting.
for col in ['Elementary School Score','Middle School Score','High School Score']:
    score_isnull_df[col] = score_isnull_df[col].fillna(score_isnull_df[col].median())
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(score_X, score_Y)
score_pred = rf.predict(score_isnull_df.values[:, 1:])
features.loc[features['score sum'].isnull(), ['score sum']] = score_pred
features[['score sum', 'Sold Price']].corr(), features['score sum'].describe(), features['score sum']
Correlation after imputation
features[['score sum', 'Sold Price']].corr()
1.5.2 time interval
Based on Year built, construct a feature for how long the property had existed when it was listed:
time interval = listed on year - Year built
The correlation with the target turned out to be weak, so this feature was not added in the end.
# 1920 is used as a fallback year for missing Year built
features['Year built'] = features['Year built'].fillna(1920)
features['time interval'] = features['listed on year'].astype('int') - features['Year built']
features[['Sold Price', 'time interval']].corr(), features[['Sold Price', 'Year built']].corr()
1.6 Feature selection
from scipy import stats

def fix_skew(features):
    """
    This function takes in a dataframe and returns it with its skewed numeric columns fixed.
    """
    ## Import necessary modules
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    ## Getting all the data that are not of "object" type.
    numerical_columns = features.select_dtypes(include=['int64','float64']).columns

    # Check the skew of all numerical features
    skewed_features = features[numerical_columns].apply(lambda x: stats.skew(x)).sort_values(ascending=False)
    high_skew = skewed_features[abs(skewed_features) > 0.5]
    skewed_features = high_skew.index

    # Perform the Box-Cox transformation on the highly skewed columns
    for column in skewed_features:
        features[column] = boxcox1p(features[column], boxcox_normmax(features[column] + 1))
    return features
def reset_zero(x):
    # Clip negative space counts at 0
    return max(x, 0)

features['Garage spaces'] = features['Garage spaces'].apply(reset_zero)
features['Total spaces'] = features['Total spaces'].apply(reset_zero)
features = fix_skew(features)
features.head()
x = features.iloc[:len(y), :]
x_test = features.iloc[len(y):, :]
selected=['Listed Price','Tax assessed value','Annual tax amount','Listed On','Elementary School Distance','Last Sold On',
'Zip','Total interior livable area','Last Sold Price','Lot','Year built','Bathrooms','High School Distance',
'Elementary School Score','Full bathrooms','Middle School Distance','Heating features','Bedrooms',
'Elementary School','Laundry features','Region','Middle School Score','Type',
'Total spaces','High School Score','Parking', 'score sum', 'price per bedroom','price per bathroom','price per fullbath','total room','price per room']
x=x[selected]
x_test=x_test[selected+['Id']]
x.head()
1.6.1 Missing value imputation
def handle_missing(features):
    # Four fill strategies: 0, the string 'None', the column max, or the column mode
    zero_fill = ['Last Sold Price','Lot','Full bathrooms','Annual tax amount','Tax assessed value','Bathrooms',
                 'Bedrooms','Total interior livable area','Total spaces','Garage spaces']
    none_fill = ['Last Sold On','Middle School','Appliances included','Flooring','Laundry features','Cooling features',
                 'Cooling','Heating features','Heating','Elementary School','High School','Parking features','Parking','Summary']
    max_fill = ['Middle School Score','Middle School Distance','Elementary School Score','Elementary School Distance',
                'High School Score','High School Distance']
    mode_fill = ['Year built','Region', 'score sum', 'price per bedroom','price per bathroom','price per fullbath','total room','price per room']
    for c in zero_fill:
        features[c] = features[c].fillna(0)
    for c in max_fill:
        features[c] = features[c].fillna(features[c].max())
    for c in none_fill:
        features[c] = features[c].fillna('None')
    for c in mode_fill:
        features[c] = features[c].fillna(features[c].mode()[0])
    return features
features = handle_missing(features)
features.shape, features
# Preview only: this cast is not assigned back, so x keeps its float64 columns
x.select_dtypes(include=['float64']).astype('int64')
x = features.iloc[:len(y), :]
x_test = features.iloc[len(y):, :]
'Region' is strongly correlated with City, so Region is dropped; since the school scores are kept, 'Elementary School', 'Middle School' and 'High School' are dropped; 'Listed On' is dropped based on 1.5.2; 'Heating' and 'Cooling' are dropped for now because of their correlation with other features.
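One quick sanity check for the Region/City overlap before dropping Region (a sketch; City was label-encoded in 1.3.2, so the grouping is over its integer codes):
# If most City codes map to a single Region value, the two columns are largely redundant
print(features.groupby('City')['Region'].nunique().describe())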
selected=['Listed Price', 'Last Sold Price', 'Tax assessed value', 'Garage spaces','Bedrooms','Bathrooms','Full bathrooms',
'Total interior livable area', 'Year built', 'Lot', 'Annual tax amount','City','Zip','Elementary School Score','Middle School Score','High School Score',
'Elementary School Distance','Middle School Distance','High School Distance','park_garage','park_attached','park_detached','park_driveway','park_coverd','no_park', 'score sum', 'price per bedroom','price per bathroom','price per fullbath','total room','price per room']
x=x[selected]
x_test=x_test[selected+['Id']]
x.head(), x.select_dtypes(exclude=['int64','float64']).columns
############# float64 to int64
1.7 Modeling
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
X_train,X_val,y_train,y_val = train_test_split(x, y, test_size = 0.1,random_state = 23)
X_train.columns, X_train.shape, y_train.shape
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
from catboost import CatBoostRegressor, Pool

# No categorical feature indices are passed: all selected features are numeric at this point
category_features_ind = []
cat = CatBoostRegressor(iterations = 950,
                        learning_rate = 0.05,
                        depth = 10,
                        subsample = 0.7,
                        random_seed = 0,
                        cat_features = category_features_ind)
train_pool = Pool(X_train, y_train, cat_features = category_features_ind)
cat.fit(train_pool, verbose = 0)
cat_pred = cat.predict(X_val)
cat_score = rmse(y_val, cat_pred)
cat_score
feat_importances = pd.Series(cat.feature_importances_, index = cat.feature_names_).sort_values()
plt.figure(figsize=(16, 9))
plt.title('feature importance')
feat_importances.plot(kind = 'barh')
plt.show()
Test CSV Submission
# Predict with the same feature columns used for training (x_test still carries an extra Id column)
test_pred = cat.predict(x_test[selected])
submission = pd.DataFrame(test_id, columns = ['Id'])
test_pred = np.expm1(test_pred)
submission['Sold Price'] = test_pred
submission.to_csv("submission.csv", index = False, header = True)
test_pred, submission.head()