代码改变世界

机器学习入门之房价预测(线性回归)

2018-10-20 10:02  木丨头人  阅读(2591)  评论(0)  编辑  收藏  举报
#!/usr/bin/env python
# coding: utf-8

# In[1]:


# 1.定义问题

# 2.导入数据

# 导入类库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
# Enable inline matplotlib rendering when running under IPython/Jupyter.
# The shebang marks this file as runnable with plain Python as well, where
# `get_ipython` is not defined; skip the magic there instead of crashing
# with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Do not truncate wide DataFrames: display every column.
pd.options.display.max_columns = None

# Read the training set and the test set from their CSV files, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)


# In[2]:


# 3. Understand the data
# Summary of the frame (columns, dtypes, non-null counts).
train_data.info()


# In[3]:


# Dimensions of the data as (rows, columns).
train_data.shape


# In[4]:


# First five rows (five is the default for head()).
train_data.head()


# In[5]:


# Descriptive statistics, transposed so that each described column is a row.
train_data.describe().transpose()


# In[6]:


# 4. Data visualisation

# Analyse SalePrice.
# Print the summary statistics explicitly: a bare expression that is not the
# last statement of a cell is evaluated and then silently discarded.
print(train_data['SalePrice'].describe())
# Plot the distribution of the target.
# NOTE(review): distplot is deprecated in newer seaborn releases; switch to
# histplot/displot once the minimum seaborn version is confirmed.
sns.distplot(train_data['SalePrice'])
plt.show()


# In[7]:


# Correlation matrix.
# Restrict to numeric columns explicitly: recent pandas versions no longer
# drop non-numeric columns silently, so DataFrame.corr() fails on a frame
# that still contains string columns.
corr = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(corr, vmax=1, vmin=-1, square=True, ax=ax)


# In[8]:


# Feature selection.
# Keep the rows of the correlation matrix whose correlation with SalePrice has
# an absolute value greater than 0.5. The absolute value has to be taken of
# the correlation, not of the threshold: abs(0.5) is just 0.5, which would
# drop features that are strongly *negatively* correlated.
# The matrix is computed once here (numeric columns only) so this cell does
# not depend on an earlier one.
corr = train_data.select_dtypes(include=[np.number]).corr()
corr[corr['SalePrice'].abs() > 0.5]


# In[9]:


# Selected predictor columns, followed by the target column (SalePrice) last.
cols = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'FullBath',
    'TotRmsAbvGrd',
    'GarageCars',
    'GarageArea',
    'SalePrice',
]
# Keep only those columns, then show the summary of the reduced frame.
train_data = train_data.loc[:, cols]
train_data.info()


# In[10]:


# 5. Build the model

# Split the array into features (first ten columns) and target (column 10).
X, Y = train_data.values[:, :10], train_data.values[:, 10]
# Hold out 33% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression().fit(X_train, Y_train)
# Predict the held-out rows and report the mean absolute error.
y_pred = model.predict(X_test)
mean_abs_error = np.sum(np.abs(y_pred - Y_test) / len(y_pred))
print('cost:' + str(mean_abs_error))


# In[11]:


# The cost on the raw data is large, so standardise features and target and
# fit a second model on the scaled data.
# NOTE(review): both scalers are fitted on the full data set before the
# train/test split, so statistics of the held-out rows leak into the scaling.
X_scaled = StandardScaler().fit_transform(X)
# StandardScaler expects a 2-D array, hence the reshape to a single column.
Y_scaled = StandardScaler().fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42
)
model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)
# Predict with the model that was fitted on the scaled data. Predicting with
# the raw-data `model` here fed standardised features to a model trained on
# unscaled values (and compared its 1-D output against the 2-D scaled target),
# which is why the error appeared to grow after scaling.
y_pred = model_scaled.predict(X_scaled_test)
# This cost is expressed in units of the standardised target, so it is not
# directly comparable with the cost printed for the raw-data model.
print('cost:' + str(np.sum(abs(y_pred - Y_scaled_test) / len(y_pred))))


# In[12]:


# Add a placeholder target column so the test set can be sliced with the same
# column list as the training set.
test_data['SalePrice'] = None
# Take an explicit copy: the subset is modified below, and writing into a
# slice of another frame is unreliable (SettingWithCopyWarning).
test_data = test_data[cols].copy()
# Fill missing values with each column's median. Assign the result back
# instead of calling fillna(inplace=True) on a selected column: that operates
# on a temporary object and does not update the frame under copy-on-write.
for column in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[column] = test_data[column].fillna(test_data[column].median())


# In[13]:


# Features are the first ten columns; the remaining column is the SalePrice
# placeholder that gets overwritten below.
X = test_data.values[:, :10]
# Predict with the model fitted on the raw (unscaled) training data.
y_test_pre = model.predict(X)
# Store the predictions in the SalePrice column and preview the first ten rows.
test_data['SalePrice'] = y_test_pre
test_data.head(n=10)