线性回归之房价预测

说明:这是较早之前参考网上的例子写的,代码非原创。

import warnings
import pandas as pd
import numpy as np
from pylab import plot
from sklearn import preprocessing



import matplotlib.pyplot as plt

# Load the Nashville housing data set.
# NOTE: the path is machine-specific; point it at a local copy of the CSV.
housing_df = pd.read_csv('/Users/yfl/Downloads/Nashville_housing_data_20133.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line.
housing_df.info()

# Scatter plot of "Acreage" against "Sale Price".
housing_df.plot(kind="scatter", x="Acreage", y="Sale Price")
# Restrict the X and Y axis ranges: acreage 0-40, sale price 1M-10M.
plt.xlim(0, 40)
plt.ylim(1000000, 10000000)
# The figure is written to disk rather than shown interactively.
plt.savefig("Acreage-Sale Price.png")

# Columns used as model inputs; "Sale Price" is the regression target.
feature_col = ["Acreage", "Land Value", "Building Value", "Grade", "Bedrooms"]
enc = preprocessing.LabelEncoder()
# Replace the Grade values with integer codes so the column is numeric.
housing_df['Grade'] = housing_df['Grade'].factorize()[0]

# Selecting with a list of columns yields DataFrames (index plus values).
X_df = housing_df[feature_col]
Y_df = housing_df[['Sale Price']]
print("Grade:", X_df.head())

# Work on the underlying NumPy arrays from here on.
X = X_df.values
y = Y_df.values

# NOTE(review): this silences every warning for the rest of the script,
# including deprecation warnings from the libraries in use.
warnings.filterwarnings('ignore')

# Position of the Grade column inside X, derived from feature_col instead of
# being hard-coded.
grade_idx = feature_col.index("Grade")

print("Grade(等级) 编码以前:", X[:5, [grade_idx]])
# LabelEncoder works on a 1-D array of labels, so pass the column itself
# rather than an (n, 1) column vector; the result can then be assigned back
# without any reshaping.
X[:, grade_idx] = enc.fit_transform(X[:, grade_idx])
print("Grade(等级) 编码以后:", X[:5, [grade_idx]])


# Mean-imputation of missing values, column by column.
# preprocessing.Imputer no longer exists in current scikit-learn releases;
# sklearn.impute.SimpleImputer is its replacement (it always works per column
# and takes np.nan rather than the string 'NaN'). Fall back to the legacy
# class only on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fill missing feature values with the mean of their column.
print('before ', X[:5])
X = imp.fit_transform(X)
print('after ', X[:5])

# Re-fit the same imputer on the target and fill its missing values too.
print("Y 缺省值填充之前", y[:5])
y = imp.fit_transform(y)
print("Y 缺省值填充之后", y[:5])

# Rescale every feature column to the [0, 1] range (MinMaxScaler default).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the scaled training features.
min_max_scaler = preprocessing.MinMaxScaler()
print("X 归一化之前:", X[:5])
X = min_max_scaler.fit_transform(X)
print("X 归一化之后:", X[:5])

# Split the data into a training set and a test set.
# Common strategies: hold-out, cross-validation, bootstrap sampling.
# Hold-out is used here with train:test = 7:3.
# The sklearn.cross_validation module no longer exists in current
# scikit-learn releases; train_test_split lives in sklearn.model_selection.
# The legacy import is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state is fixed so the split is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=0)

print('There are {} samples in the training set and {} samples in the test set'.format(
    x_train.shape[0], x_test.shape[0]))

# Linear regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression

# Fit on the training split and predict the held-out test split.
lr = LinearRegression()
lr.fit(x_train, y_train)
y_test_prd = lr.predict(x_test)

# Feature re-selection
# intercept_: independent term in the linear model.
# coef_: estimated coefficients for the linear regression problem.
# coef_ indicates how much each feature (column of X) contributes to the
# target (Y): the larger the value, the more important the feature.
# Unimportant features can be dropped to reduce the overall model error.
print('x_test', x_test[:5])
print("intercept_:", lr.intercept_)
print("coef_:", lr.coef_)
print("who is the most important:")
# y is 2-D, so coef_ has one row per target; row 0 holds the per-feature
# coefficients. zip() is lazy on Python 3, so materialise it before printing,
# otherwise only "<zip object ...>" is shown.
print(list(zip(feature_col, lr.coef_[0])))
print("thie first 5 rows of y_test_prd:")
print(y_test_prd[:5])
print("thie first 5 rows of y_test:")
print(y_test[:5])

# Evaluate the regression model.
# Common metrics: mean absolute error, mean squared error, root mean squared
# error, and so on. Root mean squared error (RMSE) is used here.
from sklearn import metrics

# RMSE is the square root of the mean squared error between the true and the
# predicted test targets.
mse = metrics.mean_squared_error(y_test, y_test_prd)
print("均方根误差(RMSE):", np.sqrt(mse))

 

posted @ 2017-11-25 21:59  Fall12  阅读(1009)  评论(0)  编辑  收藏  举报