线性回归之房价预测
说明:这是较早之前参考网上的例子编写的,代码非原创。
# Linear-regression demo: predict "Sale Price" from five columns of the
# Nashville housing CSV, then report the learned coefficients and the RMSE
# on a 30% hold-out set.
#
# Pipeline: load CSV -> scatter plot -> encode "Grade" -> mean-impute
# missing values -> min-max scale -> 70/30 split -> fit -> evaluate.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import plot
from sklearn import metrics
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Location of the input data set; adjust for the local machine.
DATA_PATH = '/Users/yfl/Downloads/Nashville_housing_data_20133.csv'

housing_df = pd.read_csv(DATA_PATH)
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than wrapped in print().
housing_df.info()

# Scatter plot of acreage against sale price, saved to disk.
housing_df.plot(kind="scatter", x="Acreage", y="Sale Price")
# Restrict the visible X / Y axis ranges.
plt.xlim(0, 40)
plt.ylim(1000000, 10000000)
plt.savefig("Acreage-Sale Price.png")

feature_col = ["Acreage", "Land Value", "Building Value", "Grade", "Bedrooms"]
enc = preprocessing.LabelEncoder()
# factorize() turns the Grade labels into integer codes (missing -> -1).
housing_df['Grade'] = housing_df['Grade'].factorize()[0]

# DataFrame slices (they still carry the index as well as the values).
X_df = housing_df[feature_col]
Y_df = housing_df[['Sale Price']]
print("Grade:", X_df.head())

# Plain NumPy arrays for scikit-learn; y keeps shape (n_samples, 1).
# NOTE(review): assumes every feature column is numeric at this point,
# so that X is a float array -- confirm against the CSV.
X = X_df.values
y = Y_df.values

# Silence library warnings for the remaining steps (as in the original).
warnings.filterwarnings('ignore')

# Re-encode the Grade column (index 3 in feature_col) to 0..n_classes-1.
# LabelEncoder expects 1-D input, so the column is passed as X[:, 3].
print("Grade(等级) 编码以前:", X[:5, [3]])
X[:, 3] = enc.fit_transform(X[:, 3])
print("Grade(等级) 编码以后:", X[:5, [3]])

# Replace missing values with the per-column mean.
# SimpleImputer supersedes preprocessing.Imputer (removed in sklearn 0.22).
# NOTE(review): the imputer and the scaler below are fitted on the full
# data set before the train/test split, which leaks test-set statistics
# into training; the original order is kept so results stay comparable.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
print('before ', X[:5])
X = imp.fit_transform(X)
print('after ', X[:5])

# NOTE(review): missing target values are also mean-filled; dropping those
# rows may be preferable -- confirm intent.
imp.fit(y)
print("Y 缺省值填充之前", y[:5])
y = imp.transform(y)
print("Y 缺省值填充之后", y[:5])

# Scale every feature into [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
print("X 归一化之前:", X[:5])
X = min_max_scaler.fit_transform(X)
print("X 归一化之后:", X[:5])

# Split into training and test sets.
# Common strategies: hold-out, cross-validation, bootstrap sampling.
# Hold-out is used here with train:test = 7:3.
# train_test_split lives in sklearn.model_selection
# (sklearn.cross_validation was removed in sklearn 0.20).
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=0)
print('There are {} samples in the training set and {} samples in the test set'.format(
    x_train.shape[0], x_test.shape[0]))

# Linear regression.
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
lr = LinearRegression()
lr.fit(x_train, y_train)
y_test_prd = lr.predict(x_test)

# Feature re-selection.
# intercept_: independent term in the linear model.
# coef_: estimated coefficients for the linear regression problem.
# coef_ indicates how strongly each (scaled) feature influences the target;
# a larger value suggests a more important feature, and unimportant
# features can be removed to reduce the overall model error.
print('x_test', x_test[:5])
print("intercept_:", lr.intercept_)
print("coef_:", lr.coef_)
print("who is the most important:")
# zip() is lazy on Python 3; materialise it so the pairs are printed.
# y is 2-D, so coef_ has shape (1, n_features) and row 0 holds the weights.
print(list(zip(feature_col, lr.coef_[0])))
print("thie first 5 rows of y_test_prd:")
print(y_test_prd[:5])
print("thie first 5 rows of y_test:")
print(y_test[:5])

# Evaluate the regression model.
# Common metrics: mean absolute error, mean squared error, root mean
# squared error, etc. RMSE is used here.
print("均方根误差(RMSE):", np.sqrt(metrics.mean_squared_error(y_test, y_test_prd)))