向后选择法
向后选择法是一种用于处理多元线性回归问题的变量选择方法
首先要设定一个阈值,即显著性水平(例如 0.05),用来与各变量回归系数的 p 值进行比较
每次去除 p 值最大(即最不显著)的变量并重新拟合模型,直到所有剩余变量的 p 值都不超过该阈值,或者去除变量后调整 R² 不再提高为止
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


# Backward elimination
def _backward_elimination_columns(x, y, flo):
    """Return the indices of the columns of *x* kept by backward elimination.

    Repeatedly fits an OLS model of *y* on the currently kept columns and
    drops the column with the largest p-value while that p-value exceeds
    *flo* and dropping it strictly increases the adjusted R-squared.

    Args:
        x: 2-D array of candidate features, one column per feature.
        y: 1-D array of target values, one per row of *x*.
        flo: Significance level; columns whose p-value is at most *flo*
            are never removed.

    Returns:
        A list of column indices of *x*, in their original order.
    """
    kept = list(range(x.shape[1]))
    if len(kept) < 2:
        # Nothing to eliminate; also avoids fitting a model without regressors.
        return kept

    # NOTE(review): sm.OLS does not add an intercept on its own; the design
    # matrix is used exactly as passed in, as in the original code.
    fit = sm.OLS(endog=y, exog=x[:, kept]).fit()
    while len(kept) > 1:
        p_values = np.asarray(fit.pvalues)
        worst = int(np.argmax(p_values))
        if p_values[worst] <= flo:
            # Every remaining column is significant at the requested level.
            break

        candidate = kept[:worst] + kept[worst + 1:]
        candidate_fit = sm.OLS(endog=y, exog=x[:, candidate]).fit()
        if candidate_fit.rsquared_adj <= fit.rsquared_adj:
            # Removing the column did not improve the adjusted R-squared:
            # roll back by simply keeping the current column set.
            break

        kept, fit = candidate, candidate_fit
    return kept


def backwardElimination(x_train, flo, y=None):
    """Select features by backward elimination on OLS p-values.

    Args:
        x_train: 2-D array of training features, one column per feature.
        flo: Significance level compared against the largest p-value.
        y: Optional 1-D array of training targets. When omitted, the
            module-level ``y_train`` created by the script below is used,
            which is what the original two-argument call relied on.

    Returns:
        A 2-D array holding the retained columns of *x_train* in their
        original order.
    """
    if y is None:
        y = y_train  # module-level target produced by train_test_split below
    x_train = np.asarray(x_train)
    return x_train[:, _backward_elimination_columns(x_train, y, flo)]


# Load the data set
data = pd.read_csv('house_data.csv')

# Q1: most frequent number of bedrooms
arr1 = data['bedrooms']
# np.atleast_1d keeps this working whether stats.mode returns an array or a
# scalar for 1-D input (this differs between SciPy releases).
print('Q1:', np.atleast_1d(stats.mode(arr1)[0]).tolist()[0])

# Q2: the three columns most strongly correlated with the price
arr2 = data[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
             'floors', 'waterfront', 'view', 'condition', 'grade',
             'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
             'zipcode', 'lat', 'long']]
arr2 = arr2.corr()
ans = arr2['price'].abs().sort_values(ascending=False)
# Position 0 is 'price' itself (correlation 1), so take positions 1..3.
print('Q2:', ans.iloc[1:4].index.tolist())

# Q3: scatter plot of the coordinates
arr3 = data[['lat', 'long']]
print('Q3')
arr3.plot(kind='scatter', x='lat', y='long')
plt.show()

# NOTE(review): the features are picked by position (every column from index
# 2 onwards). Confirm against the CSV layout that this slice does not contain
# the 'price' column itself, otherwise the target leaks into the features.
x_data = data.iloc[:, 2:].values
y_data = data['price'].values
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

# Run the elimination before fitting, and apply the same column selection to
# the test set, so the reported score belongs to the reduced model.
selected_columns = _backward_elimination_columns(x_train, y_train, 0.05)
x_train = x_train[:, selected_columns]
x_test = x_test[:, selected_columns]

# Create and train the linear regression model
clf = LinearRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print('score:', clf.score(x_test, y_test))