向后选择法

向后选择法是一种用于处理多元线性回归问题的变量选择方法

首先要设定一个阈值,即显著性水平(例如 0.05),用来判断每个变量的 p 值是否足够显著

每次拟合后去除 p 值最大(即最不显著)的变量,然后重新拟合,直到所有剩余变量的 p 值都不超过该阈值

import numpy as np
import pandas as pd 
import seaborn as sns 
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
# Backward elimination
def backwardElimination(x_train, flo, y=None):
    """Select regressors by backward elimination on OLS p-values.

    An ordinary least squares model is fitted with ``sm.OLS`` on the current
    columns. While the largest p-value is above ``flo``, the column that owns
    it is removed and the model is refitted. Elimination stops early when
    removing that column would not raise the adjusted R-squared, in which
    case the column is kept.

    Args:
        x_train: 2-D array-like of regressors, one column per variable. It is
            passed to ``sm.OLS`` as given; no constant column is added here.
        flo: p-value threshold; columns whose p-value is at most ``flo`` are
            never removed.
        y: response values for the rows of ``x_train``. When omitted, the
            module-level ``y_train`` is used, which is what this function
            always read before the parameter existed.

    Returns:
        A 2-D ``numpy.ndarray`` holding the surviving columns in their
        original relative order. At least one column is always kept.

    Raises:
        ValueError: if ``x_train`` is not two-dimensional.
    """
    if y is None:
        # Backward-compatible fallback for callers that rely on the global.
        y = y_train

    x = np.asarray(x_train)
    if x.ndim != 2:
        raise ValueError('x_train must be a 2-D array')
    if x.shape[1] < 2:
        # Nothing can be removed without leaving an empty design matrix.
        return x

    fitted = sm.OLS(endog=y, exog=x).fit()
    while x.shape[1] > 1:
        pvalues = np.asarray(fitted.pvalues)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= flo:
            # Every remaining variable meets the significance threshold.
            break

        reduced = np.delete(x, worst, axis=1)
        refit = sm.OLS(endog=y, exog=reduced).fit()
        if refit.rsquared_adj <= fitted.rsquared_adj:
            # Dropping the column does not improve the fit: keep it and stop.
            break

        # Reuse the refit as the starting point of the next round.
        x, fitted = reduced, refit

    return x

# Load the data set
data = pd.read_csv('house_data.csv')

# Q1: most frequent number of bedrooms.
# Series.mode() replaces stats.mode(arr1)[0].tolist()[0]: on recent SciPy
# releases (1.11+) stats.mode returns a scalar for 1-D input, so the trailing
# [0] raises TypeError. Both pick the smallest value when modes are tied.
arr1 = data['bedrooms']
print('Q1:', arr1.mode().iloc[0])

# Q2: the three columns most strongly correlated (in absolute value) with price
arr2 = data[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
             'floors', 'waterfront', 'view', 'condition', 'grade',
             'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
             'zipcode', 'lat', 'long']]
arr2 = arr2.corr()
# Drop price explicitly instead of assuming it sorts into position 0.
ans = abs(arr2['price']).drop('price').sort_values(ascending=False)
print('Q2:', ans.head(3).index.tolist())

# Q3: scatter plot of the house coordinates
arr3 = data[['lat', 'long']]
print('Q3')
arr3.plot(kind='scatter', x='lat', y='long')
plt.show()

# Build the feature matrix and the target.
# NOTE(review): iloc[:, 2:] appears to still contain 'price' (the Q2 column
# list starts with it) - confirm against the CSV. It is dropped here so the
# model is not trained on its own target; errors='ignore' keeps this a no-op
# if the column is not in the slice.
x_data = data.iloc[:, 2:].drop(columns=['price'], errors='ignore').values
y_data = data['price'].values
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)  # train/test split

# Feature selection. backwardElimination returns only the reduced training
# matrix, so the surviving column positions are recovered by comparing its
# columns with the original ones; the same positions are then applied to the
# test set.
x_selected = backwardElimination(x_train, 0.05)
kept = [k for k in range(x_train.shape[1])
        if any(np.array_equal(x_train[:, k], col) for col in x_selected.T)]
if not kept:
    # No column could be matched back: fall back to using every feature.
    kept = list(range(x_train.shape[1]))
x_train = x_train[:, kept]
x_test = x_test[:, kept]

# Fit the linear regression on the selected features (previously the model
# was fitted before the selection, so the selection had no effect on it).
clf = LinearRegression()
clf.fit(x_train, y_train)

# Evaluate on the held-out data
y_pred = clf.predict(x_test)
print('score:', clf.score(x_test, y_test))

  

posted @ 2020-08-30 11:25  古城独钓  阅读(622)  评论(0)  编辑  收藏  举报