Linear Regression
Introduction to the Linear Regression Algorithm
- Solves regression problems
- Conceptually simple and easy to implement
- The foundation of many powerful nonlinear models
- Produces highly interpretable results
- Embodies many of the important ideas in machine learning
Deriving the Cost Function of Simple Linear Regression
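Simple linear regression fits the line $\hat{y}^{(i)} = a x^{(i)} + b$ by minimizing the sum of squared residuals. A sketch of the standard least-squares derivation (set both partial derivatives of the cost to zero and solve):

```latex
J(a,b) = \sum_{i=1}^{m} \left( y^{(i)} - a x^{(i)} - b \right)^2

\frac{\partial J}{\partial b} = 0
  \;\Rightarrow\; b = \bar{y} - a\,\bar{x}

\frac{\partial J}{\partial a} = 0
  \;\Rightarrow\; a = \frac{\sum_{i=1}^{m} \left( x^{(i)} - \bar{x} \right)\left( y^{(i)} - \bar{y} \right)}
                           {\sum_{i=1}^{m} \left( x^{(i)} - \bar{x} \right)^2}
```

These closed-form expressions for $a$ and $b$ are exactly what the code below computes.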
Implementing Simple Linear Regression
```python
import numpy as np
import matplotlib.pyplot as plt

x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 3, 2, 3, 5])

x_mean = np.mean(x)
y_mean = np.mean(y)

# Closed-form least-squares solution for the slope and intercept
a = np.sum((x - x_mean) * (y - y_mean)) / np.sum((x - x_mean) ** 2)
b = y_mean - a * x_mean

y_predict = a * x + b
print("a =", a, "\n", "b =", b)
print(y_predict)

plt.scatter(x, y)
plt.plot(x, y_predict)
```
Implementing Simple Linear Regression Ourselves
```python
class SimpleLinearRegression:
    def __init__(self):
        """Initialize the simple linear regression model."""
        self.a_ = None
        self.b_ = None

    def fit(self, trainX, trainY):
        """Train the model on the given data."""
        assert trainX.ndim == 1, \
            "SimpleLinearRegression only accepts 1-dimensional input"
        x_mean = np.mean(trainX)
        y_mean = np.mean(trainY)
        # Vectorized least-squares solution (np.dot instead of a Python loop)
        self.a_ = np.dot(trainX - x_mean, trainY - y_mean) / np.sum((trainX - x_mean) ** 2)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def _predict(self, test_x):
        """Return the prediction for a single sample."""
        return self.a_ * test_x + self.b_

    def predict(self, testX):
        """Return a vector of predictions for a dataset."""
        return np.array([self._predict(i) for i in testX])
```
```python
simpleLinearRegression = SimpleLinearRegression()
simpleLinearRegression.fit(x, y)
print(simpleLinearRegression.a_)
print(simpleLinearRegression.b_)
simpleLinearRegression.predict(x)
```
Metrics for Evaluating Linear Regression
Mean Squared Error (MSE)
Root Mean Squared Error (RMSE)
Mean Absolute Error (MAE)
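With $m$ test samples and predictions $\hat{y}^{(i)}$, the three metrics are defined as follows; RMSE simply restores the units of $y$ that MSE squares:

```latex
\mathrm{MSE}  = \frac{1}{m}\sum_{i=1}^{m}\left( y^{(i)} - \hat{y}^{(i)} \right)^2
\qquad
\mathrm{RMSE} = \sqrt{\mathrm{MSE}}
\qquad
\mathrm{MAE}  = \frac{1}{m}\sum_{i=1}^{m}\left| y^{(i)} - \hat{y}^{(i)} \right|
```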
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

# load_boston was removed in scikit-learn 1.2; this requires an older version
boston = datasets.load_boston()
x = boston.data[:, 5]   # use only the RM feature (average number of rooms)
y = boston.target

# Drop samples capped at the maximum target value of 50
x = x[y < 50]
y = y[y < 50]

plt.scatter(x, y)
plt.xlim(2, 10)
plt.ylim(0, 60)
```
```python
trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.2)

simpleLinearRegression = SimpleLinearRegression()
simpleLinearRegression.fit(trainX, trainY)
train_y = simpleLinearRegression.predict(trainX)
test_y = simpleLinearRegression.predict(testX)

# Fitted line on the training set (left) and the test set (right)
plt.subplot(1, 2, 1)
plt.scatter(trainX, trainY)
plt.plot(trainX, train_y)
plt.subplot(1, 2, 2)
plt.scatter(testX, testY)
plt.plot(testX, test_y)
```
MSE
```python
mse = np.sum((test_y - testY) ** 2) / test_y.shape[0]
mse
>>>40.95154236464736
```
RMSE
```python
rmse = np.sqrt(mse)
rmse
>>>6.39933921312563
```
MAE
```python
mae = np.sum(np.abs(test_y - testY)) / test_y.shape[0]
mae
>>>4.399066068730565
```
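sklearn ships the same metrics; a quick cross-check against the hand-computed values (a minimal sketch assuming `testY` and `test_y` from above are still in scope):

```python
from sklearn.metrics import mean_squared_error, mean_absolute_error

# These should reproduce the hand-computed mse and mae above
print(mean_squared_error(testY, test_y))
print(mean_absolute_error(testY, test_y))
```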
R Squared
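R Squared compares the model's squared error against a baseline that always predicts the mean $\bar{y}$, which is why it can be computed as one minus MSE over the variance of $y$:

```latex
R^2 = 1 - \frac{\sum_{i}\left( y^{(i)} - \hat{y}^{(i)} \right)^2}
               {\sum_{i}\left( y^{(i)} - \bar{y} \right)^2}
    = 1 - \frac{\mathrm{MSE}}{\mathrm{Var}(y)}
```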
```python
r = 1 - mse / np.var(testY)
r
>>>0.2541624531120106

from sklearn.metrics import r2_score
r2_score(testY, test_y)
>>>0.2541624531120107
```
Multiple Linear Regression
Advantage: no normalization (feature scaling) of the data is required
Disadvantage: high time complexity (the normal equation inverts a matrix, which is roughly cubic in the number of features)
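After prepending a constant column, $X_b = [\,\mathbf{1}\;\; X\,]$, the model predicts $\hat{y} = X_b \theta$, and minimizing the squared error gives the closed-form normal equation that the class below implements:

```latex
\theta = \left( X_b^{\top} X_b \right)^{-1} X_b^{\top} y
```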
Implementing the Normal Equation Method (LinearRegressor) Ourselves
```python
from sklearn.metrics import r2_score

class LinearRegressor:
    def __init__(self):
        self.coef_ = None       # weight vector (theta_1 .. theta_m)
        self.intercept_ = None  # intercept (theta_0)
        self._theta = None

    def fit(self, trainX, trainY):
        # Prepend a column of ones so the intercept is absorbed into theta
        x = np.hstack((np.ones((trainX.shape[0], 1)), trainX))
        # Normal equation: theta = (X^T X)^{-1} X^T y
        self._theta = np.linalg.inv(x.T.dot(x)).dot(x.T).dot(trainY)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, testX):
        x = np.hstack((np.ones((testX.shape[0], 1)), testX))
        return np.dot(x, self._theta)

    def score(self, testX, testY):
        return r2_score(testY, self.predict(testX))
```
```python
# Reload all 13 features of the Boston dataset for the multivariate model
boston = datasets.load_boston()
x = boston.data
y = boston.target
x = x[y < 50]
y = y[y < 50]

trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.2, random_state=666)

lin_reg = LinearRegressor()
lin_reg.fit(trainX, trainY)
print(lin_reg.predict(testX).shape)
print(lin_reg.coef_)
print(lin_reg.intercept_)
print(lin_reg.score(testX, testY))
>>>(98,)
>>>[-1.18919477e-01 3.63991462e-02 -3.56494193e-02 5.66737830e-02 -1.16195486e+01 3.42022185e+00 -2.31470282e-02 -1.19509560e+00 2.59339091e-01 -1.40112724e-02 -8.36521175e-01 7.92283639e-03 -3.81966137e-01]
>>>34.16143549624022
>>>0.8129802602658537
```
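A side note on the implementation: explicitly inverting $X_b^{\top} X_b$ is slower and less numerically stable than letting the linear-algebra routines solve the least-squares problem directly. A sketch of an alternative `fit` (hypothetical, not part of the original notes) built on `np.linalg.lstsq`:

```python
def fit_lstsq(self, trainX, trainY):
    """Solve the least-squares problem directly instead of inverting X^T X."""
    x = np.hstack((np.ones((trainX.shape[0], 1)), trainX))
    # lstsq returns (solution, residuals, rank, singular values)
    self._theta, *_ = np.linalg.lstsq(x, trainY, rcond=None)
    self.intercept_ = self._theta[0]
    self.coef_ = self._theta[1:]
    return self
```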
Using LinearRegression from sklearn
```python
boston = datasets.load_boston()
x = boston.data
y = boston.target
x = x[y < 50]
y = y[y < 50]
x.shape
>>>(490, 13)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.2, random_state=666)
lin_reg = LinearRegression()
lin_reg.fit(trainX, trainY)
print(lin_reg.predict(testX).shape)
print(lin_reg.coef_)
print(lin_reg.intercept_)
print(lin_reg.score(testX, testY))
>>>(98,)
>>>[-1.18919477e-01 3.63991462e-02 -3.56494193e-02 5.66737830e-02 -1.16195486e+01 3.42022185e+00 -2.31470282e-02 -1.19509560e+00 2.59339091e-01 -1.40112724e-02 -8.36521175e-01 7.92283639e-03 -3.81966137e-01]
>>>34.16143549624665
>>>0.8129802602658495
```
KNN Regressor
```python
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

kneighborsRegressor = KNeighborsRegressor()
kneighborsRegressor.fit(trainX, trainY)
y_predict = kneighborsRegressor.predict(testX)
r2_score(testY, y_predict)
>>>0.5865412198300899
```
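With default hyperparameters, KNN scores well below the linear model on this data. Before drawing conclusions it is worth tuning it; a sketch using sklearn's GridSearchCV (the parameter grid is an illustrative choice, not from the original notes):

```python
from sklearn.model_selection import GridSearchCV

param_grid = [
    {"weights": ["uniform"], "n_neighbors": list(range(1, 11))},
    {"weights": ["distance"], "n_neighbors": list(range(1, 11)), "p": list(range(1, 6))},
]
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1)
grid_search.fit(trainX, trainY)
print(grid_search.best_params_)
print(grid_search.best_estimator_.score(testX, testY))
```

Even a tuned KNN, however, offers none of the coefficient-level interpretability discussed next.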
More Discussion of the Linear Regression Model
- Highly interpretable: the sign of each coefficient tells you whether a feature is positively or negatively correlated with the target, and its magnitude indicates how strong the influence is, which in turn lets us collect data in a more targeted way (see the sketch after this list)
- Makes an assumption about the data: linearity
- When you get a new dataset, trying linear regression first never hurts
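To make that interpretability concrete, a minimal sketch (assuming `lin_reg` and `boston` from the multivariate example above are still in scope) that ranks the Boston features by their fitted coefficients:

```python
import numpy as np

# Sort features from the most negative to the most positive coefficient;
# with the coefficients printed above, NOX ends up most negative and RM most positive
order = np.argsort(lin_reg.coef_)
print(boston.feature_names[order])
```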