Linear Regression and Logistic Regression (LR) Code
# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import axes3d

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

Getting familiar with numpy

In [2]: def warmUpExercise():
            return(np.identity(5))

In [3]: warmUpExercise()
Out[3]: array([[ 1.,  0.,  0.,  0.,  0.],
               [ 0.,  1.,  0.,  0.,  0.],
               [ 0.,  0.,  1.,  0.,  0.],
               [ 0.,  0.,  0.,  1.,  0.],
               [ 0.,  0.,  0.,  0.,  1.]])

Univariate linear regression

In [4]: data = np.loadtxt('linear_regression_data1.txt', delimiter=',')
        X = np.c_[np.ones(data.shape[0]), data[:,0]]
        y = np.c_[data[:,1]]

In [5]: plt.scatter(X[:,1], y, s=30, c='r', marker='x', linewidths=1)
        plt.xlim(4,24)
        plt.xlabel('Population of City in 10,000s')
        plt.ylabel('Profit in $10,000s');

Gradient descent

In [31]: # Compute the cost function
         def computeCost(X, y, theta=[[0],[0]]):
             m = y.size
             J = 0
             h = X.dot(theta)
             J = 1.0/(2*m)*(np.sum(np.square(h-y)))
             return J

In [32]: computeCost(X,y)
Out[32]: 32.072733877455676

In [3]: # Gradient descent
        def gradientDescent(X, y, theta=[[0],[0]], alpha=0.01, num_iters=1500):
            m = y.size
            J_history = np.zeros(num_iters)
            for iter in np.arange(num_iters):
                h = X.dot(theta)
                theta = theta - alpha*(1.0/m)*(X.T.dot(h-y))
                J_history[iter] = computeCost(X, y, theta)
            return(theta, J_history)

In [2]: # Plot the cost at every iteration
        theta, Cost_J = gradientDescent(X, y)
        print('theta: ', theta.ravel())

        plt.plot(Cost_J)
        plt.ylabel('Cost J')
        plt.xlabel('Iterations');
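How fast this curve drops depends heavily on the learning rate. A quick way to see that is to rerun the same gradientDescent with a few different values of alpha and overlay the cost curves; the sketch below reuses the X, y and gradientDescent defined above, and the alpha values are illustrative choices, not from the original notebook.

# Sketch: compare convergence for a few illustrative learning rates (assumed values)
for alpha in (0.001, 0.005, 0.01):
    _, J_hist = gradientDescent(X, y, alpha=alpha)
    plt.plot(J_hist, label='alpha = {}'.format(alpha))
plt.ylabel('Cost J')
plt.xlabel('Iterations')
plt.legend();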
In [38]: xx = np.arange(5,23)
         yy = theta[0] + theta[1]*xx

         # Plot the fit found by our own gradient descent
         plt.scatter(X[:,1], y, s=30, c='r', marker='x', linewidths=1)
         plt.plot(xx, yy, label='Linear regression (Gradient descent)')

         # Compare against scikit-learn's linear regression
         regr = LinearRegression()
         regr.fit(X[:,1].reshape(-1,1), y.ravel())
         plt.plot(xx, regr.intercept_ + regr.coef_*xx, label='Linear regression (Scikit-learn GLM)')

         plt.xlim(4,24)
         plt.xlabel('Population of City in 10,000s')
         plt.ylabel('Profit in $10,000s')
         plt.legend(loc=4);

In [39]: # Predict profit for cities with populations of 35,000 and 70,000
         print(theta.T.dot([1, 3.5])*10000)
         print(theta.T.dot([1, 7])*10000)
[ 4519.7678677]
[ 45342.45012945]
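Since this is ordinary least squares, the gradient-descent solution can also be cross-checked against the closed-form normal equation $\theta = (X^{T}X)^{-1}X^{T}y$. Below is a minimal sketch assuming the same X (with the bias column) and y built above; the result should be close to the theta found by gradientDescent and to the scikit-learn fit.

# Sketch: closed-form least squares via the normal equation (pinv guards against ill-conditioning)
theta_closed = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)
print('theta (normal equation): ', theta_closed.ravel())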
Logistic Regression Example
Logistic regression
Regularized logistic regression

In [1]: # %load ../../standard_import.txt
        import pandas as pd
        import numpy as np
        import matplotlib as mpl
        import matplotlib.pyplot as plt
        from scipy.optimize import minimize
        from sklearn.preprocessing import PolynomialFeatures

        pd.set_option('display.notebook_repr_html', False)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', 150)
        pd.set_option('display.max_seq_items', None)
        #%config InlineBackend.figure_formats = {'pdf',}
        %matplotlib inline

        import seaborn as sns
        sns.set_context('notebook')
        sns.set_style('white')

In [11]: def loaddata(file, delimeter):
             data = np.loadtxt(file, delimiter=delimeter)
             print('Dimensions: ', data.shape)
             print(data[1:6,:])
             return(data)

In [12]: def plotData(data, label_x, label_y, label_pos, label_neg, axes=None):
             # Get the indices of the positive and negative examples
             neg = data[:,2] == 0
             pos = data[:,2] == 1

             if axes == None:
                 axes = plt.gca()
             axes.scatter(data[pos][:,0], data[pos][:,1], marker='+', c='k', s=60, linewidth=2, label=label_pos)
             axes.scatter(data[neg][:,0], data[neg][:,1], c='y', s=60, label=label_neg)
             axes.set_xlabel(label_x)
             axes.set_ylabel(label_y)
             axes.legend(frameon=True, fancybox=True);

Logistic regression

In [13]: data = loaddata('data1.txt', ',')
('Dimensions: ', (100, 3))
[[ 30.28671077  43.89499752   0.        ]
 [ 35.84740877  72.90219803   0.        ]
 [ 60.18259939  86.3085521    1.        ]
 [ 79.03273605  75.34437644   1.        ]
 [ 45.08327748  56.31637178   0.        ]]
In [14]: X = np.c_[np.ones((data.shape[0],1)), data[:,0:2]]
         y = np.c_[data[:,2]]

In [15]: plotData(data, 'Exam 1 score', 'Exam 2 score', 'Pass', 'Fail')

The logistic regression hypothesis is
$$h_\theta(x) = g(\theta^{T}x), \qquad g(z) = \frac{1}{1+e^{-z}}$$

In [16]: # Define the sigmoid function
         def sigmoid(z):
             return(1 / (1 + np.exp(-z)))

scipy actually ships a function that does exactly the same thing:
http://docs.scipy.org/doc/scipy/reference/generated/scipy.special.expit.html#scipy.special.expit

Cost function
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right) - (1-y^{(i)})\log\left(1-h_\theta(x^{(i)})\right)\right]$$

Vectorized cost function (matrix form)
$$J(\theta) = -\frac{1}{m}\left(\log\left(g(X\theta)\right)^{T}y + \log\left(1-g(X\theta)\right)^{T}(1-y)\right)$$

In [20]: # Define the cost function
         def costFunction(theta, X, y):
             m = y.size
             h = sigmoid(X.dot(theta))
             J = -1.0*(1.0/m)*(np.log(h).T.dot(y) + np.log(1-h).T.dot(1-y))
             if np.isnan(J[0]):
                 return(np.inf)
             return J[0]

Partial derivatives (gradient)
$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x^{(i)}_j$$

Vectorized gradient
$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}X^{T}\left(g(X\theta) - y\right)$$

In [18]: # Compute the gradient
         def gradient(theta, X, y):
             m = y.size
             h = sigmoid(X.dot(theta.reshape(-1,1)))
             grad = (1.0/m)*X.T.dot(h-y)
             return(grad.flatten())

In [19]: initial_theta = np.zeros(X.shape[1])
         cost = costFunction(initial_theta, X, y)
         grad = gradient(initial_theta, X, y)
         print('Cost: \n', cost)
         print('Grad: \n', grad)
('Cost: \n', 0.69314718055994518)
('Grad: \n', array([ -0.1       , -12.00921659, -11.26284221]))

Minimizing the cost function

In [22]: res = minimize(costFunction, initial_theta, args=(X,y), jac=gradient, options={'maxiter':400})
         res
Out[22]:   status: 0
  success: True
     njev: 28
     nfev: 28
 hess_inv: array([[  3.24739469e+03,  -2.59380769e+01,  -2.63469561e+01],
        [ -2.59380769e+01,   2.21449124e-01,   1.97772068e-01],
        [ -2.63469561e+01,   1.97772068e-01,   2.29018831e-01]])
      fun: 0.20349770158944075
        x: array([-25.16133401,   0.20623172,   0.2014716 ])
  message: 'Optimization terminated successfully.'
      jac: array([ -2.73305312e-10,   1.43144026e-07,  -1.58965802e-07])
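As a sanity check on the hand-rolled optimization, roughly the same fit can be reproduced with scikit-learn's LogisticRegression. This is a minimal sketch under the assumption that X and y are the arrays built above; the very large C value is an illustrative choice to make scikit-learn's built-in L2 regularization negligible so that the coefficients are comparable to res.x.

# Sketch: cross-check against scikit-learn (large C ~ effectively unregularized)
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1e6)
clf.fit(X[:,1:], y.ravel())   # drop our manually added bias column; sklearn fits its own intercept
print('sklearn intercept:', clf.intercept_)
print('sklearn coefficients:', clf.coef_)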
Making predictions

In [23]: def predict(theta, X, threshold=0.5):
             p = sigmoid(X.dot(theta.T)) >= threshold
             return(p.astype('int'))

Let's see how likely a student with an Exam 1 score of 45 and an Exam 2 score of 85 is to pass:

In [24]: sigmoid(np.array([1, 45, 85]).dot(res.x.T))
Out[24]: 0.77629066133254787

Plotting the decision boundary

In [25]: plt.scatter(45, 85, s=60, c='r', marker='v', label='(45, 85)')
         plotData(data, 'Exam 1 score', 'Exam 2 score', 'Admitted', 'Not admitted')

         x1_min, x1_max = X[:,1].min(), X[:,1].max(),
         x2_min, x2_max = X[:,2].min(), X[:,2].max(),
         xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
         h = sigmoid(np.c_[np.ones((xx1.ravel().shape[0],1)), xx1.ravel(), xx2.ravel()].dot(res.x))
         h = h.reshape(xx1.shape)
         plt.contour(xx1, xx2, h, [0.5], linewidths=1, colors='b');

Regularized logistic regression

In [1]: data2 = loaddata('data2.txt', ',')

In [42]: # Build X and y
         y = np.c_[data2[:,2]]
         X = data2[:,0:2]

In [43]: # Plot the data
         plotData(data2, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0')

Let's build some polynomial features (up to degree 6):

In [44]: poly = PolynomialFeatures(6)
         XX = poly.fit_transform(data2[:,0:2])
         # Check the shape (how many dimensions x has after the feature mapping)
         XX.shape
Out[44]: (118, 28)

Regularized cost function
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right) - (1-y^{(i)})\log\left(1-h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

Vectorized cost function (matrix form)
$$J(\theta) = -\frac{1}{m}\left(\log\left(g(X\theta)\right)^{T}y + \log\left(1-g(X\theta)\right)^{T}(1-y)\right) + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

In [45]: # Define the regularized cost function
         def costFunctionReg(theta, reg, *args):
             m = y.size
             h = sigmoid(XX.dot(theta))
             J = -1.0*(1.0/m)*(np.log(h).T.dot(y) + np.log(1-h).T.dot(1-y)) + (reg/(2.0*m))*np.sum(np.square(theta[1:]))
             if np.isnan(J[0]):
                 return(np.inf)
             return(J[0])

Partial derivatives (gradient)
$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x^{(i)}_j + \frac{\lambda}{m}\theta_j$$

Vectorized gradient
$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}X^{T}\left(g(X\theta) - y\right) + \frac{\lambda}{m}\theta_j$$

Note that the bias parameter $\theta_0$ that we added ourselves is not regularized.

In [39]: def gradientReg(theta, reg, *args):
             m = y.size
             h = sigmoid(XX.dot(theta.reshape(-1,1)))
             grad = (1.0/m)*XX.T.dot(h-y) + (reg/m)*np.r_[[[0]], theta[1:].reshape(-1,1)]
             return(grad.flatten())

In [46]: initial_theta = np.zeros(XX.shape[1])
         costFunctionReg(initial_theta, 1, XX, y)
Out[46]: 0.69314718055994529

In [48]: fig, axes = plt.subplots(1, 3, sharey=True, figsize=(17,5))

         # Decision boundaries: let's see what happens when the regularization strength lambda is too small or too large
         # Lambda = 0   : no regularization at all, so the model overfits
         # Lambda = 1   : a sensible amount of regularization
         # Lambda = 100 : regularization is far too aggressive, so the fit barely produces a decision boundary
         for i, C in enumerate([0.0, 1.0, 100.0]):
             # Minimize costFunctionReg
             res2 = minimize(costFunctionReg, initial_theta, args=(C, XX, y), jac=gradientReg, options={'maxiter':3000})

             # Training accuracy
             accuracy = 100.0*sum(predict(res2.x, XX) == y.ravel())/y.size

             # Scatter plot of X, y
             plotData(data2, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0', axes.flatten()[i])

             # Plot the decision boundary
             x1_min, x1_max = X[:,0].min(), X[:,0].max(),
             x2_min, x2_max = X[:,1].min(), X[:,1].max(),
             xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
             h = sigmoid(poly.fit_transform(np.c_[xx1.ravel(), xx2.ravel()]).dot(res2.x))
             h = h.reshape(xx1.shape)
             axes.flatten()[i].contour(xx1, xx2, h, [0.5], linewidths=1, colors='g');
             axes.flatten()[i].set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), C))

In [ ]:
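To score a new microchip with the regularized model, the two raw test scores just need to go through the same degree-6 polynomial mapping before applying the sigmoid. The sketch below is illustrative only: the input values (0.2, 0.5) are made up, res_l1 is a hypothetical name for a Lambda = 1 fit done outside the loop above, and everything else (costFunctionReg, gradientReg, poly, predict) comes from the cells above.

# Sketch: score a new (hypothetical) microchip with Lambda = 1
res_l1 = minimize(costFunctionReg, initial_theta, args=(1.0, XX, y),
                  jac=gradientReg, options={'maxiter':3000})

new_chip = np.array([[0.2, 0.5]])                       # made-up Test 1 / Test 2 scores
new_chip_poly = poly.fit_transform(new_chip)            # same degree-6 feature mapping as training
print('P(y = 1):', sigmoid(new_chip_poly.dot(res_l1.x)))
print('Predicted label:', predict(res_l1.x, new_chip_poly))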