线性回归和逻辑回归LR代码

# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import axes3d

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline  

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')
# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
​
from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import axes3d
​
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline  
​
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')
/Library/Python/2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/Library/Python/2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)
熟悉numpy
In [2]:

def warmUpExercise():
    return(np.identity(5))
In [3]:

warmUpExercise()
Out[3]:
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])
单变量线性回归
In [4]:

data = np.loadtxt('linear_regression_data1.txt', delimiter=',')
​
X = np.c_[np.ones(data.shape[0]),data[:,0]]
y = np.c_[data[:,1]]
In [5]:

plt.scatter(X[:,1], y, s=30, c='r', marker='x', linewidths=1)
plt.xlim(4,24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s');

梯度下降
In [31]:

# 计算损失函数
def computeCost(X, y, theta=[[0],[0]]):
    m = y.size
    J = 0
    
    h = X.dot(theta)
    
    J = 1.0/(2*m)*(np.sum(np.square(h-y)))
    
    return J
In [32]:

computeCost(X,y)
Out[32]:
32.072733877455676
In [3]:

# 梯度下降
def gradientDescent(X, y, theta=[[0],[0]], alpha=0.01, num_iters=1500):
    m = y.size
    J_history = np.zeros(num_iters)
    
    for iter in np.arange(num_iters):
        h = X.dot(theta)
        theta = theta - alpha*(1.0/m)*(X.T.dot(h-y))
        J_history[iter] = computeCost(X, y, theta)
    return(theta, J_history)
In [2]:

# 画出每一次迭代和损失函数变化
theta , Cost_J = gradientDescent(X, y)
print('theta: ',theta.ravel())
​
plt.plot(Cost_J)
plt.ylabel('Cost J')
plt.xlabel('Iterations');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-699228c1adc0> in <module>
      1 # 画出每一次迭代和损失函数变化
----> 2 theta , Cost_J = gradientDescent(X, y)
      3 print('theta: ',theta.ravel())
      4 
      5 plt.plot(Cost_J)

NameError: name 'gradientDescent' is not defined

In [38]:

xx = np.arange(5,23)
yy = theta[0]+theta[1]*xx
​
# 画出我们自己写的线性回归梯度下降收敛的情况
plt.scatter(X[:,1], y, s=30, c='r', marker='x', linewidths=1)
plt.plot(xx,yy, label='Linear regression (Gradient descent)')
​
# 和Scikit-learn中的线性回归对比一下 
regr = LinearRegression()
regr.fit(X[:,1].reshape(-1,1), y.ravel())
plt.plot(xx, regr.intercept_+regr.coef_*xx, label='Linear regression (Scikit-learn GLM)')
​
plt.xlim(4,24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.legend(loc=4);

In [39]:

# 预测一下人口为35000和70000的城市的结果
print(theta.T.dot([1, 3.5])*10000)
print(theta.T.dot([1, 7])*10000)
[ 4519.7678677]
[ 45342.45012945]
逻辑斯特回归示例
逻辑斯特回归
正则化后的逻辑斯特回归
In [1]:

# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
​
from scipy.optimize import minimize
​
from sklearn.preprocessing import PolynomialFeatures
​
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline
​
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')
/Library/Python/2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/Library/Python/2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)
In [11]:

def loaddata(file, delimeter):
    data = np.loadtxt(file, delimiter=delimeter)
    print('Dimensions: ',data.shape)
    print(data[1:6,:])
    return(data)
In [12]:

def plotData(data, label_x, label_y, label_pos, label_neg, axes=None):
    # 获得正负样本的下标(即哪些是正样本,哪些是负样本)
    neg = data[:,2] == 0
    pos = data[:,2] == 1
    
    if axes == None:
        axes = plt.gca()
    axes.scatter(data[pos][:,0], data[pos][:,1], marker='+', c='k', s=60, linewidth=2, label=label_pos)
    axes.scatter(data[neg][:,0], data[neg][:,1], c='y', s=60, label=label_neg)
    axes.set_xlabel(label_x)
    axes.set_ylabel(label_y)
    axes.legend(frameon= True, fancybox = True);
逻辑斯特回归
In [13]:

data = loaddata('data1.txt', ',')
('Dimensions: ', (100, 3))
[[ 30.28671077  43.89499752   0.        ]
 [ 35.84740877  72.90219803   0.        ]
 [ 60.18259939  86.3085521    1.        ]
 [ 79.03273605  75.34437644   1.        ]
 [ 45.08327748  56.31637178   0.        ]]
In [14]:

X = np.c_[np.ones((data.shape[0],1)), data[:,0:2]]
y = np.c_[data[:,2]]
In [15]:

plotData(data, 'Exam 1 score', 'Exam 2 score', 'Pass', 'Fail')

逻辑斯特回归假设
ℎ𝜃(𝑥)=𝑔(𝜃𝑇𝑥)
hθ(x)=g(θTx)
𝑔(𝑧)=11+𝑒−𝑧
g(z)=11+e−z
In [16]:

#定义sigmoid函数
def sigmoid(z):
    return(1 / (1 + np.exp(-z)))
其实scipy包里有一个函数可以完成一样的功能:
http://docs.scipy.org/doc/scipy/reference/generated/scipy.special.expit.html#scipy.special.expit
损失函数
𝐽(𝜃)=1𝑚∑𝑖=1𝑚[−𝑦(𝑖)𝑙𝑜𝑔(ℎ𝜃(𝑥(𝑖)))−(1−𝑦(𝑖))𝑙𝑜𝑔(1−ℎ𝜃(𝑥(𝑖)))]
J(θ)=1m∑i=1m[−y(i)log(hθ(x(i)))−(1−y(i))log(1−hθ(x(i)))]
向量化的损失函数(矩阵形式)
𝐽(𝜃)=1𝑚((𝑙𝑜𝑔(𝑔(𝑋𝜃))𝑇𝑦+(𝑙𝑜𝑔(1−𝑔(𝑋𝜃))𝑇(1−𝑦))
J(θ)=1m((log(g(Xθ))Ty+(log(1−g(Xθ))T(1−y))
In [20]:

#定义损失函数
def costFunction(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta))
    
    J = -1.0*(1.0/m)*(np.log(h).T.dot(y)+np.log(1-h).T.dot(1-y))
               
    if np.isnan(J[0]):
        return(np.inf)
    return J[0]
求偏导(梯度)
𝛿𝐽(𝜃)𝛿𝜃𝑗=1𝑚∑𝑖=1𝑚(ℎ𝜃(𝑥(𝑖))−𝑦(𝑖))𝑥(𝑖)𝑗
δJ(θ)δθj=1m∑i=1m(hθ(x(i))−y(i))xj(i)
向量化的偏导(梯度)
𝛿𝐽(𝜃)𝛿𝜃𝑗=1𝑚𝑋𝑇(𝑔(𝑋𝜃)−𝑦)
δJ(θ)δθj=1mXT(g(Xθ)−y)
In [18]:

#求解梯度
def gradient(theta, X, y):
    m = y.size
    h = sigmoid(X.dot(theta.reshape(-1,1)))
    
    grad =(1.0/m)*X.T.dot(h-y)
​
    return(grad.flatten())
In [19]:

initial_theta = np.zeros(X.shape[1])
cost = costFunction(initial_theta, X, y)
grad = gradient(initial_theta, X, y)
print('Cost: \n', cost)
print('Grad: \n', grad)
('Cost: \n', 0.69314718055994518)
('Grad: \n', array([ -0.1       , -12.00921659, -11.26284221]))
最小化损失函数
In [22]:

res = minimize(costFunction, initial_theta, args=(X,y), jac=gradient, options={'maxiter':400})
res
Out[22]:
   status: 0
  success: True
     njev: 28
     nfev: 28
 hess_inv: array([[  3.24739469e+03,  -2.59380769e+01,  -2.63469561e+01],
       [ -2.59380769e+01,   2.21449124e-01,   1.97772068e-01],
       [ -2.63469561e+01,   1.97772068e-01,   2.29018831e-01]])
      fun: 0.20349770158944075
        x: array([-25.16133401,   0.20623172,   0.2014716 ])
  message: 'Optimization terminated successfully.'
      jac: array([ -2.73305312e-10,   1.43144026e-07,  -1.58965802e-07])
做一下预测吧
In [23]:

def predict(theta, X, threshold=0.5):
    p = sigmoid(X.dot(theta.T)) >= threshold
    return(p.astype('int'))
咱们来看看考试1得分45,考试2得分85的同学通过概率有多高
In [24]:

sigmoid(np.array([1, 45, 85]).dot(res.x.T))
Out[24]:
0.77629066133254787
画决策边界
In [25]:

plt.scatter(45, 85, s=60, c='r', marker='v', label='(45, 85)')
plotData(data, 'Exam 1 score', 'Exam 2 score', 'Admitted', 'Not admitted')
x1_min, x1_max = X[:,1].min(), X[:,1].max(),
x2_min, x2_max = X[:,2].min(), X[:,2].max(),
xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
h = sigmoid(np.c_[np.ones((xx1.ravel().shape[0],1)), xx1.ravel(), xx2.ravel()].dot(res.x))
h = h.reshape(xx1.shape)
plt.contour(xx1, xx2, h, [0.5], linewidths=1, colors='b');

加正则化项的逻辑斯特回归
In [1]:

data2 = loaddata('data2.txt', ',')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-27484805db53> in <module>
----> 1 data2 = loaddata('data2.txt', ',')

NameError: name 'loaddata' is not defined

In [42]:

# 拿到X和y
y = np.c_[data2[:,2]]
X = data2[:,0:2]
In [43]:

# 画个图
plotData(data2, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0')

咱们整一点多项式特征出来(最高6阶)
In [44]:

poly = PolynomialFeatures(6)
XX = poly.fit_transform(data2[:,0:2])
# 看看形状(特征映射后x有多少维了)
XX.shape
Out[44]:
(118, 28)
正则化后损失函数
𝐽(𝜃)=1𝑚∑𝑖=1𝑚[−𝑦(𝑖)𝑙𝑜𝑔(ℎ𝜃(𝑥(𝑖)))−(1−𝑦(𝑖))𝑙𝑜𝑔(1−ℎ𝜃(𝑥(𝑖)))]+𝜆2𝑚∑𝑗=1𝑛𝜃2𝑗
J(θ)=1m∑i=1m[−y(i)log(hθ(x(i)))−(1−y(i))log(1−hθ(x(i)))]+λ2m∑j=1nθj2
向量化的损失函数(矩阵形式)
𝐽(𝜃)=1𝑚((𝑙𝑜𝑔(𝑔(𝑋𝜃))𝑇𝑦+(𝑙𝑜𝑔(1−𝑔(𝑋𝜃))𝑇(1−𝑦))+𝜆2𝑚∑𝑗=1𝑛𝜃2𝑗
J(θ)=1m((log(g(Xθ))Ty+(log(1−g(Xθ))T(1−y))+λ2m∑j=1nθj2
In [45]:

# 定义损失函数
def costFunctionReg(theta, reg, *args):
    m = y.size
    h = sigmoid(XX.dot(theta))
    
    J = -1.0*(1.0/m)*(np.log(h).T.dot(y)+np.log(1-h).T.dot(1-y)) + (reg/(2.0*m))*np.sum(np.square(theta[1:]))
    
    if np.isnan(J[0]):
        return(np.inf)
    return(J[0])
偏导(梯度)
𝛿𝐽(𝜃)𝛿𝜃𝑗=1𝑚∑𝑖=1𝑚(ℎ𝜃(𝑥(𝑖))−𝑦(𝑖))𝑥(𝑖)𝑗+𝜆𝑚𝜃𝑗
δJ(θ)δθj=1m∑i=1m(hθ(x(i))−y(i))xj(i)+λmθj
向量化的偏导(梯度)
𝛿𝐽(𝜃)𝛿𝜃𝑗=1𝑚𝑋𝑇(𝑔(𝑋𝜃)−𝑦)+𝜆𝑚𝜃𝑗
δJ(θ)δθj=1mXT(g(Xθ)−y)+λmθj
注意,我们另外自己加的参数 𝜃0 不需要被正则化
注意,我们另外自己加的参数 θ0 不需要被正则化
In [39]:

def gradientReg(theta, reg, *args):
    m = y.size
    h = sigmoid(XX.dot(theta.reshape(-1,1)))
      
    grad = (1.0/m)*XX.T.dot(h-y) + (reg/m)*np.r_[[[0]],theta[1:].reshape(-1,1)]
        
    return(grad.flatten())
In [46]:

initial_theta = np.zeros(XX.shape[1])
costFunctionReg(initial_theta, 1, XX, y)
Out[46]:
0.69314718055994529
In [48]:

fig, axes = plt.subplots(1,3, sharey = True, figsize=(17,5))
​
# 决策边界,咱们分别来看看正则化系数lambda太大太小分别会出现什么情况
# Lambda = 0 : 就是没有正则化,这样的话,就过拟合咯
# Lambda = 1 : 这才是正确的打开方式
# Lambda = 100 : 卧槽,正则化项太激进,导致基本就没拟合出决策边界
for i, C in enumerate([0.0, 1.0, 100.0]):
    # 最优化 costFunctionReg
    res2 = minimize(costFunctionReg, initial_theta, args=(C, XX, y), jac=gradientReg, options={'maxiter':3000})
    
    # 准确率
    accuracy = 100.0*sum(predict(res2.x, XX) == y.ravel())/y.size    
​
    # 对X,y的散列绘图
    plotData(data2, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0', axes.flatten()[i])
    
    # 画出决策边界
    x1_min, x1_max = X[:,0].min(), X[:,0].max(),
    x2_min, x2_max = X[:,1].min(), X[:,1].max(),
    xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
    h = sigmoid(poly.fit_transform(np.c_[xx1.ravel(), xx2.ravel()]).dot(res2.x))
    h = h.reshape(xx1.shape)
    axes.flatten()[i].contour(xx1, xx2, h, [0.5], linewidths=1, colors='g');       
    axes.flatten()[i].set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), C))

In [ ]:

​

 

posted @ 2019-01-04 09:52  吕晓宁  阅读(829)  评论(0编辑  收藏  举报