机器学习之逻辑回归(Logistic Regression)

  1.  
    """逻辑回归中的Sigmoid函数"""
  2.  
    import numpy as np
  3.  
    import matplotlib.pyplot as plt
  4.  
     
  5.  
    def sigmoid(t):
        """Return the logistic function 1 / (1 + e^(-t)), applied element-wise."""
        denominator = 1 + np.exp(-t)
        return 1 / denominator
  7.  
     
  8.  
    # Evaluate the sigmoid on 500 evenly spaced points in [-10, 10] and plot it.
    x = np.linspace(-10, 10, num=500)
    y = sigmoid(x)

    plt.plot(x, y)
    plt.show()

 结果:

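 逻辑回归损失函数的梯度:

 记 $\hat{y} = \sigma(X_b\theta)$,$\sigma(t) = \dfrac{1}{1+e^{-t}}$,$m$ 为样本数,则损失函数为

 $$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\hat{y}^{(i)} + \left(1-y^{(i)}\right)\log\left(1-\hat{y}^{(i)}\right)\right]$$

 其梯度为

 $$\nabla J(\theta) = \frac{1}{m}\,X_b^{T}\left(\sigma(X_b\theta) - y\right)$$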

 

 

 

 

 

 

 

 

 

逻辑回归算法: 

  1.  
    import numpy as np
  2.  
    from metrics import accuracy_score
  3.  
     
  4.  
    class LogisticRegression:
        """Binary logistic regression classifier trained by batch gradient descent.

        After ``fit``, ``intercept_`` holds the bias term, ``coef_`` the feature
        weights, and ``_theta`` the full parameter vector (intercept first).
        """

        def __init__(self):
            """Initialize an unfitted Logistic Regression model."""
            self.coef_ = None
            self.intercept_ = None
            self._theta = None

        def _sigmoid(self, t):
            """Return 1 / (1 + e^(-t)), applied element-wise."""
            return 1. / (1. + np.exp(-t))

        def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
            """Train the Logistic Regression model with batch gradient descent.

            Args:
                X_train: 2-D array with one row per sample.
                y_train: 1-D array of labels, one per row of X_train; the loss
                    uses ``y`` and ``1 - y``, and ``predict`` emits 0/1.
                eta: learning rate (step size).
                n_iters: maximum number of gradient steps.

            Returns:
                self, so calls can be chained.

            Raises:
                AssertionError: if X_train and y_train differ in sample count.
            """
            assert X_train.shape[0] == y_train.shape[0], \
                "the size of X_train must be equal to the size of y_train"

            def J(theta, X_b, y):
                """Return the mean cross-entropy loss of theta on (X_b, y)."""
                y_hat = self._sigmoid(X_b.dot(theta))
                # np.log(0) returns -inf (and 0 * -inf is nan) rather than
                # raising, so an except clause never fires. Keep y_hat strictly
                # inside (0, 1) so the loss stays finite when the sigmoid
                # saturates and the convergence test below keeps working.
                y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)

            def dJ(theta, X_b, y):
                """Return the gradient of the loss: X_b^T (sigmoid(X_b theta) - y) / m."""
                return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

            def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
                """Search for theta with batch gradient descent.

                Stops after n_iters steps, or earlier once a step changes the
                loss by less than epsilon.
                """
                theta = initial_theta
                cur_iter = 0
                # Carry the previous loss forward instead of recomputing it.
                last_loss = J(theta, X_b, y)

                while cur_iter < n_iters:
                    theta = theta - eta * dJ(theta, X_b, y)
                    loss = J(theta, X_b, y)
                    if abs(loss - last_loss) < epsilon:
                        break
                    last_loss = loss
                    cur_iter += 1

                return theta

            # Prepend a column of ones so theta[0] acts as the intercept.
            X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
            initial_theta = np.zeros(X_b.shape[1])
            self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

            self.intercept_ = self._theta[0]
            self.coef_ = self._theta[1:]

            return self

        def predict_proba(self, X_predict):
            """Return the sigmoid output (probability of label 1) for each row of X_predict.

            Raises:
                AssertionError: if the model is not fitted, or X_predict has a
                    different number of features than the training data.
            """
            assert self.intercept_ is not None and self.coef_ is not None, \
                "must fit before predict!"
            assert X_predict.shape[1] == len(self.coef_), \
                "the feature number of X_predict must be equal to X_train"

            X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
            return self._sigmoid(X_b.dot(self._theta))

        def predict(self, X_predict):
            """Return the predicted 0/1 label for each row of X_predict.

            A row is labelled 1 when its predicted probability is >= 0.5.
            Input validation (and its AssertionError) is done by predict_proba.
            """
            proba = self.predict_proba(X_predict)
            return np.array(proba >= 0.5, dtype='int')

        def score(self, X_test, y_test):
            """Return accuracy_score of the model's predictions on X_test against y_test."""
            y_predict = self.predict(X_test)
            return accuracy_score(y_test, y_predict)

        def __repr__(self):
            return "LogisticRegression()"
  1.  
    """实现逻辑回归"""
  2.  
    import numpy as np
  3.  
    import matplotlib.pyplot as plt
  4.  
    from sklearn import datasets
  5.  
     
  6.  
    # Load iris, then keep only the samples of classes 0 and 1 and the first
    # two feature columns, so the task is binary and can be drawn in 2-D.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    binary_mask = y < 2
    X = X[binary_mask, :2]
    y = y[binary_mask]

    for label, color in ((0, 'red'), (1, 'blue')):
        plt.scatter(X[y == label, 0], X[y == label, 1], color=color)
    plt.show()
  16.  
     
  17.  
    """使用逻辑回归"""
  18.  
    from model_selection import train_test_split
  19.  
    from LogisticRegression import LogisticRegression
  20.  
     
  21.  
    # Split with a fixed seed, train, then print the test accuracy followed by
    # the model's predict_proba output for the test samples.
    X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    print(log_reg.score(X_test, y_test))
    print(log_reg.predict_proba(X_test))

 结果:

  1.  
    E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
  2.  
    1.0
  3.  
    [0.92972035 0.98664939 0.14852024 0.17601199 0.0369836 0.0186637
  4.  
    0.04936918 0.99669244 0.97993941 0.74524655 0.04473194 0.00339285
  5.  
    0.26131273 0.0369836 0.84192923 0.79892262 0.82890209 0.32358166
  6.  
    0.06535323 0.20735334]
  7.  
     
  8.  
    Process finished with exit code 0

逻辑回归中的决策边界和添加多项式特征:

  1.  
    """在逻辑回归中添加多项式特征"""
  2.  
    import numpy as np
  3.  
    import matplotlib.pyplot as plt
  4.  
     
  5.  
    # Synthetic data: 100 standard-normal 2-D points, labelled 1 when they fall
    # inside the circle x0^2 + x1^2 < 1.5 (not linearly separable).
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(100, 2))
    squared_radius = X[:, 0] ** 2 + X[:, 1] ** 2
    y = np.array(squared_radius < 1.5, dtype='int')
  8.  
     
  9.  
     
  10.  
    """使用逻辑回归"""
  11.  
    from LogisticRegression import LogisticRegression
  12.  
     
  13.  
    # Baseline: logistic regression on the two raw features only.
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
  15.  
     
  16.  
    """绘制思路"""
  17.  
    def plot_decision_boundary(model, axis):
        """Shade a rectangular region of the plane by the class ``model`` predicts.

        Args:
            model: object whose ``predict`` accepts an (n, 2) array of points
                and returns one label per row.
            axis: ``[x0_min, x0_max, x1_min, x1_max]`` bounds of the region.
        """
        from matplotlib.colors import ListedColormap

        # Roughly 100 grid points per unit length along each axis.
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        zz = model.predict(X_new).reshape(x0.shape)

        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # contourf draws filled regions and does not use a 'linewidth' keyword
        # (matplotlib reports it as an unused kwarg), so none is passed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)
  28.  
     
  29.  
    # Draw the linear model's decision regions with the samples on top.
    plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
    for label in (0, 1):
        plt.scatter(X[y == label, 0], X[y == label, 1])
    plt.show()
  33.  
     
  34.  
     
  35.  
    """添加特征值,即升维"""
  36.  
    from sklearn.preprocessing import PolynomialFeatures
  37.  
    from sklearn.preprocessing import StandardScaler
  38.  
    from sklearn.pipeline import Pipeline
  39.  
    def PolynomialLogisticRegression(degree):
        """Build a pipeline: polynomial features -> standardization -> logistic regression.

        Args:
            degree: degree passed to PolynomialFeatures.
        """
        steps = [
            ('Poly', PolynomialFeatures(degree=degree)),
            ('std_scaler', StandardScaler()),
            ('Logistic', LogisticRegression()),
        ]
        return Pipeline(steps)
  45.  
    # Fit the degree-2 pipeline and draw its decision regions with the samples on top.
    poly_log_reg = PolynomialLogisticRegression(degree=2)
    poly_log_reg.fit(X, y)

    plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
    for label in (0, 1):
        plt.scatter(X[y == label, 0], X[y == label, 1])
    plt.show()

 结果:

 

 

  1.  
    """逻辑回归中使用正则化"""
  2.  
    import numpy as np
  3.  
    import matplotlib.pyplot as plt
  4.  
    from sklearn.model_selection import train_test_split
  5.  
    from sklearn.linear_model import LogisticRegression
  6.  
    from sklearn.preprocessing import StandardScaler
  7.  
    from sklearn.pipeline import Pipeline
  8.  
    from sklearn.preprocessing import PolynomialFeatures
  9.  
     
  10.  
    # 200 standard-normal 2-D points; label 1 where x0^2 + x1 < 1.5.
    np.random.seed(666)
    X = np.random.normal(0, 1, size=(200, 2))
    y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')

    # Label noise: 20 random draws (indices may repeat) are forced to class 1.
    for _ in range(20):
        y[np.random.randint(200)] = 1

    for label in (0, 1):
        plt.scatter(X[y == label, 0], X[y == label, 1])
    plt.show()
  18.  
     
  19.  
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    log_reg = LogisticRegression()
    # Fit on the training split only, so the held-out test rows are not
    # leaked into the model.
    log_reg.fit(X_train, y_train)
  22.  
    def plot_decision_boundary(model, axis):
        """Shade a rectangular region of the plane by the class ``model`` predicts.

        Args:
            model: object whose ``predict`` accepts an (n, 2) array of points
                and returns one label per row.
            axis: ``[x0_min, x0_max, x1_min, x1_max]`` bounds of the region.
        """
        from matplotlib.colors import ListedColormap

        # Roughly 100 grid points per unit length along each axis.
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        zz = model.predict(X_new).reshape(x0.shape)

        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # contourf draws filled regions and does not use a 'linewidth' keyword
        # (matplotlib reports it as an unused kwarg), so none is passed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)
  33.  
     
  34.  
    def PolynomialLogisticRegression(degree, C=1.0, penalty='l2', solver='liblinear'):
        """Build a pipeline: polynomial features -> standardization -> regularized logistic regression.

        Args:
            degree: degree passed to PolynomialFeatures.
            C: inverse regularization strength passed to LogisticRegression.
            penalty: regularization type passed to LogisticRegression
                ('l1' or 'l2').
            solver: optimizer passed to LogisticRegression. Defaults to
                'liblinear' because it accepts both 'l1' and 'l2'; the current
                scikit-learn default ('lbfgs') rejects penalty='l1'.
        """
        return Pipeline([
            ('Poly', PolynomialFeatures(degree=degree)),
            ('std_scaler', StandardScaler()),
            ('Logistic', LogisticRegression(C=C, penalty=penalty, solver=solver)),
        ])
  40.  
    # Degree-20 features with L1 regularization (C=0.1), trained on the training split.
    poly_log_reg = PolynomialLogisticRegression(degree=20, C=0.1, penalty='l1')
    poly_log_reg.fit(X_train, y_train)

    # Decision regions of the regularized model, with all samples drawn on top.
    plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
    for label in (0, 1):
        plt.scatter(X[y == label, 0], X[y == label, 1])
    plt.show()

 结果:

应用OVR和OVO使逻辑回归处理多分类问题
 

  1.  
    """OVR和OVO"""
  2.  
    #为了数据可视化方便,我们只使用鸢尾花数据集的前两列特征
  3.  
    from sklearn import datasets
  4.  
    from sklearn.linear_model import LogisticRegression
  5.  
    from sklearn.model_selection import train_test_split
  6.  
    import matplotlib.pyplot as plt
  7.  
    import numpy as np
  8.  
     
  9.  
    # Only the first two feature columns are kept so the result can be drawn in 2-D.
    iris = datasets.load_iris()
    X = iris['data'][:, :2]
    y = iris['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # log_reg = LogisticRegression(multi_class='ovr')
    # multi_class selects the multi-class strategy: 'ovr' (one-vs-rest) here;
    # scikit-learn's other option is 'multinomial'. One-vs-one is not a
    # multi_class value; it is obtained by wrapping with OneVsOneClassifier.
    # NOTE(review): recent scikit-learn releases deprecate multi_class; confirm
    # against the installed version.
    # Accuracy is modest because only two features are used.
    log_reg = LogisticRegression(multi_class='ovr', solver='newton-cg')
    log_reg.fit(X_train, y_train)
    log_reg.score(X_test, y_test)
  19.  
    def plot_decision_boundary(model, axis):
        """Shade a rectangular region of the plane by the class ``model`` predicts.

        Args:
            model: object whose ``predict`` accepts an (n, 2) array of points
                and returns one label per row.
            axis: ``[x0_min, x0_max, x1_min, x1_max]`` bounds of the region.
        """
        from matplotlib.colors import ListedColormap

        # Roughly 100 grid points per unit length along each axis.
        x0, x1 = np.meshgrid(
            np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
            np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
        )
        X_new = np.c_[x0.ravel(), x1.ravel()]
        zz = model.predict(X_new).reshape(x0.shape)

        custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
        # contourf draws filled regions and does not use a 'linewidth' keyword
        # (matplotlib reports it as an unused kwarg), so none is passed.
        plt.contourf(x0, x1, zz, cmap=custom_cmap)
  30.  
     
  31.  
    # Decision regions of the three-class model, with each class drawn on top.
    plot_decision_boundary(log_reg, axis=[4, 8.5, 1.5, 4.5])
    for label in (0, 1, 2):
        plt.scatter(X[y == label, 0], X[y == label, 1])
    plt.show()
  36.  
     
  37.  
     
  38.  
     
  39.  
    """使用全部数据 OVR and OVO"""
  40.  
    from sklearn.multiclass import OneVsOneClassifier
  41.  
    from sklearn.multiclass import OneVsRestClassifier
  42.  
     
  43.  
    from sklearn import datasets
  44.  
    from sklearn.linear_model import LogisticRegression
  45.  
    from sklearn.model_selection import train_test_split
  46.  
    # Reload iris, this time keeping every feature column.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

    # One-vs-rest: the wrapper takes a binary classifier as its argument.
    ovr = OneVsRestClassifier(log_reg)
    ovr.fit(X_train, y_train)
    print(ovr.score(X_test, y_test))

    # One-vs-one, wrapping the same classifier.
    ovo = OneVsOneClassifier(log_reg)
    ovo.fit(X_train, y_train)
    print(ovo.score(X_test, y_test))

结果:

  1.  
    E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
  2.  
    E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
  3.  
    s)
  4.  
    0.9736842105263158
  5.  
    1.0
  6.  
     
  7.  
    Process finished with exit code 0

 

 

posted @ 2018-08-11 12:22  生活百科  阅读(1149)  评论(0编辑  收藏  举报