机器学习实战-逻辑回归

逻辑回归:简单来说,就是在线性回归的基础上加入了 Sigmoid 函数,把线性输出映射到 (0, 1) 区间,作为样本属于正类的概率。

 

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Load the data set
def loadData(filename):
    """Read a whitespace-delimited sample file into feature rows and labels.

    Every non-blank line is split on whitespace; the first two fields are
    parsed as floats and the third as an int class label. A constant 1 is
    prepended to each feature row so the first weight acts as the intercept.
    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of aborting the load.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple where ``dataMat`` is a list of
        ``[1, x1, x2]`` rows and ``labelMat`` is the list of int labels in
        the same order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be parsed as a number.
    """
    dataMat = []
    labelMat = []
    with open(filename) as f:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of lines.
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            dataMat.append([1, float(fields[0]), float(fields[1])])
            labelMat.append(int(fields[2]))
    return dataMat, labelMat

# Scatter-plot the data set
def plot(dataMat,labelMat):
    """Show a scatter plot of the samples, split by class label.

    Samples whose label equals 1 are drawn as red squares; every other
    sample is drawn with the default marker in green. Columns 1 and 2 of
    each row in ``dataMat`` supply the horizontal and vertical coordinates.

    Args:
        dataMat: Indexable rows; elements 1 and 2 of each row are plotted.
        labelMat: Labels, one per row of ``dataMat``.
    """
    ones_x, ones_y = [], []
    rest_x, rest_y = [], []
    for idx, label in enumerate(labelMat):
        # Route the point to the coordinate lists of its class.
        xs, ys = (ones_x, ones_y) if label == 1 else (rest_x, rest_y)
        xs.append(dataMat[idx][1])
        ys.append(dataMat[idx][2])

    plt.scatter(ones_x, ones_y, c='red', s=20, alpha=0.5, marker='s')
    plt.scatter(rest_x, rest_y, c='green', s=20, alpha=0.5)
    plt.title('DataSet')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()

# Train logistic regression with stochastic gradient ascent
def stogradAscent(dataMat,labelMat,num_iter=150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Each of the ``num_iter`` passes visits every sample exactly once in a
    random order (sampling without replacement within a pass). The weights
    are updated after every single sample with a step size that shrinks as
    the pass number and the position inside the pass grow, but never drops
    below 0.001.

    Args:
        dataMat: 2-D sequence of feature rows (m rows, n columns).
        labelMat: Sequence of m labels, indexable by row number.
        num_iter: Number of full passes over the data.

    Returns:
        A ``(weights, weights_array)`` tuple: ``weights`` is the final 1-D
        array of n weights, ``weights_array`` has shape
        ``(num_iter * m, n)`` and holds the weights after every update.
    """
    dataMat = np.array(dataMat, dtype=float)
    m, n = np.shape(dataMat)          # m samples, n features
    weights = np.ones(n)              # parameters to optimise, start at 1
    # Pre-allocate the history; appending inside the loop would copy the
    # whole array on every update.
    weights_array = np.empty((num_iter * m, n))
    step = 0
    for j in range(num_iter):
        dataIndex = list(range(m))    # samples not yet used in this pass
        for i in range(m):
            alpha = 1 / (i + j + 1.0) + 0.001
            position = int(np.random.uniform(0, len(dataIndex)))
            # Translate the drawn position into the real row number and
            # remove it from the pool. Indexing the data with the position
            # itself would keep revisiting low-numbered rows as the pool
            # shrinks and would skip others entirely.
            sample = dataIndex.pop(position)
            error = labelMat[sample] - sigmoid(np.dot(dataMat[sample], weights))  # scalar
            weights = weights + alpha * error * dataMat[sample]
            weights_array[step] = weights
            step += 1
    return weights, weights_array

# Train logistic regression with batch gradient ascent
def gradAscent(dataMat,labelMat,alpha=0.001,maxiter=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Every iteration uses all samples at once: the prediction error of the
    current weights is computed for the whole data set and the weights are
    moved by ``alpha`` times the feature matrix transposed times that error.

    Args:
        dataMat: 2-D sequence of feature rows (m rows, n columns).
        labelMat: Sequence of m labels.
        alpha: Fixed step size of each update.
        maxiter: Number of update iterations.

    Returns:
        A ``(weights, weights_array)`` tuple: ``weights`` is the final
        ``(n, 1)`` array, ``weights_array`` has shape ``(maxiter, n)`` and
        holds the weights after every iteration.
    """
    # Plain ndarrays with explicit dot products replace np.mat, which NumPy
    # no longer recommends.
    features = np.asarray(dataMat, dtype=float)
    labels = np.asarray(labelMat, dtype=float).reshape(-1, 1)   # column vector, m x 1
    n = features.shape[1]             # number of columns / weights
    weights = np.ones((n, 1))         # parameters to optimise, start at 1
    weights_array = np.empty((maxiter, n))
    for i in range(maxiter):
        error = labels - sigmoid(features.dot(weights))         # m x 1
        weights = weights + alpha * features.T.dot(error)       # n x 1
        weights_array[i] = weights.ravel()
    return weights, weights_array

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` using
    ``np.logaddexp``, which evaluates ``log(exp(0) + exp(-x))`` without
    forming ``exp(-x)`` directly. This avoids the floating-point overflow
    that the naive formula hits for large negative ``x`` while giving the
    same result, including in both tails.

    Args:
        x: A scalar, ndarray or matrix that supports unary minus.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))

def _plot_weight_history(axes_column, history, title, font):
    """Plot each weight of ``history`` against the update index, one subplot per weight."""
    iterations = np.arange(0, len(history), 1)
    last_row = len(axes_column) - 1
    for row, ax in enumerate(axes_column):
        ax.plot(iterations, history[:, row])
        # Label every row with the weight it actually shows (W0, W1, W2).
        texts = [ax.set_ylabel(u'W%d' % row, fontproperties=font)]
        if row == 0:
            texts.append(ax.set_title(title, fontproperties=font))
        if row == last_row:
            texts.append(ax.set_xlabel(u'迭代次数', fontproperties=font))
        plt.setp(texts, size=20, weight='bold', color='black')


def plotWeights(weights_array1,weights_array2):
    """Show how the three weights evolve for two training runs side by side.

    The figure is a 3 x 2 grid: the left column plots columns 0, 1 and 2 of
    ``weights_array1`` (titled as the gradient-ascent run), the right column
    does the same for ``weights_array2`` (titled as the improved stochastic
    gradient-ascent run). The x axis of every subplot is the row index of
    the history array.

    Args:
        weights_array1: 2-D array with at least three columns, one row per
            recorded update.
        weights_array2: Same layout, for the second run.
    """
    import os  # local import: the module's top-level imports do not include os

    # Use the SimSun font for the Chinese titles when it is installed;
    # otherwise fall back to Matplotlib's default font so the figure can
    # still be drawn on systems without that file.
    font_path = r"C:\Windows\Fonts\simsun.ttc"
    if os.path.isfile(font_path):
        font = FontProperties(fname=font_path, size=14)
    else:
        font = FontProperties(size=14)

    # 3 rows x 2 columns, no shared axes; axs[r][c] is row r, column c.
    fig, axs = plt.subplots(nrows=3, ncols=2, sharex=False, sharey=False, figsize=(20,10))
    # The keyword must be lowercase ``fontproperties``: Matplotlib matches
    # text property names case-sensitively.
    _plot_weight_history(axs[:, 0], weights_array1,
                         u'梯度上升算法:回归系数与迭代次数关系', font)
    _plot_weight_history(axs[:, 1], weights_array2,
                         u'改进的随机梯度上升算法:回归系数与迭代次数关系', font)

    plt.show()
def plotBestFit(weights,dataMat,labelMat):
    """Draw the samples together with the line defined by ``weights``.

    Samples labelled 1 are drawn as red squares, all others in green. The
    line is the set of points where ``w0 + w1*x1 + w2*x2`` equals zero,
    evaluated for x1 from -3 up to (but excluding) 3 in steps of 0.1.

    Args:
        weights: Indexable with at least three entries (w0, w1, w2).
        dataMat: Indexable rows; elements 1 and 2 of each row are plotted.
        labelMat: Labels, one per row of ``dataMat``.
    """
    ones_x, ones_y = [], []
    rest_x, rest_y = [], []
    for idx, label in enumerate(labelMat):
        xs, ys = (ones_x, ones_y) if label == 1 else (rest_x, rest_y)
        xs.append(dataMat[idx][1])
        ys.append(dataMat[idx][2])

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(ones_x, ones_y, c='red', s=20, alpha=0.5, marker='s')
    axes.scatter(rest_x, rest_y, c='green', s=20, alpha=0.5)

    # Solve w0 + w1*x1 + w2*x2 = 0 for x2: slope -w1/w2, intercept -w0/w2.
    slope = - weights[1] / weights[2]
    intercept = -weights[0] / weights[2]
    line_x = np.arange(-3, 3, 0.1)
    line_y = slope * line_x + intercept
    axes.plot(line_x, line_y)
    plt.show()


if __name__ == "__main__":
    # Read the samples; loadData prepends the constant column to each row.
    features, labels = loadData("testSet.txt")

    # Stochastic gradient ascent: fit, draw the resulting line, print the weights.
    sgd_weights, sgd_history = stogradAscent(features, labels)
    plotBestFit(sgd_weights, features, labels)
    print(sgd_weights)

    # Batch gradient ascent: only its weight history is used, for the
    # side-by-side comparison (batch run on the left, stochastic on the right).
    batch_weights, batch_history = gradAscent(features, labels)
    plotWeights(batch_history, sgd_history)

 

posted on 2018-12-22 23:01  Magic_chao  阅读(355)  评论(0)  编辑  收藏  举报

导航