【机器学习实战】--第五章Logistic回归完整代码及注释

可参考博客:

https://blog.csdn.net/rujin_shi/article/details/78997271?utm_medium=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase&depth_1-utm_source=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase

完整代码如下:

 

  1 import numpy as np
  2 import matplotlib.pyplot as plt
  3 # 参考https://blog.csdn.net/rujin_shi/article/details/78997271?utm_medium=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase&depth_1-utm_source=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase
  4 
  5 def loadDataSet():
  6     dataMat = []
  7     labelMat = []
  8     fr = open('../machinelearninginaction/Ch05/testSet.txt')
  9     for line in fr.readlines():
 10         lineArr = line.strip().split()
 11         dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # 存放3个特征X0,X1,X2
 12         labelMat.append(float(lineArr[2]))  # 存放标签
 13     return dataMat, labelMat
 14 
 15 
 16 def sigmoid(inX):
 17     return 1.0/(1+np.exp(-inX))
 18 
 19 
 20 def gradAscent(dataMatIn, classLabels):  # 梯度上升法
 21     dataMatrix = np.mat(dataMatIn)  # 转换为numpy矩阵
 22     labelMat = np.mat(classLabels).transpose()  # 转置为列向量
 23     m, n = np.shape(dataMatrix)  # 矩阵形状
 24     alpha = 0.01  # 向目标移动的步长
 25     maxCycles = 500  # 最大迭代次数
 26     weights = np.ones((n,1))  # 初始化参数为1
 27     for k in range(maxCycles):
 28         h = sigmoid(dataMatrix * weights)
 29         error = labelMat - h  # 为什么要这样计算?
 30         weights = weights + alpha * dataMatrix.transpose() * error
 31     return weights
 32 
 33 
 34 def stocGradAscent0(dataMatrix, classLabels):  # 随机梯度上升法
 35     m, n = np.shape(dataMatrix)
 36     alpha = 0.01
 37     weights = np.ones(n)
 38     for i in range(m):
 39         product = dataMatrix[i]*weights  # 对应元素相乘
 40         sum0 = sum(product)  # 将所有元素相加
 41         h = sigmoid(sum0)  # 求取sigmoid值
 42         error = classLabels[i] - h  # 做差
 43         weights = weights + alpha * error * dataMatrix[i]  # 更新权重
 44     return weights
 45 
 46 
 47 def stocGradAscent1(dataMatrix, classLabels, numIter=150):  # 改进的随机梯度上升法
 48     m,n = np.shape(dataMatrix)  # m为行,n为列
 49     weights = np.ones(n)
 50     for j in range(numIter):  # 默认迭代150次
 51         dataIndex = list(range(m))  # 这里要将range(m)改为list(range(m)),否则报错
 52         for i in range(m):
 53             alpha = 4/(1.0+j+i)+0.01  # 动态调整alpha
 54             randIndex = int(np.random.uniform(0, len(dataIndex)))  # 随机在0到len(dataIndex)之间选择一个数
 55             h = sigmoid(sum(dataMatrix[randIndex]*weights))  # 做sigmoid
 56             error = classLabels[randIndex] - h  # 做差
 57             weights = weights + alpha * error *dataMatrix[randIndex]  # 更新权重
 58             del(dataIndex[randIndex])  # 删除该值进行下一次迭代
 59     return weights
 60 
 61 
 62 def classifyVector(inX, weights):
 63     prob = sigmoid(sum(inX*weights))  # 做sigmoid
 64     if prob > 0.5:  # 大于0.5时输出类别为1
 65         return 1.0
 66     else:  # 小于0.5时输出类别为0
 67         return 0.0
 68 
 69 
 70 def colicTest():
 71     frTrain = open("../machinelearninginaction/Ch05/horseColicTraining.txt")  # 读取训练文件
 72     frTest = open("../machinelearninginaction/Ch05/horseColicTest.txt")  # 读取测试文件
 73     trainingSet = []; trainingLabels = []  # 设置两个空列表分别存放训练数据和训练标签
 74     for line in frTrain.readlines():  # 读取训练文件的每一行
 75         currLine = line.strip().split('\t')  # 删除当前行的行首行尾空格,并按照Tab\t分隔数据
 76         lineArr = []  # 建立空列表存放训练数据,每次循环都重新置为空
 77         for i in range(21):  # 有21个特征
 78             lineArr.append(float(currLine[i]))  # 将每个特征都存放在lineArr列表中
 79         trainingSet.append(lineArr)  # 将当前行的21个特征组成的列表添加到trainingSet中作为求解最佳拟合参数的输入数据
 80         trainingLabels.append(float(currLine[21]))  # 将当前行的标签添加到trainingLabels中
 81     trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)  # 使用改进的随机梯度上升法求解最佳拟合参数
 82     errorCount = 0; numTestVec = 0.0  # 定义错误数量和测试向量的数量
 83     for line in frTest.readlines():  # 读取测试数据的每一行
 84         numTestVec += 1.0  # 累计测试样本的个数
 85         currLine = line.strip().split('\t')  # 删除空格分隔所有数据
 86         lineArr = []  # 用于存放测试数据的21个特征
 87         for i in range(21):
 88             lineArr.append(float(currLine[i]))  # 将测试数据的21个特征逐个添加到lineArr列表中
 89         if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
 90             # 如果分类错误,errorCount加1
 91             errorCount += 1
 92     errorRate = (float(errorCount)/numTestVec)  # 计算错误率,分类错误的数量除以测试样本的总数
 93     print('the error rate of this is: %f' % errorRate)
 94     return errorRate
 95 
 96 
 97 def multiTest():
 98     numTests = 10; errorSum = 0.0  # 定义测试次数及错误数量的总和
 99     for k in range(numTests):
100         errorSum += colicTest()  # 将每次测试的错误率相加求平均值
101     print('after %d iterations the average error rate is: %f'%(numTests, errorSum/float(numTests)))
102 
103 
def plotBestFit(dataArr, weights, classLabels=None):
    """Scatter-plot the two classes and draw the fitted decision boundary.

    Args:
        dataArr: 2-D sequence of ``[x0, x1, x2]`` rows; columns 1 and 2 are
            plotted as X1 and X2.
        weights: The three fitted weights. A flat array, an ``(3, 1)`` array
            or a ``numpy.matrix`` are all accepted.
        classLabels: Sequence with one label per row of ``dataArr``. When
            omitted, the module-level ``labelMat`` is used, which is what
            this function relied on before the parameter existed.
    """
    if classLabels is None:
        # Backward compatibility for callers that pass only two arguments.
        classLabels = labelMat
    dataArr = np.array(dataArr)
    # Flatten so scalar indexing works regardless of the container type.
    weights = np.asarray(weights, dtype=float).ravel()
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(classLabels[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)  # "111" = 1x1 grid, first subplot
    # Class 1 samples: red triangles; every other sample: green dots.
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='^')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # Boundary where z = w0*x0 + w1*x1 + w2*x2 = 0 (sigmoid(z) = 0.5) with
    # x0 = 1, solved for x2.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
126 
127 
if __name__ == '__main__':
    # labelMat must keep this module-level name: plotBestFit reads it.
    dataArr, labelMat = loadDataSet()
    dataArray = np.array(dataArr)

    # Batch gradient ascent; the result is a numpy matrix, so convert it
    # with .getA() before plotting: plotBestFit(dataArr, weights_0.getA())
    weights_0 = gradAscent(dataArr, labelMat)

    # Single-pass stochastic gradient ascent.
    # To visualise: plotBestFit(dataArr, weights_1)
    weights_1 = stocGradAscent0(dataArray, labelMat)

    # Improved stochastic gradient ascent.
    # To visualise: plotBestFit(dataArr, weights_2)
    weights_2 = stocGradAscent1(dataArray, labelMat)

    # Train and evaluate on the horse-colic data ten times.
    multiTest()

 

运行结果如下:

 1 the error rate of this is: 0.402985
 2 the error rate of this is: 0.432836
 3 the error rate of this is: 0.328358
 4 the error rate of this is: 0.373134
 5 the error rate of this is: 0.268657
 6 the error rate of this is: 0.358209
 7 the error rate of this is: 0.343284
 8 the error rate of this is: 0.477612
 9 the error rate of this is: 0.268657
10 the error rate of this is: 0.298507
11 after 10 iterations the average error rate is: 0.355224

 

posted @ 2020-06-09 23:09  DJames23  阅读(434)  评论(0)  编辑  收藏  举报