[Machine Learning in Action] -- Chapter 5: Logistic Regression, Complete Annotated Code
Reference blog: see the CSDN post linked in the header comment of the code below.
The complete code is as follows:
import numpy as np
import matplotlib.pyplot as plt
# Reference: https://blog.csdn.net/rujin_shi/article/details/78997271?utm_medium=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase&depth_1-utm_source=distribute.pc_relevant_right.none-task-blog-BlogCommendFromMachineLearnPai2-5.nonecase


def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('../machinelearninginaction/Ch05/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # store the three features X0, X1, X2 (X0 is the constant 1.0 intercept term)
        labelMat.append(float(lineArr[2]))  # store the label
    return dataMat, labelMat


def sigmoid(inX):
    # note: np.exp(-inX) can overflow for large negative inX; see the stable variant after the code
    return 1.0 / (1 + np.exp(-inX))


def gradAscent(dataMatIn, classLabels):  # batch gradient ascent
    dataMatrix = np.mat(dataMatIn)  # convert to a NumPy matrix
    labelMat = np.mat(classLabels).transpose()  # transpose into a column vector
    m, n = np.shape(dataMatrix)  # matrix shape
    alpha = 0.01  # step size toward the target
    maxCycles = 500  # maximum number of iterations
    weights = np.ones((n, 1))  # initialize the parameters to 1
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h  # y - h is the gradient term of the log-likelihood; see the derivation after the code
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def stocGradAscent0(dataMatrix, classLabels):  # stochastic gradient ascent
    m, n = np.shape(dataMatrix)
    alpha = 0.01
    weights = np.ones(n)
    for i in range(m):
        product = dataMatrix[i] * weights  # element-wise product
        sum0 = sum(product)  # sum the elements to get the weighted input
        h = sigmoid(sum0)  # take the sigmoid
        error = classLabels[i] - h  # difference between label and prediction
        weights = weights + alpha * error * dataMatrix[i]  # update the weights
    return weights


def stocGradAscent1(dataMatrix, classLabels, numIter=150):  # improved stochastic gradient ascent
    m, n = np.shape(dataMatrix)  # m rows (samples), n columns (features)
    weights = np.ones(n)
    for j in range(numIter):  # 150 passes by default
        dataIndex = list(range(m))  # range(m) must be wrapped in list() here, otherwise del() below fails in Python 3
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01  # anneal alpha: it shrinks as training progresses but never reaches 0
            randIndex = int(np.random.uniform(0, len(dataIndex)))  # random position in the remaining index list
            sampleIndex = dataIndex[randIndex]  # look up the surviving sample index
            # (the book indexes dataMatrix[randIndex] directly, which over-samples early rows and skips later ones)
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))  # take the sigmoid
            error = classLabels[sampleIndex] - h  # difference between label and prediction
            weights = weights + alpha * error * dataMatrix[sampleIndex]  # update the weights
            del(dataIndex[randIndex])  # remove the used index so each sample is visited once per pass
    return weights


def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))  # take the sigmoid
    if prob > 0.5:  # output class 1 when the probability exceeds 0.5
        return 1.0
    else:  # output class 0 otherwise
        return 0.0


def colicTest():
    frTrain = open("../machinelearninginaction/Ch05/horseColicTraining.txt")  # open the training file
    frTest = open("../machinelearninginaction/Ch05/horseColicTest.txt")  # open the test file
    trainingSet = []; trainingLabels = []  # two empty lists for the training data and training labels
    for line in frTrain.readlines():  # read the training file line by line
        currLine = line.strip().split('\t')  # strip leading/trailing whitespace and split on tabs
        lineArr = []  # holds the current line's features; reset on every pass
        for i in range(21):  # there are 21 features
            lineArr.append(float(currLine[i]))  # append each feature to lineArr
        trainingSet.append(lineArr)  # the 21 features of this line become one training sample
        trainingLabels.append(float(currLine[21]))  # append this line's label
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)  # fit the weights with improved stochastic gradient ascent
    errorCount = 0; numTestVec = 0.0  # error count and number of test vectors
    for line in frTest.readlines():  # read the test file line by line
        numTestVec += 1.0  # count the test samples
        currLine = line.strip().split('\t')  # strip whitespace and split on tabs
        lineArr = []  # holds the 21 features of the current test sample
        for i in range(21):
            lineArr.append(float(currLine[i]))  # append each test feature
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1  # count every misclassification
    errorRate = float(errorCount) / numTestVec  # error rate = misclassifications / number of test samples
    print('the error rate of this is: %f' % errorRate)
    return errorRate


def multiTest():
    numTests = 10; errorSum = 0.0  # number of test runs and the running sum of their error rates
    for k in range(numTests):
        errorSum += colicTest()  # accumulate each run's error rate so it can be averaged
    print('after %d iterations the average error rate is: %f' % (numTests, errorSum / float(numTests)))


def plotBestFit(dataArr, labelMat, weights):
    # labelMat is now an explicit parameter; the original read it from the global scope
    dataArr = np.array(dataArr)
    n = np.shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)  # "111" means a 1x1 grid, first subplot; "234" would mean a 2x3 grid, fourth subplot
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='^')
    # s is the marker size; (xcord1, ycord1) are the X1, X2 values of the class-1 samples
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # (xcord2, ycord2) are the X1, X2 values of the class-0 samples
    x = np.arange(-3.0, 3.0, 0.1)  # x-axis values
    y = (-weights[0] - weights[1] * x) / weights[2]
    # setting z = w0*x0 + w1*x1 + w2*x2 = 0 with x0 = 1 and solving for x2 gives this line;
    # z = 0 means sigmoid(z) = 0.5, the dividing point between class 0 and class 1,
    # so this relation between X1 and X2 is the decision boundary
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()


if __name__ == '__main__':
    dataArr, labelMat = loadDataSet()
    weights_0 = gradAscent(dataArr, labelMat)  # batch gradient ascent
    # plotBestFit(dataArr, labelMat, weights_0.getA())
    # getA() converts a matrix back into an ndarray, the inverse of np.mat()
    # print(weights_0)
    weights_1 = stocGradAscent0(np.array(dataArr), labelMat)  # stochastic gradient ascent
    # plotBestFit(dataArr, labelMat, weights_1)
    weights_2 = stocGradAscent1(np.array(dataArr), labelMat)  # improved stochastic gradient ascent
    # plotBestFit(dataArr, labelMat, weights_2)
    multiTest()
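Why is error computed as labelMat - h? The comment in gradAscent asks exactly this, and the answer is standard logistic-regression algebra rather than anything specific to the book. The log-likelihood of the model is

\ell(w) = \sum_{i=1}^{m}\big[y_i \log h_i + (1-y_i)\log(1-h_i)\big], \qquad h_i = \sigma(x_i^\top w).

Using \sigma'(z) = \sigma(z)\big(1-\sigma(z)\big), the gradient collapses to

\nabla_w \ell = \sum_{i=1}^{m}(y_i - h_i)\,x_i = X^\top(y - h),

so one ascent step is w \leftarrow w + \alpha X^\top(y - h), which is exactly weights = weights + alpha * dataMatrix.transpose() * error. The stochastic versions apply the same update one sample at a time.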
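A practical caveat on sigmoid: 1.0/(1 + np.exp(-inX)) overflows for large negative inX, so NumPy may print a RuntimeWarning once the weighted sums grow large (the returned value is still correct, since 1/(1 + inf) evaluates to 0). A minimal numerically stable drop-in, a sketch of my own rather than the book's code, assuming a plain NumPy array return value is acceptable:

import numpy as np

def sigmoid(inX):
    # Equivalent to 1/(1 + exp(-inX)), but exp() only ever sees a
    # non-positive argument, so it can never overflow:
    #   x >= 0:  1 / (1 + exp(-x))
    #   x <  0:  exp(x) / (1 + exp(x))
    inX = np.asarray(inX, dtype=float)
    e = np.exp(-np.abs(inX))  # always in (0, 1]
    return np.where(inX >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

np.where evaluates both branches, but here both are overflow-safe.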
The output of a sample run is as follows:
the error rate of this is: 0.402985
the error rate of this is: 0.432836
the error rate of this is: 0.328358
the error rate of this is: 0.373134
the error rate of this is: 0.268657
the error rate of this is: 0.358209
the error rate of this is: 0.343284
the error rate of this is: 0.477612
the error rate of this is: 0.268657
the error rate of this is: 0.298507
after 10 iterations the average error rate is: 0.355224
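The per-run error rates differ because stocGradAscent1 draws its samples through np.random.uniform; an average near 0.35 is in line with the roughly 35% the chapter reports, but exact numbers vary between runs. To make a run reproducible, you can seed NumPy's global generator before calling multiTest(); a one-line sketch (my addition, seed value arbitrary):

import numpy as np

np.random.seed(0)  # fixes the random sample order, and hence the error rates
multiTest()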