【Machine Learning in Action】Chapter 6 -- Support Vector Machines
```python
import numpy as np
import os


class optStruct:
    # A data structure that holds all the important values; it only contains an
    # __init__ method, which fills in its member variables.
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        # kTup is a tuple describing the kernel function
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = np.shape(dataMatIn)[0]
        self.alphas = np.mat(np.zeros((self.m, 1)))
        self.b = 0
        # The first column of eCache is a flag marking whether the cached value is valid;
        # the second column holds the actual E value.
        self.eCache = np.mat(np.zeros((self.m, 2)))
        self.K = np.mat(np.zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)


def calcEk(oS, k):
    # Compute the error E for index k and return it
    # fXk = float(np.multiply(oS.alphas, oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b
    fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek


def selectJ(i, oS, Ei):
    # Select the second alpha (the inner-loop alpha). The goal is to pick the second
    # alpha so that each optimization step takes the maximum possible step size.
    # The error used here depends on the first alpha's error Ei and its index i.
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]  # indices whose cached E values are valid (non-zero flag)
    # np.nonzero returns the indices of the non-zero elements of an array.
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        # Choose the j that gives the maximum step size
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
        return j, Ej


def updateEk(oS, k):
    # Compute the error and store it in the cache; this is used after an alpha has been optimized
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]


def innerL(i, oS):
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L == H')
            return 0
        # eta = 2.0*oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
        eta = 2.0*oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print('eta>=0')
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)
        if abs(oS.alphas[j] - alphaJold) < 0.00001:
            print('j not moving enough')
            return 0
        oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS, i)
        # b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j]*\
        #      (oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
        # b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j]*\
        #      (oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i, i] - oS.labelMat[j] * \
             (oS.alphas[j] - alphaJold)*oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i, j] - oS.labelMat[j] * \
             (oS.alphas[j] - alphaJold)*oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0


def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    # Instantiate an optStruct object oS to hold all the data
    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0
    while iter < maxIter and (alphaPairsChanged > 0 or entireSet):
        # Exit the loop once the iteration count exceeds the specified maximum, or when a
        # full pass over the set changes no alpha pair.
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print('fullSet, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        else:
            nonBoundsIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            # matrix.A converts a matrix to an ndarray:
            # >>> type(c)
            # <class 'numpy.matrix'>
            # >>> type(c.A)
            # <class 'numpy.ndarray'>
            for i in nonBoundsIs:
                alphaPairsChanged += innerL(i, oS)
                print('non-bound, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print('iteration number: %d' % iter)
    return oS.b, oS.alphas


def clacWs(alphas, dataArr, classLabels):
    # Compute the weight vector w; only the support vectors (non-zero alphas) contribute
    X = np.mat(dataArr)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(X)
    w = np.zeros((n, 1))
    for i in range(m):
        w += np.multiply(alphas[i] * labelMat[i], X[i, :].T)
    return w


def kernelTrans(X, A, kTup):
    '''
    :param X: the full data set
    :param A: a single row of the data set
    :param kTup: a tuple describing the kernel; the first element is a string naming the
                 kernel type, and the remaining elements are optional kernel parameters
    :return:
    '''
    m, n = np.shape(X)
    K = np.mat(np.zeros((m, 1)))
    if kTup[0] == 'lin':
        # Linear kernel
        K = X * A.T
    elif kTup[0] == 'rbf':
        # Radial basis function (Gaussian) kernel
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        K = np.exp(K / (-1*kTup[1]**2))
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K


def testRbf(k1=1.3):
    '''
    :param k1: a user-defined parameter of the Gaussian radial basis function
    :return:
    '''
    # Load the data set
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt")
    # Run the Platt SMO algorithm with an RBF kernel
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    # Make matrix copies of the data
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    # Find the non-zero alphas
    svInd = np.nonzero(alphas.A > 0)[0]
    # Get the corresponding support vectors
    sVs = dataMat[svInd]
    # Get the class labels of the support vectors
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        # Classify using the kernel
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))  # kernel-transformed data
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            # np.sign() returns the sign of a number: 1 if positive, -1 if negative, 0 if zero
            errorCount += 1
    print('the training error rate is: %f' % (float(errorCount)/m))
    dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF2.txt")
    errorCount = 0
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount)/m))


'''
***************************************************************************
'''
# Handwritten digit recognition
def img2vector(filename):
    returnVect = np.zeros((1, 1024))  # create a 1 x 1024 zero array
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32*i+j] = int(lineStr[j])  # flatten the 32x32 image into a 1x1024 row vector
    return returnVect


def loadImages(dirName):
    hwLabels = []
    trainingFileList = os.listdir(dirName)  # list of all file names in the directory
    m = len(trainingFileList)  # number of files
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # Binary classification: only recognize the digit 9
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels


def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % np.shape(sVs)[0])
    m, n = np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is %f' % (float(errorCount)/m))
    dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/testDigits')
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(dataMat)
    errorCount = 0  # reset the error count before evaluating on the test set
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is %f' % (float(errorCount)/m))


'''
***************************************************************************
'''


def loadDataSet(filename):
    # Open the file and parse it line by line to get the class label and the full data matrix
    dataMat = []
    labelMat = []
    fr = open(filename)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat


def selectJrand(i, m):
    '''
    :param i: index of the first alpha
    :param m: total number of alphas
    :return:
    '''
    j = i
    while j == i:
        j = int(np.random.uniform(0, m))  # randomly pick a j that is not equal to i
    return j


def clipAlpha(aj, H, L):
    # Clip alpha values that are greater than H or less than L
    if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj


'''
Pseudocode for the simplified SMO routine:
Create an alpha vector and initialize it to all zeros
While the number of iterations is less than the maximum (outer loop):
    For every data vector in the data set (inner loop):
        If the data vector can be optimized:
            Randomly select another data vector
            Optimize the two vectors together
            If the two vectors cannot be optimized, exit the inner loop
    If no vectors were optimized, increment the iteration count and continue
'''


def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    '''
    :param dataMatIn: data set
    :param classLabels: class labels
    :param C: the constant C
    :param toler: error tolerance
    :param maxIter: maximum number of passes
    :return:
    '''
    dataMatrix = np.mat(dataMatIn)  # build the data matrix
    labelMat = np.mat(classLabels).transpose()  # transpose the labels into a column vector
    b = 0
    m, n = np.shape(dataMatrix)  # number of rows and columns
    alphas = np.mat(np.zeros((m, 1)))
    iter = 0

    while iter < maxIter:
        alphaPairsChanged = 0  # records whether any alpha has been optimized in this pass
        for i in range(m):
            # fXi is the predicted class
            fXi = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i, :].T)) + b
            # np.multiply is element-wise multiplication
            Ei = fXi - float(labelMat[i])  # compare the prediction with the true label to get the error Ei
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # Both positive and negative margins are tested; also check that alpha is not already 0 or C
                j = selectJrand(i, m)  # randomly select the second alpha, alpha[j]
                fXj = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])  # error of the second alpha
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                # The bounds above keep alpha[j] between 0 and C
                if L == H:
                    print("L == H")
                    continue  # skip the rest of this iteration and move on to the next i
                eta = 2.0 * dataMatrix[i, :]*dataMatrix[j, :].T - dataMatrix[i, :]*dataMatrix[i, :].T - \
                      dataMatrix[j, :]*dataMatrix[j, :].T
                # eta is the optimal amount to change alpha[j]
                if eta >= 0:
                    print("eta>=0")
                    continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j], H, L)  # compute the new alpha[j]
                if abs(alphas[j] - alphaJold) < 0.00001:
                    # If alpha[j] has barely changed, skip to the next i
                    print("j not moving enough")
                    continue
                # Change alpha[i] by the same amount as alpha[j], but in the opposite direction
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # Set the constant term b for the two updated alphas
                b1 = b - Ei - labelMat[i]*(alphas[i] - alphaIold)*dataMatrix[i, :]*dataMatrix[i, :].T - \
                     labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[i, :]*dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i]*(alphas[i] - alphaIold)*dataMatrix[i, :]*dataMatrix[j, :].T - \
                     labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[j, :]*dataMatrix[j, :].T
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                # Reaching this point without hitting a continue means a pair of alphas was
                # successfully changed, so increment alphaPairsChanged
                alphaPairsChanged += 1
                print('iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
        # Check whether any alpha was updated; if so, reset iter to 0 and keep going.
        # The loop only stops after maxIter consecutive passes over the data with no alpha changes.
        if alphaPairsChanged == 0:
            iter += 1
        else:
            iter = 0
        print('iteration number: %d' % iter)
    return b, alphas


if __name__ == "__main__":
    # testRbf()
    # testDigits(('lin', 0))
    testDigits(('rbf', 10))

'''
# Using numpy.mat in Python: https://www.cnblogs.com/lfri/p/10561914.html
dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
# print(dataArr)
print(labelArr)
# b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
# print(b)
# print(alphas[alphas>0])  # array filtering, only works on numpy types
print(np.shape(alphas[alphas>0]))  # number of support vectors
for i in range(100):
    # print the support vectors
    if alphas[i] > 0.0:
        print(dataArr[i], labelArr[i])
ws = clacWs(alphas, dataArr, labelArr)
print(ws)
dataMat = np.mat(dataArr)
errorCount = 0
for k in range(np.shape(dataArr)[0]):
    res = dataMat[k] * np.mat(ws) + b
    if res < 0:
        res = -1
    else:
        res = 1
    if res != labelArr[k]:
        errorCount += 1
means = errorCount/(np.shape(dataArr)[0])
print('error rate= %.5f' % means)
'''
```
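For reference, here is a minimal sketch of how the model returned by `smoP` could be wrapped into a reusable prediction function, mirroring the classification loop inside `testRbf`. It assumes the functions above (`loadDataSet`, `smoP`, `kernelTrans`) are in scope (for example, pasted into the same file), and that `testSetRBF.txt` is available at the path used earlier; the helper name `rbfPredict` is made up for illustration, not part of the book's code.

```python
import numpy as np

def rbfPredict(dataArr, labelArr, alphas, b, k1, point):
    # Keep only the support vectors (non-zero alphas); they fully determine the decision function
    dataMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    # f(x) = sum_i alpha_i * y_i * K(x_i, x) + b, summed over the support vectors only
    kernelEval = kernelTrans(sVs, np.mat(point), ('rbf', k1))
    return np.sign(kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b)

# Hypothetical usage: train on the RBF toy set, then classify one point
dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt")
b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', 1.3))
print(rbfPredict(dataArr, labelArr, alphas, b, 1.3, dataArr[0]))
```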
Output of `testDigits(('rbf', 10))`:

```
there are 328 Support Vectors
the training error rate is 0.000000
the test error rate is 0.006342
```