【机器学习实战】第六章--支持向量机

  1 import numpy as np
  2 import os
  3 
  4 
  5 class optStruct:
  6     # 建立一个数据结构来保存所有重要的值,仅包含__init__方法,该方法可以实现其成员变量的填充
  7     def __init__(self, dataMatIn, classLabels, C, toler, kTup):
  8         # kTup是一个包含核函数信息的元组
  9         self.X = dataMatIn
 10         self.labelMat = classLabels
 11         self.C = C
 12         self.tol = toler
 13         self.m = np.shape(dataMatIn)[0]
 14         self.alphas = np.mat(np.zeros((self.m, 1)))
 15         self.b = 0
 16         # eCache的第一列给出的是eCache是否有效的标志位,第二列给出的是实际的E值
 17         self.eCache = np.mat(np.zeros((self.m, 2)))
 18         self.K = np.mat(np.zeros((self.m, self.m)))
 19         for i in range(self.m):
 20             self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)
 21 
 22 
 23 def calcEk(oS, k):
 24     # 计算E值并返回
 25     # fXk = float(np.multiply(oS.alphas, oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b
 26     fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
 27     Ek = fXk - float(oS.labelMat[k])
 28     return Ek
 29 
 30 
 31 def selectJ(i, oS, Ei):
 32     # 用于选择第二个alpha或者说内循环的alpha值,这里的目标是选择合适的第二个alpha值以保证在每次优化中采用最大步长
 33     # 该函数的误差值与第一个alpha值Ei和下标i有关
 34     maxK = -1
 35     maxDeltaE = 0
 36     Ej = 0
 37     oS.eCache[i] = [1, Ei]
 38     validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0]  # 构建一个非零表。返回的是非零E值所对应的alpha值
 39     # np.nonzero函数是numpy中用于得到数组array中非零元素的位置(数组索引)的函数。
 40     if len(validEcacheList) > 1:
 41         for k in validEcacheList:
 42             if k == i:
 43                 continue
 44             Ek = calcEk(oS, k)
 45             deltaE = abs(Ei - Ek)
 46             if deltaE > maxDeltaE:
 47                 maxK = k
 48                 maxDeltaE = deltaE
 49                 Ej = Ek
 50                 # 选择具有最大步长的j
 51         return maxK, Ej
 52     else:
 53         j = selectJrand(i, oS.m)
 54         Ej = calcEk(oS, j)
 55     return j, Ej
 56 
 57 
 58 def updateEk(oS, k):
 59     # 计算误差值并存入缓存当中。在对alpha值进行优化之后会用到这个值
 60     Ek = calcEk(oS, k)
 61     oS.eCache[k] = [1, Ek]
 62 
 63 
 64 def innerL(i, oS):
 65     Ei = calcEk(oS, i)
 66     if ((oS.labelMat[i]*Ei<-oS.tol) and (oS.alphas[i]<oS.C)) or ((oS.labelMat[i]*Ei>oS.tol) and (oS.alphas[i]>0)):
 67         j, Ej = selectJ(i, oS, Ei)
 68         alphaIold = oS.alphas[i].copy()
 69         alphaJold = oS.alphas[j].copy()
 70         if oS.labelMat[i] != oS.labelMat[j]:
 71             L = max(0, oS.alphas[j] - oS.alphas[i])
 72             H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
 73         else:
 74             L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
 75             H = min(oS.C, oS.alphas[j] + oS.alphas[i])
 76         if L == H:
 77             print('L == H')
 78             return 0
 79         # eta = 2.0*oS.X[i,:]*oS.X[j,:].T-oS.X[i,:]*oS.X[i,:].T-oS.X[j,:]*oS.X[j,:].T
 80         eta = 2.0*oS.K[i,j]-oS.K[i,i]-oS.K[j,j]
 81         if eta >= 0:
 82             print('eta>=0')
 83             return 0
 84         oS.alphas[j] -= oS.labelMat[j]*(Ei-Ej)/eta
 85         oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
 86         updateEk(oS,j)
 87         if abs(oS.alphas[j] - alphaJold) < 0.00001:
 88             print('j not moving enough')
 89             return 0
 90         oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i]*(alphaJold - oS.alphas[j])
 91         updateEk(oS, i)
 92         # b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T-oS.labelMat[j]*\
 93              # (oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T
 94         # b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T-oS.labelMat[j]*\
 95              # (oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T
 96         b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - oS.labelMat[j] * \
 97             (oS.alphas[j]-alphaJold)*oS.K[i,j]
 98         b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]-oS.labelMat[j]*\
 99             (oS.alphas[j]-alphaJold)*oS.K[j,j]
100         if (0<oS.alphas[i]) and (oS.C>oS.alphas[i]):
101             oS.b = b1
102         elif (0<oS.alphas[j]) and (oS.C>oS.alphas[j]):
103             oS.b = b2
104         else:
105             oS.b = (b1+b2)/2.0
106         return 1
107     else:
108         return 0
109 
110 
111 def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
112     # 实例化对象oS,用来容纳所有数据
113     oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(),C,toler, kTup)
114     iter = 0
115     entireSet = True
116     alphaPairsChanged = 0
117     while iter < maxIter and (alphaPairsChanged >0 or entireSet):
118         # 当迭代次数超过指定的最大值,或者遍历整个集合都未对任意alpha对进行修改时,就退出循环
119         alphaPairsChanged = 0
120         if entireSet:
121             for i in range(oS.m):
122                 alphaPairsChanged += innerL(i, oS)
123                 print('fullSet, iter: %d i: %d, pairs changed %d'%(iter, i, alphaPairsChanged))
124             iter += 1
125         else:
126             nonBoundsIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
127             # matrix.A是将矩阵转换为ndarray
128             # >>> type(c)
129             # <class 'numpy.matrix'>
130             # >>> type(c.A)
131             # <class 'numpy.ndarray'>
132             for i in nonBoundsIs:
133                 print('non-bound,iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
134             iter += 1
135         if entireSet:
136             entireSet = False
137         elif alphaPairsChanged == 0:
138             entireSet = True
139         print('iteration number: %d' % iter)
140     return oS.b, oS.alphas
141 
142 
143 def clacWs(alphas, dataArr, classLabels):
144     X = np.mat(dataArr)
145     labelMat = np.mat(classLabels).transpose()
146     m, n = np.shape(X)
147     w = np.zeros((n, 1))
148     for i in range(m):
149         w += np.multiply(alphas[i] * labelMat[i], X[i,:].T)
150     return w
151 
152 
153 def kernelTrans(X, A, kTup):
154     '''
155 
156     :param X: 所有数据集
157     :param A: 数据集中的一行
158     :param kTup: 元组kTup给出的是核函数的信息,元组的第一个参数是描述所用核函数类型的一个字符串,
159                 其他两个参数则都是核函数可能需要的可选参数
160     :return:
161     '''
162     m, n = np.shape(X)
163     K = np.mat(np.zeros((m, 1)))
164     if kTup[0] == 'lin':
165         # 线性核函数
166         K = X * A.T
167     elif kTup[0] == 'rbf':
168         # 径向基核函数
169         for j in range(m):
170             deltaRow = X[j, :] - A
171             K[j] = deltaRow * deltaRow.T
172         K = np.exp(K / (-1*kTup[1]**2))
173     else:
174         raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
175     return K
176 
177 
178 def testRbf(k1 = 1.3):
179     '''
180 
181     :param k1: 高斯径向基函数中的一个用户定义变量
182     :return:
183     '''
184     # 读入数据集
185     dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt")
186     # 运行Platt Smo算法,核函数类型为rbf
187     b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
188     # 建立数据矩阵副本
189     dataMat = np.mat(dataArr)
190     labelMat = np.mat(labelArr).transpose()
191     # 找出非零的alpha值
192     svInd = np.nonzero(alphas.A > 0)[0]
193     # 得到所需要的支持向量
194     sVs = dataMat[svInd]
195     # 得到类别标签值
196     labelSV = labelMat[svInd]
197     print('there are %d Support Vectors' % np.shape(sVs)[0])
198     m, n = np.shape(dataMat)
199     errorCount = 0
200     for i in range(m):
201         # 利用核函数进行分类
202         kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))  # 得到转换后的数据
203         predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
204         if np.sign(predict) != np.sign(labelArr[i]):
205             # sign()是Python的Numpy中的取数字符号(数字前的正负号)的函数。大于0时取1,小于0时取-1,等于0时取0
206             errorCount += 1
207     print('the training error rate is: %f' % (float(errorCount)/m))
208     dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF2.txt")
209     errorCount = 0
210     dataMat = np.mat(dataArr)
211     labelMat = np.mat(labelArr).transpose()
212     m, n = np.shape(dataMat)
213     for i in range(m):
214         kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))
215         predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
216         if np.sign(predict) != np.sign(labelArr[i]):
217             errorCount += 1
218     print('the test error rate is: %f' % (float(errorCount)/m))
219 
220 
221 '''
222 ***************************************************************************
223 '''
224 # 手写数字识别
225 def img2vector(filename):
226     returnVect = np.zeros((1, 1024))  # 创建1行1024列的零数组
227     fr = open(filename)
228     for i in range(32):
229         lineStr = fr.readline()
230         for j in range(32):
231             returnVect[0, 32*i+j] = int(lineStr[j])  # 将32x32的图片转换成1x1024的行向量
232     return returnVect
233 
234 
235 def loadImages(dirName):
236     hwLabels = []
237     trainingFileList = os.listdir(dirName)  # 返回包含文件夹下的所有文件名的列表
238     m = len(trainingFileList)  # 获取所有文件的个数
239     trainingMat = np.zeros((m, 1024))
240     for i in range(m):
241         fileNameStr = trainingFileList[i]
242         fileStr = fileNameStr.split('.')[0]
243         classNumStr = int(fileStr.split('_')[0])
244         # 二分类,只识别9
245         if classNumStr == 9:
246             hwLabels.append(-1)
247         else:
248             hwLabels.append(1)
249         trainingMat[i, :] = img2vector('%s/%s'%(dirName, fileNameStr))
250     return trainingMat, hwLabels
251 
252 
253 def testDigits(kTup=('rbf', 10)):
254     dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/trainingDigits')
255     b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
256     dataMat = np.mat(dataArr)
257     labelMat = np.mat(labelArr).transpose()
258     svInd = np.nonzero(alphas.A>0)[0]
259     sVs = dataMat[svInd]
260     labelSV = labelMat[svInd]
261     print('there are %d Support Vectors' % np.shape(sVs)[0])
262     m, n = np.shape(dataMat)
263     errorCount = 0
264     for i in range(m):
265         kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
266         predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
267         if np.sign(predict) != np.sign(labelArr[i]):
268             errorCount += 1
269     print('the training error rate is %f' % (float(errorCount)/m))
270     dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/testDigits')
271     dataMat = np.mat(dataArr)
272     labelMat = np.mat(labelArr).transpose()
273     m, n = np.shape(dataMat)
274 
275     for i in range(m):
276         kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
277         predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b
278         if np.sign(predict) != np.sign(labelArr[i]):
279             errorCount += 1
280     print('the test error rate is %f' % (float(errorCount)/m))
281 
282 
283 '''
284 ***************************************************************************
285 '''
286 
287 
288 def loadDataSet(filename):
289     # 该函数打开文件并对其进行逐行解析,从而得到每行的类标签和整个数据矩阵
290     dataMat = []
291     labelMat = []
292     fr = open(filename)
293     for line in fr.readlines():
294         lineArr = line.strip().split('\t')
295         dataMat.append([float(lineArr[0]), float(lineArr[1])])
296         labelMat.append(float(lineArr[2]))
297     return dataMat,labelMat
298 
299 
300 def selectJrand(i, m):
301     '''
302 
303     :param i:第一个alpha的下标
304     :param m: 所有alpha的数目
305     :return:
306     '''
307     j = i
308     while j == i:
309         j = int(np.random.uniform(0, m))  # 随机选择不等于i的j值
310     return j
311 
312 
313 def clipAlpha(aj, H, L):
314     # 此函数用于调整大于H或小于L的alpha值
315     if aj > H:
316         aj = H
317     if aj < L:
318         aj = L
319     return aj
320 
321 
322 '''
323 SMO函数的伪代码大致如下:
324     创建一个alpha向量并将其初始化为0向量
325     当迭代次数小于最大迭代次数时(外循环):
326         对数据集中的每个数据向量(内循环):
327             如果该数据向量可以被优化:
328                 随机选择另外一个数据向量
329                 同时优化这两个向量
330                 如果两个向量都不能被优化,退出内循环
331         如果所有向量都没被优化,增加迭代数目,继续下一次循环
332 
333 '''
334 
335 
336 def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
337     '''
338 
339     :param dataMatIn:数据集
340     :param classLabels: 类别标签
341     :param C: 常数C
342     :param toler: 容错率
343     :param maxIter: 最大循环次数
344     :return:
345     '''
346     dataMatrix = np.mat(dataMatIn)  # 创建矩阵
347     labelMat = np.mat(classLabels).transpose()  # 矩阵转置,变成列向量
348     b = 0
349     m, n = np.shape(dataMatrix)  # 获取行和列
350     alphas = np.mat(np.zeros((m,1)))
351     iter = 0
352 
353     while iter < maxIter:
354         alphaPairsChanged = 0  # 用于记录alpha是否已经进行优化
355         for i in range(m):
356             # fXi是预测的类别
357             fXi = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
358             # np.multiply为点乘,即对应元素相乘
359             Ei = fXi - float(labelMat[i])  # 预测结果和真实结果比对,计算误差Ei
360             if((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > toler) and (alphas[i]>0)):
361                 # 不管是正间隔还是负间隔都会被测试,也要同时检查alpha的值,以保证其不能等于0或C
362                 j = selectJrand(i, m)  # 随机选择第二个alpha值,即alpha[j]
363                 fXj = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
364                 Ej = fXj - float(labelMat[j])  # 计算第二个alpha的误差
365                 alphaIold = alphas[i].copy()
366                 alphaJold = alphas[j].copy()
367                 if labelMat[i] != labelMat[j]:
368                     L = max(0, alphas[j] - alphas[i])
369                     H = min(C, C + alphas[j] - alphas[i])
370                 else:
371                     L = max(0, alphas[j] + alphas[i] - C)
372                     H = min(C, alphas[j] + alphas[i])
373                     # 以上将alpha[j]调整到0到C之间
374                 if L == H:
375                     print("L == H")
376                     continue  # 本次循环结束直接进行下一次for循环
377                 eta = 2.0 * dataMatrix[i, :]*dataMatrix[j, :].T - dataMatrix[i, :]*dataMatrix[i,:].T - \
378                       dataMatrix[j,:]*dataMatrix[j,:].T
379                 # eta是alpha[j]的最优修改量
380                 if eta >= 0:
381                     print(eta >= 0)
382                     continue
383                 alphas[j] -= labelMat[j]*(Ei - Ej)/eta
384                 alphas[j] = clipAlpha(alphas[j], H, L)  # 计算新的alpha[j]
385                 if abs(alphas[j] - alphaJold) < 0.00001:
386                     # 检查alpha[j]是否有轻微改变,如果是的话就退出for循环
387                     print("j not moving enough")
388                     continue
389                 # 对alpha[i]进行修改,修改量与alpha[j]相同,但反向相反
390                 alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
391                 # 给两个alpha值设置常数项b
392                 b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[i,:].T - \
393                      labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
394                 b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[j,:].T - \
395                      labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
396                 if (0 < alphas[i]) and (C > alphas[i]):
397                     b = b1
398                 elif (0 < alphas[j]) and (C > alphas[j]):
399                     b = b2
400                 else:
401                     # 如果程序执行到for循环的最后一行都不执行continue语句,那么就已经成功地改变了一对alpha,增加alphaPairsChanged
402                     b = (b1 + b2) / 2.0
403                     alphaPairsChanged += 1
404                     print('iter: %d i: %d, pairs changed %d'%(iter, i, alphaPairsChanged))
405         # 检查alpha值是否做了更新,如果有更新则将iter设为0后继续运行程序。
406         # 只有在所有数据集上遍历maxIter次,且不再发生任何alpha修改之后,程序才会停止并退出while循环
407         if alphaPairsChanged == 0:
408             iter += 1
409         else:
410             iter = 0
411         print('iteration number: %d' % iter)
412     return b, alphas
413 
414 
415 if __name__ == "__main__":
416     # testRbf()
417     # testDigits(('lin', 0))
418     testDigits(('rbf', 10))
419 
420     '''
421     # Python中Numpy mat的使用 https://www.cnblogs.com/lfri/p/10561914.html
422     dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt")
423     # print(dataArr)
424     print(labelArr)
425     # b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
426     b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40)
427     # print(b)
428     # print(alphas[alphas>0])  # 数组过滤,只对nunpy类型有用
429     print(np.shape(alphas[alphas>0]))  # 支持向量的个数
430     for i in range(100):
431         # 输出支持向量
432         if alphas[i] > 0.0:
433             print(dataArr[i], labelArr[i])
434     ws = clacWs(alphas, dataArr, labelArr)
435     print(ws)
436     dataMat = np.mat(dataArr)
437     errorCount = 0
438     for k in range(np.shape(dataArr)[0]):
439         res = dataMat[k] * np.mat(ws) + b
440         if res < 0:
441             res = -1
442         else:
443             res = 1
444         if res != labelArr[k]:
445             errorCount += 1
446     means = errorCount/(np.shape(dataArr)[0])
447     print('error rate= %.5f'%means)
448     '''
1 there are 328 Support Vectors
2 the training error rate is 0.000000
3 the test error rate is 0.006342

 

posted @ 2020-07-15 11:11  DJames23  阅读(349)  评论(0编辑  收藏  举报