机器学习实战——kNN近邻算法
1
#后期有时间我会逐行注释代码
from numpy import *
import operator


def creatDataSet():
    """Return a tiny hard-coded training set for trying out the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is a list with one class name per row of ``group``.
    """
    group = array([[1.0, 1.1],
                   [1.0, 1.0],
                   [0, 0],
                   [0, 0.1]])
    labels = ["A", "A", "B", 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest rows.

    Distance is Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: The sample to classify (one value per column of ``dataSet``).
        dataSet: Array of already-labelled samples, one per row.
        labels: Sequence of labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    # Broadcasting subtracts every row of dataSet from inX; no need to
    # materialise a tiled copy of inX first.
    diffMat = asarray(inX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # argsort() returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # operator.itemgetter(1) selects the vote count of each (label, count)
    # pair, so the pairs end up ordered by descending count.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]


# Parser that turns a tab-separated text file into a NumPy matrix
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    The first three fields of every line are stored as floats; the last
    field of the line is kept, unconverted, as that row's label. Lines that
    are empty after stripping whitespace are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of the
        ``n`` label strings.

    Raises:
        ValueError: If one of the first three fields of a line is not a
            valid float.
    """
    # The with-block guarantees the file handle is closed, even on error.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        records = [text.split('\t') for text in stripped if text]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector


# Feature normalization
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with ``(x - min) / (max - min)``.

    A column whose values are all equal has a range of zero; such a column
    is mapped to all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` - the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0; the true ranges are still
    # what gets returned to the caller.
    safeRanges = where(ranges == 0, 1, ranges)
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# Test code for the classifier on the dating-site data
file1 = 'D:\\Code\\ML\\lianxi\\MLiA_SourceCode\\machinelearninginaction\\Ch02\\datingTestSet.txt'


def datingClassTest(filename=None, hoRatio=0.10, k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The file is loaded with ``file2matrix`` and normalized with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are classified one by one against
    the remaining rows, and each prediction plus the final error rate are
    printed.

    Args:
        filename: Data file to load. ``None`` (the default) means the
            module-level ``file1`` path, looked up at call time.
        hoRatio: Fraction of the rows, taken from the top of the file, that
            is held out for testing.
        k: Number of nearest neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out slice would be empty, in which case no
            error rate can be computed.
    """
    if filename is None:
        filename = file1
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Without this guard the final division raises ZeroDivisionError.
        raise ValueError(
            "hold-out set is empty: %d rows with hoRatio=%s" % (m, hoRatio))

    # The reference rows are the same for every test vector, so slice once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        # classify0(): inX is the single row to classify, dataSet holds the
        # already-labelled rows, labels are the labels of those rows and k
        # is the number of nearest points that vote.
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("The classifier came back with:%s,the real answer is:%s"%(classifierResult,datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("The total error rate is:%f"%(errorCount/float(numTestVecs)))
73
# Prediction function for the dating site
def classifyPerson():
    """Ask for one person's three features and print the predicted verdict.

    Reads three numbers from standard input, loads and normalizes the
    reference data, scales the entered values with the same minimum and
    range, and classifies them with the 3 nearest neighbours. The raw
    predicted label is printed first. It is then converted with ``int()``
    and, minus one, used as an index into the three verdict texts.
    """
    verdicts = ('not at all', 'in small doses', 'in large doses')

    game_pct = float(input('Percentage of time spent playing video games?'))
    flier_miles = float(input("Frequent flier miles earned per year?"))
    ice_cream_liters = float(input("Liters of ice cream consumed per year?"))

    samples, sample_labels = file2matrix('D:\\Code\\ML\\lianxi\\MLiA_SourceCode\\machinelearninginaction\\Ch02\\datingTestSet2.txt')
    scaled_samples, spans, minima = autoNorm(samples)

    # The query vector is ordered miles, game percentage, ice cream.
    raw_query = array([flier_miles, game_pct, ice_cream_liters])
    scaled_query = (raw_query - minima) / spans

    predicted = classify0(scaled_query, scaled_samples, sample_labels, 3)
    print(predicted)

    verdict = verdicts[int(predicted) - 1]
    print("You will probably like this person:%s" % verdict)
测试:
import matplotlib
import matplotlib.pyplot as plt

import kNN

filename = 'D:\\Code\\ML\\lianxi\\MLiA_SourceCode\\machinelearninginaction\\Ch02\\datingTestSet.txt'

# Load the raw data and show the normalized feature matrix.
datingDataMat, datingLabels = kNN.file2matrix(filename)

normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
print(normMat)

# Both functions print their own results and have no return value, so they
# are called directly; wrapping them in print() would only add a "None" line.
kNN.datingClassTest()
kNN.classifyPerson()
运行结果:
[[0.44832535 0.39805139 0.56233353]
[0.15873259 0.34195467 0.98724416]
[0.28542943 0.06892523 0.47449629]
...
[0.29115949 0.50910294 0.51079493]
[0.52711097 0.43665451 0.4290048 ]
[0.47940793 0.3768091 0.78571804]]
The classifier came back with:largeDoses,the real answer is:largeDoses
The classifier came back with:smallDoses,the real answer is:smallDoses
....................................
The classifier came back with:didntLike,the real answer is:didntLike
The classifier came back with:largeDoses,the real answer is:didntLike
The total error rate is:0.050000
Percentage of time spent playing video games?0.13
Frequent flier miles earned per year?520
Liters of ice cream consumed per year?13
You will probably like this person:in small doses
Process finished with exit code 0