k-近邻算法
本文是《机器学习实战》第二章的笔记
构造数据集
from numpy import *
import operator
def createDataSet():
    """Build the four-point toy data set used to try out the kNN classifier.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 NumPy array with one
        sample per row; ``labels`` is a list holding the class label of each
        row of ``group`` in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
程序清单2-1 k-近邻算法
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: Input vector to classify; it must have as many features as one
            row of ``dataSet``.
        dataSet: NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is larger than the number of rows in ``dataSet``.
    """
    # Broadcasting subtracts inX from every row, so no tile() copy is needed.
    diffMat = dataSet - inX
    sqDiffMat = diffMat ** 2                    # element-wise square
    sqDistances = sqDiffMat.sum(axis=1)         # one sum per row
    distances = sqDistances ** 0.5              # Euclidean distance
    sortedDistIndicies = distances.argsort()    # row indices, nearest first

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
函数调用
>>> import kNN
>>> kNN.classify0([0,0], group, labels, 3)
B
classify0(inX, dataSet, labels, k)参数说明:
inX:用于分类的输入向量
dataSet:输入的训练样本集
labels:标签向量
k:选择最近邻居的数目
numpy.shape
numpy.tile(A, reps)
作用是重复某个数组。
A和reps都是array_like的。
A可以是array,list,tuple,dict,matrix以及基本类型int,float,string,bool
reps可以是tuple,list,dict,array,int,bool,但不能是float,string,matrix
这里的diffMat是跟dataSet大小一样的矩阵,每一行是inX。
matrix.sum(axis=None)
对矩阵求和,axis默认为None,表示求矩阵所有值的和
axis=0,按列求和
axis=1,按行求和
matrix.argsort(axis=-1, kind='quicksort', order=None)
从小到大排序,返回的是从小到大值的索引位置。[0]是最小值的索引位置
文档url:https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
默认按行排序,axis=0按列排序,axis=1按行排序
dict.get(key ,default=None)
key是在字典中查找的键
default:如果指定的键不存在,返回该默认值
dict[key] = dict.get(key, 0) + 1
优美的写法
sorted()
程序清单2-2 将文本记录转换为NumPy的解析程序
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and a label list.

    Every non-blank line is split on tabs. The first three fields become one
    row of the returned matrix and the last field is converted with ``int``
    and appended to the label list. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``(n, 3)`` float NumPy
        array and a list of ``n`` integer labels, where ``n`` is the number
        of non-blank lines.

    Raises:
        ValueError: If a feature field is not numeric or the last field of a
            line is not an integer.
    """
    # The with-block closes the file even when parsing fails further down.
    with open(filename) as fr:
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
numpy.zeros(shape, dtype=float, order='C')
官方文档:https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
生成大小为shape的全零矩阵
numpy.ndarray[index,:]
numpy.ndarray下标操作,以逗号为分隔符
官方文档:https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
这里是取第index行的数据(逗号前是行下标,逗号后的“:”表示该行的所有列)
arr[i:j, :] 取第i行到第j行的数据
arr[:, 0] 取第0列的数据,以一维数组的形式返回
arr[:, :1] 取第0列的数据,以列的形式(二维数组)返回
程序执行命令
>>> reload(kNN)
>>> datingDataMat, datingLabels = kNN.file2matrix('datingTestSet.txt')
>>> datingDataMat
array([[]
[]])
>>> datingLabels[0:20]
[]
reload(module)
重新载入之前载入的模块
使用Matplotlib创建散点图
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # 第1列,第2列
>>> plt.show()
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels),
15.0*array(datingLabels))
Matplotlib 官方文档:http://matplotlib.org/api/index.html
fig.add_subplot(321)
增加子图,将画布分割为3行2列,图像画在从左到右从上到下的第1块
ax.scatter()
matplotlib.pyplot.scatter(x, y, s=None, c=None, marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, edgecolors=None, hold=None, data=None, **kwargs)
程序清单2-3 归一化特征值
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is transformed as ``(value - column_min) / column_range``.
    A column whose values are all equal has range 0; its normalised values
    are set to 0 instead of dividing by zero.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple: the normalised array,
        the per-column ``max - min`` and the per-column minimum. ``ranges``
        is returned unmodified, so it may contain zeros.
    """
    minVals = dataSet.min(0)   # argument 0 -> minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns become 0, not NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
dataSet.min(0)
这里参数为0的含义是,按列求最小值,minVals的大小在此例中为1*3
numpy.tile(A, reps)
见上文。
程序执行命令
>>> reload(kNN)
>>> normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
>>> normMat
array([[]
[]])
>>> ranges
array([*, *, *])
>>> minVals
array([*, *, *])
程序清单2-4 分类器针对约会网站的测试代码
def datingClassTest():
    """Estimate the classifier's error rate on a hold-out part of the dating data.

    Reads 'datingTestSet.txt', normalises the features, uses the first 10% of
    the rows as test vectors and the remaining rows as training data, then
    classifies each test vector with k=3. Prints every prediction next to the
    real label, followed by the overall error rate.

    Raises:
        ZeroDivisionError: If the file has so few rows that the test set is
            empty.
    """
    hoRatio = 0.10  # fraction of the rows held out as the test set
    # NOTE(review): classifyPerson reads 'datingTestSet2.txt' while this
    # function reads 'datingTestSet.txt' -- confirm that this file stores
    # integer labels, because file2matrix converts the last field with int().
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test vectors; the rest are training data.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
程序执行命令
>>> reload(kNN)
>>> kNN.datingClassTest()
the classifier came back with: 1, the real answer is: 1
......
the total error rate is: 0.024000
程序清单2-5 约会网站预测函数
def classifyPerson():
    """Ask for three feature values and print the predicted category.

    Prompts for the three feature values, loads and normalises the training
    data from 'datingTestSet2.txt', normalises the entered vector with the
    same minimum and range, classifies it with k=3 and prints the matching
    entry of ``resultList``.

    Raises:
        ValueError: If an entered value cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists only on Python 2; input() is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Assumes labels are the integers 1..3, hence the -1 offset -- TODO confirm.
    # "%s %s" reproduces the space the Python 2 print statement put between
    # its two comma-separated items.
    print("%s %s" % ("You will probably like this person: ",
                     resultList[classifierResult - 1]))
raw_input()
获取用户输入
程序执行命令
>>> reload(kNN)
>>> kNN.classifyPerson()
...
图像转向量
def img2vector(filename):
    """Read a 32x32 text image into a 1x1024 NumPy row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row by row, so the character at line ``i``, column
    ``j`` ends up at index ``32*i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character cannot be converted with ``int``.
    """
    returnVect = zeros((1, 1024))
    # The with-block closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
程序执行命令
>>> reload(kNN)
>>> testVector = kNN.img2vector('testDigits/0_13.txt')
>>> testVector[0,0:31]
array([......])
程序清单2-6 手写数字识别系统的测试代码
from os import listdir
def handwritingClassTest():
    """Measure the kNN error rate on the handwritten-digit text images.

    Builds the training matrix from every file in 'trainingDigits', then
    classifies every file in 'testDigits' with k=3. The real digit of a file
    is the integer before the first underscore in its name. Prints each
    prediction next to the real digit, then the number of errors and the
    error rate.

    Raises:
        ValueError: If a file name does not start with an integer followed
            by an underscore.
        ZeroDivisionError: If 'testDigits' is empty.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]          # drop the extension
        classNumStr = int(fileStr.split('_')[0])     # digit precedes the '_'
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
os.listdir(dir)
列出dir下的所有条目(包括文件和子目录,但不含'.'和'..'),返回list
程序执行命令
>>> reload(kNN)
>>> kNN.handwritingClassTest()
...