k-近邻算法

本文是《机器学习实战》第二章的笔记

构造数据集

from numpy import *
import operator

def createDataSet():
    """Return a four-point toy data set together with its class labels.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array of sample
        coordinates and ``labels`` is the list of class names, one per row.
    """
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    names = list('AABB')  # first two rows are class 'A', last two are 'B'
    return array(points), names

程序清单2-1 k-近邻算法

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: Input vector to classify.
        dataSet: Training samples as an array with one sample per row.
        labels: Sequence holding the label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest rows.
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # Repeat inX once per training row so it can be subtracted element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat**2  # element-wise square, not a matrix product
    sqDistances = sqDiffMat.sum(axis=1)  # sum across each row
    distances = sqDistances**0.5  # Euclidean distance to every training row
    sortedDistIndicies = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # get() with a default counts a label without a prior membership test.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and Python 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

函数调用

>>> import kNN
>>> group, labels = kNN.createDataSet()
>>> kNN.classify0([0,0], group, labels, 3)
'B'

classify0(inX, dataSet, labels, k)参数说明:

    inX:用于分类的输入向量
    dataSet:输入的训练样本集
    labels:标签向量
    k:选择最近邻居的数目

numpy.shape

numpy.tile(A, reps)

作用是重复某个数组。
A和reps都是array_like的。
A可以是array,list,tuple,dict,matrix以及基本类型int,float,string,bool
reps可以是tuple,list,dict,array,int,bool,但不能是float,string,matrix
这里的diffMat是跟dataSet大小一样的矩阵,每一行是inX。

matrix.sum(axis=None)

对矩阵求和,axis默认为None,表示求矩阵所有值的和
                    axis=0,按列求和
                    axis=1,按行求和

matrix.argsort(axis=-1, kind='quicksort', order=None)

从小到大排序,返回的是从小到大值的索引位置。[0]是最小值的索引位置
文档url:https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
        默认按行排序,axis=0按列排序,axis=1按行排序

dict.get(key ,default=None)

key是要在字典中查找的键
default:如果指定的键不存在,则返回该默认值

dict[key] = dict.get(key, 0) + 1

优美的写法

sorted()

程序清单2-2 将文本记录转换为NumPy的解析程序

def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    The first three fields of every line are stored as floats in one row of
    the matrix; the last field is parsed as an integer class label. Lines
    that are empty after stripping whitespace are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an N x 3 float array and a
        list of N integer labels, where N is the number of non-blank lines.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        arrayOLines = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing newline) so they do not break parsing.
    arrayOLines = [line for line in arrayOLines if line]
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # Convert explicitly rather than relying on implicit string casting.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

numpy.zeros(shape, dtype=float, order='C')

    官方文档:https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
    生成大小为shape的全零矩阵

numpy.ndarray[index,:]

    numpy.ndarray下标操作,以逗号为分隔符
    官方文档:https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
    这里是取第index行的数据
    arr[i:j, :]     取第i行到第j-1行的数据(不含第j行)
    arr[:, 0]       取第0列的数据,以一维数组的形式返回
    arr[:, :1]      取第0列的数据,以列(m*1的二维数组)的形式返回

程序执行命令

>>> reload(kNN)
>>> datingDataMat, datingLabels = kNN.file2matrix('datingTestSet.txt')
>>> datingDataMat
array([[]
        []])
>>> datingLabels[0:20]
[]

reload(module)

    重新载入之前载入的模块

使用Matplotlib创建散点图

>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # 第1列,第2列
>>> plt.show()

>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 
            15.0*array(datingLabels))

Matplotlib 官方文档:http://matplotlib.org/api/index.html

fig.add_subplot(321)

    增加子图,将画布分割为3行2列,图像画在从左到右从上到下的第1块

ax.scatter()

    matplotlib.pyplot.scatter(x, y, s=None, c=None, marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, edgecolors=None, hold=None, data=None, **kwargs)

程序清单2-3 归一化特征值

def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of producing NaN from a division by zero.

    Args:
        dataSet: Array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized array, the
        per-column ``max - min`` (left unmodified, so it may contain zeros)
        and the per-column minimum.
    """
    minVals = dataSet.min(0)  # argument 0 takes the minimum down each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalize to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

dataSet.min(0)

    这里参数为0的含义是,按列求最小值,minVals的大小在此例中为1*3

numpy.tile(A, reps)

    见上文。

程序执行命令

>>> reload(kNN)
>>> normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
>>> normMat
array([[]
        []])
>>> ranges
array([*, *, *])
>>> minVals
array([*, *, *])

程序清单2-4 分类器针对约会网站的测试代码

def datingClassTest():
    """Print the kNN error rate measured on a hold-out slice of the dating data.

    Loads 'datingTestSet.txt', normalizes the features, classifies the first
    10% of the rows against the remaining rows with k=3, and prints every
    prediction followed by the overall error rate.
    """
    hoRatio = 0.10  # fraction of the rows held out as the test set
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples; the rest form the training set.
        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:],
                datingLabels[numTestVecs:m], 3)
        # Parentheses keep the two-line expression valid; printing a single
        # string this way behaves the same on Python 2 and Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
                % (classifierResult, datingLabels[i]))
        if (classifierResult != datingLabels[i]):
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))

程序执行命令

>>> reload(kNN)
>>> kNN.datingClassTest()
the classifier came back with: 1, the real answer is: 1
......
the total error rate is: 0.024000

程序清单2-5 约会网站预测函数

def classifyPerson():
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(raw_input("percentage of time spent playing video games?"))
    ffMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCream = float(raw_input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr-minVals)/ranges, normMat, datingLabels, 3)
    print "You will probably like this person: ", resultList[classifierResult - 1]

raw_input()

    获取用户输入

程序执行命令

>>> reload(kNN)
>>> kNN.classifyPerson()
...

图像转向量

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    Reads the first 32 lines of the file and converts the first 32 characters
    of each line to integers, stored row after row.

    Args:
        filename: Path of the text image file.

    Returns:
        A 1 x 1024 float array; element ``32*i + j`` holds the character at
        line ``i``, column ``j``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a character fails to parse.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32*i+j] = int(lineStr[j])
    return returnVect

程序执行命令

>>> reload(kNN)
>>> testVector = kNN.img2vector('testDigits/0_13.txt')
>>> testVector[0,0:31]
array([......])

程序清单2-6 手写数字识别系统的测试代码

from os import listdir
def handwritingClassTest():
    """Print the kNN error count and error rate for the digit text images.

    Every file in 'trainingDigits' becomes one row of the training matrix;
    every file in 'testDigits' is then classified against it with k=3. The
    text before the first '_' in a file name is parsed as the integer class
    label (e.g. '0_13.txt' -> 0).
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        # Vote against the labels collected above (the name was misspelled
        # before, which raised NameError on the first test file).
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if (classifierResult != classNumStr):
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount/float(mTest)))

os.listdir(dir)

列出dir下所有文件和子目录的名称(不含'.'和'..'),返回list,顺序不保证

程序执行命令

>>> reload(kNN)
>>> kNN.handwritingClassTest()
...
posted @ 2017-10-10 22:59  随意orz  阅读(219)  评论(0编辑  收藏  举报