1. 使用 Python 导入数据
from numpy import *
import operator #提供排序操作需要的函数
def createDataSet():
    """Build the tiny hand-made sample set used to demonstrate kNN.

    Returns:
        tuple: ``(group, labels)``. ``group`` is a 4x2 NumPy array with one
        feature vector per row; ``labels`` is a list giving the class label
        ('A' or 'B') of each row, in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.1], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# Build the demo data: `group` holds the feature rows, `labels` the class of each row.
group, labels = createDataSet()
array([[1. , 1.1],
[1. , 1.1],
[0. , 0. ],
[0. , 0.1]])
['A', 'A', 'B', 'B']
- 计算已知类别数据集中的点与当前点之间的距离
- 按照距离递增次序排序
- 选取与当前点距离最小的k个点
- 确定前k个点所在类别的出现频率
- 返回前k个点出现频率最高的类别作为当前点的预测分类
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (sequence or 1-D array) with one
            value per column of ``dataSet``.
        dataSet: 2-D NumPy array of known samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the class of
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Must satisfy
            ``1 <= k <= dataSet.shape[0]``.

    Returns:
        The label that occurs most often among the k nearest samples. When
        several labels tie, the one met first while walking the neighbours
        from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples.
    """
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        # Without this check an out-of-range k surfaces as an opaque
        # IndexError further down.
        raise ValueError(
            "k must be between 1 and %d (the number of samples), got %r"
            % (dataSetSize, k)
        )

    # Broadcasting subtracts every row from inX; no tiled copy is needed.
    diffMat = asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k closest samples, nearest first.
    classCount = {}
    for sampleIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[sampleIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first maximal entry in insertion order, which is the
    # same winner a stable descending sort of the counts would put first.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
字典的 get() 方法返回指定键对应的值;如果键不在字典中,则返回默认值。其使用的语法为:dict.get(key, default=None)
# Classify the point [0, 0] against the demo set using its 3 nearest neighbours.
classify0([0,0], group, labels, 3)
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least four tab-separated fields: the
    first three are read as floats and the last one is a textual class label
    ('didntLike', 'smallDoses' or 'largeDoses'). Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)``. ``returnMat`` is an
        ``(n, 3)`` NumPy float array with one row per record;
        ``classLabelVector`` is a list of ``n`` ints (1, 2 or 3) encoding the
        label of each record in the same order.

    Raises:
        ValueError: If a record has fewer than four fields or a feature field
            is not a valid number.
        KeyError: If a record's label is not one of the three known labels.
    """
    # Built once; maps the textual label to its integer code.
    labelCodes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        records = []
        for line in fr:
            line = line.strip()
            if line:  # tolerate blank lines, e.g. a trailing newline
                records.append(line.split('\t'))

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        if len(fields) < 4:
            raise ValueError(
                "record %d has %d field(s); expected 3 features and a label"
                % (index + 1, len(fields))
            )
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(labelCodes[fields[-1]])
    return returnMat, classLabelVector
# Load the data file into a feature matrix and a list of integer class labels.
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
[1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
[2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
[2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
[4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
[4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
split() 是字符串方法,通过指定分隔符对字符串进行切片,返回一个字符串列表。如果参数 num 有指定值,则分隔 num+1 个子字符串,语法是str.split(str="", num=string.count(str)).
# Notebook cell: scatter-plot the first feature column against the second.
# `%matplotlib inline` is an IPython magic, so this cell only runs inside
# IPython/Jupyter, not as plain Python.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
# The 3rd and 4th positional arguments of scatter are marker size and colour;
# both are scaled by the integer class label so each class looks different.
ax.scatter(datingDataMat[:,0], datingDataMat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
def autoNorm(dataset):
    """Min-max normalise every column of ``dataset`` into the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column is
    normalised to all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataset: 2-D NumPy array, one sample per row and one feature per
            column.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)``. ``normDataSet`` is the
        normalised array with the same shape as ``dataset``; ``ranges`` holds
        the true per-column ``max - min`` (0 for a constant column);
        ``minVals`` holds the per-column minimum.
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned `ranges` keeps the real values.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column min and range to every row.
    normDataSet = (dataset - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Normalise the loaded features; also keep the per-column ranges and minimums.
normMat, ranges, minVals = autoNorm(datingDataMat)
array([[0.44832535, 0.39805139, 0.56233353],
[0.15873259, 0.34195467, 0.98724416],
[0.28542943, 0.06892523, 0.47449629],
[0.29115949, 0.50910294, 0.51079493],
[0.52711097, 0.43665451, 0.4290048 ],
[0.47940793, 0.3768091 , 0.78571804]])
array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])
array([0. , 0. , 0.001156])
def datingClassTest(filename='datingTestSet.txt', hoRatio=0.10, k=3):
    """Measure the kNN classifier's error rate with a hold-out split.

    The file is loaded and normalised, the first ``hoRatio`` fraction of the
    rows is used as test samples, and each of them is classified against the
    remaining rows. Every prediction and the final error rate are printed.

    Args:
        filename: Path of the data file passed to ``file2matrix``.
        hoRatio: Fraction of the rows (taken from the top of the file) held
            out for testing.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out set would contain no rows, since the
            error rate is undefined in that case.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    # NOTE: normalisation statistics are computed over all rows, test rows
    # included.
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError(
            "hold-out set is empty: %d row(s) with hoRatio=%r" % (m, hoRatio)
        )

    # Rows after the hold-out prefix form the training set; slice them once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the total error rate is: 0.050000