k-NN: Algorithm Implementation
k-NN has no explicit training phase. Given a training set, its labels, and a value of k, compute the distance from the sample to be predicted to every training sample, take the k training samples with the smallest distances, and output the label that appears most often among those k neighbors.
Two examples follow: dating preference classification and handwritten digit recognition.
- Compute the distance (Euclidean distance, see the formula below) from the input sample to every training sample
- Take the k training samples with the smallest distances and predict the class that occurs most often among them
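The distance used throughout the code below is the Euclidean distance between the input vector $x$ and a training vector $y$ with $N$ features:

$$d(x, y) = \sqrt{\sum_{i=1}^{N} (x_i - y_i)^2}$$

which is exactly what `np.sum(diffMat**2, axis=1)**0.5` computes, one distance per training sample.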
Example 1: dating preference classification, using the datingTestSet2.txt data set from *Machine Learning in Action* (three tab-separated features per line plus an integer label 1-3):

```python
import numpy as np
import operator
import matplotlib.pyplot as plt


# inX: test data, N features (1xN)
# dataSet: M samples, N features (MxN)
# labels: for M samples (1xM)
# k: k-Nearest Neighbor
def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = np.sum(diffMat**2, axis=1)**0.5   # Euclidean distance to every training sample
    sortDistances = distances.argsort()           # indices that sort the distances in ascending order
    classCount = {}
    for i in range(k):                            # vote among the k nearest neighbors
        voteLabel = labels[sortDistances[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # most-voted class first
    result = sortedClassCount[0][0]
    # print("Predict: ", result)
    return result


# Parse a tab-separated file into a matrix; each line has 4 columns, the last one being the label
def file2matrix(filename):
    with open(filename) as f:
        arrayLines = f.readlines()
        # print(arrayLines)  # lines still end with '\n'
    numberOfLines = len(arrayLines)  # the file is read as a list, one element per line
    returnMat = np.zeros((numberOfLines, 3))
    classLabelVector = []
    index = 0
    for line in arrayLines:
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector


# Plot two scatter views of the dating data, sized/colored by label
def ex3():
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    fig = plt.figure()
    ax = fig.add_subplot(1, 2, 1)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               s=15.0*np.array(datingLabels), c=15.0*np.array(datingLabels))
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
                s=15.0*np.array(datingLabels), c=15.0*np.array(datingLabels))
    plt.show()


# Normalize every feature to [0, 1]: (value - min) / (max - min)
def autoNorm(dataSet):
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    normDataSet = normDataSet / np.tile(ranges, (m, 1))
    return normDataSet, ranges, minVals


# Evaluate the classifier: 70% of the shuffled data is used for training, 30% for testing,
# then classify one user-supplied sample
def datingClassTest(normDataSet, ranges, minVals, labels):
    m = normDataSet.shape[0]
    numOfTrain = int(m*0.7)
    trainIndex = np.arange(m)
    np.random.shuffle(trainIndex)
    dataSet = normDataSet[trainIndex[0:numOfTrain], :]
    testSet = normDataSet[trainIndex[numOfTrain:], :]
    labels = np.array(labels)
    dataSetLabels = labels[trainIndex[0:numOfTrain]]
    testSetLabels = labels[trainIndex[numOfTrain:]]
    k = int(input("Input k: "))
    results = []
    for inX in testSet:
        results.append(classify0(inX, dataSet, dataSetLabels, k))
    correct = np.argwhere(np.array(results) == testSetLabels)  # indices of correct predictions
    acc = len(correct)/len(testSetLabels)
    print("Accuracy: {:.2f}".format(acc))
    print("Error: {:.2f}".format(1-acc))
    classList = ['not at all', 'in small doses', 'in large doses']
    inX1 = float(input("1: percentage of time spent playing video games? "))
    inX2 = float(input("2: frequent flier miles earned per year? "))
    inX3 = float(input("3: liters of ice cream consumed per year? "))
    # column order in datingTestSet2.txt: flier miles, video game time, ice cream
    inXUser = np.array([inX2, inX1, inX3])
    inXUser = (inXUser - minVals)/ranges
    result = classify0(inXUser, dataSet, dataSetLabels, k)
    print("Predict: ", classList[result - 1])  # labels are 1..3, classList indices are 0..2


if __name__ == '__main__':
    # # -- ex1 --
    # inX = [1, 1]
    # dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # labels = ['A', 'A', 'B', 'B']
    # k = 3
    # classify0(inX, dataSet, labels, k)

    # -- ex2 --
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")

    # # -- ex3 --
    # ex3()

    # # -- ex4 --
    # normDataSet, ranges, minVals = autoNorm(datingDataMat)

    # # -- ex5 --
    # datingClassTest(normDataSet, ranges, minVals, datingLabels)
```
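The three dating features live on very different scales (thousands of flight miles versus a few liters of ice cream), so without normalization the largest feature dominates the Euclidean distance. Below is a minimal sketch of what `autoNorm` computes, on a made-up toy array (the numbers are only for illustration); it uses broadcasting instead of `np.tile`, which gives the same result:

```python
import numpy as np

# Toy data: two features whose scales differ by orders of magnitude.
toy = np.array([[40000.0, 0.5],
                [14000.0, 1.2],
                [75000.0, 0.1]])

minVals = toy.min(axis=0)           # per-column minimum
ranges = toy.max(axis=0) - minVals  # per-column range (max - min)
norm = (toy - minVals) / ranges     # broadcasting; equivalent to autoNorm's np.tile version
print(norm)                         # every column now lies in [0, 1]
```

Example 2: handwritten digit recognition. Each digit is stored as a 32x32 text image of '0'/'1' characters, flattened into a 1024-dimensional vector; the digit label is taken from the part of the file name before the underscore: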
```python
import numpy as np
import os
import operator


# Read a 32x32 text image of '0'/'1' characters into a flat list of 1024 ints
def img2vector(filename):
    with open(filename) as f:
        lines = f.readlines()
    return_vector = []
    for line in lines:
        line = line.strip()
        for j in line:
            return_vector.append(int(j))
    return return_vector


# inX: test data, N features (1xN)
# dataSet: M samples, N features (MxN)
# labels: for M samples (1xM)
# k: k-Nearest Neighbor
def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = np.sum(diffMat**2, axis=1)**0.5   # Euclidean distance to every training sample
    sortDistances = distances.argsort()           # indices that sort the distances in ascending order
    classCount = {}
    for i in range(k):                            # vote among the k nearest neighbors
        voteLabel = labels[sortDistances[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # most-voted class first
    result = sortedClassCount[0][0]
    # print("Predict: ", result)
    return result


# Classify every test image and report accuracy
def handwriting_class_test(data_set, training_labels, test_set, test_labels, k):
    results = []
    for i in range(len(test_set)):
        result = classify0(test_set[i], data_set, training_labels, k)
        results.append(result)
        # print('predict: ', result, 'answer: ', test_labels[i])
    compare_results = np.argwhere(np.array(results) == test_labels)  # indices of correct predictions
    acc = len(compare_results)/len(test_labels)
    print("Accuracy: {:.5f}".format(acc))
    print("Error: {:.5f}".format(1-acc))


if __name__ == '__main__':
    dir_path = r'H:\ML\MachineLearninginAction\02kNN\digits'
    training_path = os.path.join(dir_path, r'trainingDigits')
    test_path = os.path.join(dir_path, r'testDigits')
    training_files_list = os.listdir(training_path)
    test_files_list = os.listdir(test_path)

    # Build the training matrix and labels
    m = len(training_files_list)
    data_set = np.zeros((m, 1024))
    training_labels = np.zeros(m)
    for i in range(m):
        data_set[i] = img2vector(os.path.join(training_path, training_files_list[i]))
        training_labels[i] = int(training_files_list[i].split('_')[0])  # digit label from the file name

    # Build the test matrix and labels
    mt = len(test_files_list)
    test_set = np.zeros((mt, 1024))
    test_labels = np.zeros(mt)
    for i in range(mt):
        test_set[i] = img2vector(os.path.join(test_path, test_files_list[i]))
        test_labels[i] = int(test_files_list[i].split('_')[0])

    k = 3
    handwriting_class_test(data_set, training_labels, test_set, test_labels, k)
```
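As an aside (not part of the original post), the per-sample Python loop in `handwriting_class_test` can be replaced by one broadcasted distance computation over the whole test set. Below is a minimal sketch, assuming the same `data_set`, `training_labels`, and `test_set` arrays built above; `knn_predict_all` is a hypothetical helper, and on ties `np.bincount(...).argmax()` picks the smallest digit, whereas the dictionary vote in `classify0` keeps the label it saw first:

```python
import numpy as np

def knn_predict_all(test_set, data_set, training_labels, k=3):
    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2,
    # which avoids building a huge (mt, m, 1024) intermediate array.
    test_sq = (test_set ** 2).sum(axis=1)[:, np.newaxis]    # (mt, 1)
    train_sq = (data_set ** 2).sum(axis=1)[np.newaxis, :]   # (1, m)
    d2 = test_sq - 2.0 * test_set @ data_set.T + train_sq   # (mt, m)
    # sqrt is monotonic, so sorting squared distances gives the same neighbors.
    nearest = np.argsort(d2, axis=1)[:, :k]                 # k nearest training indices per test row
    neighbor_labels = training_labels[nearest].astype(int)  # (mt, k) digit labels
    # Majority vote per test sample (labels are digits 0-9).
    return np.array([np.bincount(row).argmax() for row in neighbor_labels])
```

The accuracy can then be computed directly as `(knn_predict_all(test_set, data_set, training_labels, k=3) == test_labels).mean()`.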
---
This post is from cnblogs (博客园), author: Bingmous. Please credit the original link when reposting: https://www.cnblogs.com/bingmous/p/15643742.html