Implementing kNN in Python
The nearest-neighbor algorithm, better known as the k-nearest-neighbor (kNN) classification algorithm, is one of the simplest classification techniques in data mining. "K nearest neighbors" means exactly what it says: each sample can be represented by the k samples closest to it.
The core idea of kNN is that if the majority of the k samples nearest to a given sample in feature space belong to a particular class, then that sample belongs to the same class and shares the characteristics of the samples in it.
The concept is simple; for more explanation, see the Baidu Baike entry, which has figures and examples and explains it very clearly.
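To make the voting rule concrete, here is a minimal sketch of a single prediction; the toy arrays `points` and `labels` are made up purely for illustration:

```python
import numpy as np

# Toy training set: five 2-D points with integer class labels (hypothetical data).
points = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [1.1, 0.9], [0.9, 1.2]])
labels = np.array([0, 0, 1, 1, 1])

query = np.array([1.0, 1.1])
k = 3

# L2 distance from the query to every training point.
dists = np.sqrt(np.sum((points - query) ** 2, axis=1))
# Labels of the k nearest neighbors, then a majority vote via bincount.
nearest = labels[np.argsort(dists)[:k]]
print(np.argmax(np.bincount(nearest)))  # -> 1
```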
Next, let's see how to implement kNN in Python. The code is commented in detail.
First come the helper functions that load the data; once you know the CIFAR-10 data format, the code is self-explanatory:
```python
from __future__ import print_function

from six.moves import cPickle as pickle
import numpy as np
import os
import platform

def load_pickle(f):
    version = platform.python_version_tuple()
    if version[0] == '2':
        return pickle.load(f)
    elif version[0] == '3':
        return pickle.load(f, encoding='latin1')
    raise ValueError("invalid python version: {}".format(version))

def load_CIFAR_batch(filename):
    """ CIFAR-10 is stored in batches; this loads a single batch. """
    with open(filename, 'rb') as f:  # open the file in binary mode
        datadict = load_pickle(f)
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y

def load_CIFAR10(ROOT):
    """ Load all training batches plus the test batch. """
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte
```
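For reference, each batch file in the Python version of CIFAR-10 is a pickled dict: `'data'` is a uint8 array of shape (10000, 3072), where each row is a 32x32 RGB image with the three color channels flattened one after another, and `'labels'` is a list of 10000 integers in 0-9. That is why `load_CIFAR_batch` reshapes to (10000, 3, 32, 32) and transposes the channel axis to the end. A quick way to inspect one batch yourself (the relative path is just an example):

```python
with open('cifar-10-batches-py/data_batch_1', 'rb') as f:
    batch = load_pickle(f)
print(batch['data'].shape, batch['data'].dtype)  # (10000, 3072) uint8
print(len(batch['labels']), min(batch['labels']), max(batch['labels']))  # 10000 0 9
```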
Then comes the KNN class, which defines how distances are computed along with the train and predict functions:
```python
import numpy as np

class KNearestNeighbor(object):
    """
    A kNN classifier.
    Distances between two images are measured with the plain L2 distance;
    getting really good results would require designing a better metric.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Training does essentially nothing: it just memorizes all of the data.

        Inputs:
        - X (N, D): N input images, each flattened into a D-dimensional vector
        - y (N,): labels
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict a class for each new input.

        Inputs:
        - X (num_test, D)
        - k: number of nearest neighbors that vote on the prediction
        - num_loops: selects one of three implementations of the L2 distance
          computation so their speed can be compared. All rely on numpy
          broadcasting to some degree; the fully vectorized version is far
          faster than hand-written loops.

        Returns:
        - y (num_test,): predicted class indices
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Inputs:
        - X (num_test, D): test data.

        Returns:
        - dists (num_test, num_train): dists[i, j] is the L2 distance
          between test point i and training point j
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                dists[i, j] = np.sqrt(np.sum(np.square(X[i] - self.X_train[j])))
        return dists

    def compute_distances_one_loop(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            dists[i, :] = np.sqrt(np.sum(np.square(X[i] - self.X_train), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        # A small matrix/broadcasting trick:
        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, computed for all pairs at once.
        dists += (np.sum(np.square(X), axis=1)).reshape(-1, 1)
        dists += (np.sum(np.square(self.X_train), axis=1)).reshape(1, -1)
        dists -= 2 * np.dot(X, self.X_train.T)
        dists = np.sqrt(dists)
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given the distance matrix between test and training images,
        assign a class to each test image.

        Inputs:
        - dists (num_test, num_train)

        Returns:
        - y (num_test,)
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Labels of the k training points closest to the i-th test image,
            # followed by a majority vote.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
```
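The fully vectorized `compute_distances_no_loops` relies on the identity ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x·y: the squared norms of the test rows broadcast as a column, the squared norms of the training rows broadcast as a row, and the cross terms for all pairs come from a single matrix multiply. A quick check on random data (shapes picked arbitrarily) confirms it agrees with the naive two-loop version:

```python
import numpy as np

knn = KNearestNeighbor()
knn.train(np.random.randn(50, 8), np.zeros(50, dtype=int))
queries = np.random.randn(10, 8)

fast = knn.compute_distances_no_loops(queries)
slow = knn.compute_distances_two_loops(queries)
print(np.linalg.norm(fast - slow, ord='fro'))  # ~0, up to floating-point noise
```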
Finally, the main script: load the data, use an instance of the KNN class to train and predict, and use k-fold cross-validation to choose a good value for the hyperparameter k:
```python
# coding: utf-8

# KNN
# A kNN classifier works in two steps:
# - training: simply memorize (store) all of the input data
# - prediction: for each input, pick the k stored points closest to it
#   and let them vote; k is a hyperparameter

import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

# get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load the CIFAR-10 data.
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Print the shapes as a sanity check that the data loaded correctly.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)


# Visualize a few examples from the dataset.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)  # indices of the images in this class
    idxs = np.random.choice(idxs, samples_per_class, replace=False)  # random sample within the class
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()


# Subsample: skip most of the data so the first pass runs faster;
# once the whole pipeline works, try it again with the full dataset.
num_training = 5000
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]


# Reshape each image into a one-dimensional vector.
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)


from cs231n.classifiers import KNearestNeighbor

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

# Visualize the distance matrix: each row holds the distances from one
# test image to every training image.
plt.imshow(dists, interpolation='none')
plt.show()


y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


dists_one = classifier.compute_distances_one_loop(X_test)

# Verify that the two implementations produce the same distance matrix.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


dists_two = classifier.compute_distances_no_loops(X_test)
difference = np.linalg.norm(dists - dists_two, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


def time_function(f, *args):
    """
    Time how long a call to f takes.
    """
    import time
    tic = time.time()
    f(*args)
    toc = time.time()
    return toc - tic

two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)


# Use cross-validation to choose k.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
print(X_train_folds[0].shape)
print(y_train_folds[0].shape)


# Record the accuracy for each k; every k gets num_folds accuracies.
k_to_accuracies = {}

for k_ in k_choices:
    k_to_accuracies.setdefault(k_, [])
for i in range(num_folds):
    # Train on all folds except fold i, validate on fold i.
    classifier = KNearestNeighbor()
    X_val_train = np.concatenate(X_train_folds[0:i] + X_train_folds[i+1:], axis=0)
    y_val_train = np.concatenate(y_train_folds[0:i] + y_train_folds[i+1:], axis=0)
    classifier.train(X_val_train, y_val_train)
    for k_ in k_choices:
        y_val_pred = classifier.predict(X_train_folds[i], k=k_)
        num_correct = np.sum(y_val_pred == y_train_folds[i])
        accuracy = float(num_correct) / len(y_val_pred)
        k_to_accuracies[k_] = k_to_accuracies[k_] + [accuracy]


for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))


for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()


# Retrain with the best k and evaluate accuracy on the test set.
best_k = k_choices[np.argmax(accuracies_mean)]

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
```
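As one last sanity check on the whole pipeline, scikit-learn's `KNeighborsClassifier` implements the same algorithm (scikit-learn is not otherwise used in this post), so with the same k it should reach a comparable accuracy on the subsampled data:

```python
from sklearn.neighbors import KNeighborsClassifier

# Fit on the same flattened training data and score on the same test split.
sk_knn = KNeighborsClassifier(n_neighbors=best_k)
sk_knn.fit(X_train, y_train)
print('sklearn accuracy: %f' % sk_knn.score(X_test, y_test))
```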