用 Python 实现简单的 KNN 分类算法
原理:计算当前点(无label,一般来自测试集)与其他每个点(有label,一般来自训练集)之间的距离(本文使用欧氏距离),按距离升序排序,选取距离最小的k个点,根据这k个点对应的类别进行投票,票数最多的类别即为该点所对应的类别。
代码实现(数据集采用的是iris):
from collections import Counter

import numpy as np


def get_iris():
    """Load the iris dataset and split it into train and test parts.

    The split keeps 40% of the samples for testing and uses a fixed
    ``random_state`` of 0 so that repeated runs give the same partition.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split``.
    """
    # scikit-learn is imported lazily so that knn_classify stays usable
    # when only NumPy is installed.
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris_data = load_iris()
    return train_test_split(
        iris_data.data, iris_data.target, test_size=0.4, random_state=0
    )


def knn_classify(self_point, dataset, labels, k):
    """Classify one point by majority vote among its k nearest neighbours.

    Args:
        self_point: Feature vector of the point to classify.
        dataset: Labelled points, one row per point, with the same number
            of features as ``self_point``.
        labels: Label of each row of ``dataset``, in the same order.
        k: Number of nearest neighbours that vote. Values larger than the
            number of rows simply let every row vote.

    Returns:
        The winning label converted with ``str``. When several labels get
        the same number of votes, the one whose first voter is closest to
        ``self_point`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``dataset`` is empty, or
            ``dataset`` and ``labels`` differ in length.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    data = np.asarray(dataset, dtype=float)
    if len(data) == 0:
        raise ValueError("dataset must not be empty")
    if len(data) != len(labels):
        raise ValueError("dataset and labels must have the same length")

    # Euclidean distance from self_point to every row, computed in one pass.
    diff = data - np.asarray(self_point, dtype=float)
    distances = np.sqrt((diff ** 2).sum(axis=1))

    # A stable sort keeps equally distant points in their dataset order.
    nearest = np.argsort(distances, kind="stable")[:k]

    # Labels are counted as strings; Counter keeps first-seen order, so a
    # tie in votes resolves to the label encountered closest to the point.
    votes = Counter(str(labels[int(i)]) for i in nearest)
    return votes.most_common(1)[0][0]


def main():
    """Compare the custom classifier with scikit-learn's KNN on iris."""
    from sklearn import neighbors
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = get_iris()

    # knn_classify returns string labels, so compare against str(target).
    count = sum(
        knn_classify(x, X_train, y_train, 5) == str(y)
        for x, y in zip(X_test, y_test)
    )
    print('custom的准确率: ', count / len(y_test))

    # Use scikit-learn's built-in KNN classifier for reference.
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    pre = knn.predict(X_test)
    print('sklearn的准确率: ', accuracy_score(y_test, pre))


if __name__ == "__main__":
    main()
对比结果:
custom的准确率: 0.95
sklearn的准确率: 0.95