KNN
KNN
-
n_neighbors:参与投票的近邻样本数量(即 k),默认值是 5
-
Python近邻法(kNN)
- 是一种基本的分类与回归方法
- 所有特征的数值采用归一化处理
- 距离度量:欧氏距离
- 分类决策规则:多数表决
- kd树(一种空间划分树,用于加速近邻搜索)
- https://www.bilibili.com/video/BV1Rt411q7WJ?p=43
-
kNN预测iris
用python实现knn
"""
# @Time : 2020/8/3
# @Author : Jimou Chen
"""
from sklearn import datasets # 导入数据集
from sklearn.model_selection import train_test_split # 用于切分数据
from sklearn.metrics import classification_report, confusion_matrix # 验证准确性
import operator
import numpy as np
def kNN(x_test, x_data, y_data, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels share the highest vote
    count, the tied label met first while walking the neighbours from
    nearest to farthest is returned.

    Args:
        x_test: Feature vector of the single sample to classify.
        x_data: Training samples, one row per sample.
        y_data: Labels; ``y_data[i]`` is the label of row ``i`` of ``x_data``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    x_data = np.asarray(x_data)
    n_samples = x_data.shape[0]
    # Without this check an out-of-range k surfaces as an opaque IndexError.
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(n_samples, k)
        )

    # Broadcasting subtracts every training row from the test vector, so no
    # explicit tiled copy of x_test is needed.
    diff_mat = np.asarray(x_test).ravel() - x_data
    distance = (diff_mat ** 2).sum(axis=1) ** 0.5
    # Indices of the training samples, nearest first.
    nearest_first = distance.argsort()

    # Tally the labels of the k closest samples.
    class_count = {}
    for index in nearest_first[:k]:
        label = y_data[index]
        class_count[label] = class_count.get(label, 0) + 1

    # max() keeps the first label seen among equal counts, which matches a
    # stable descending sort of the tally.
    return max(class_count, key=class_count.get)
# Load the iris dataset shipped with scikit-learn.
iris = datasets.load_iris()
# Split the data: 20% becomes the test set, the remaining 80% the training set.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
# Classify every test sample with the hand-written kNN, using k = 5.
prediction = [kNN(sample, x_train, y_train, 5) for sample in x_test]
# Compare the predictions with the true labels to see how well kNN did.
print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))
precision recall f1-score support
0 1.00 1.00 1.00 11
1 0.90 1.00 0.95 9
2 1.00 0.90 0.95 10
accuracy 0.97 30
macro avg 0.97 0.97 0.96 30
weighted avg 0.97 0.97 0.97 30
[[11 0 0]
[ 0 9 0]
[ 0 1 9]]
Process finished with exit code 0
- 为了方便,可以把用python实现的kNN算法封装起来
"""
# @Time : 2020/8/3
# @Author : Jimou Chen
"""
import operator
import numpy as np
def kNN(x_test, x_data, y_data, k):
    """Return the majority label among the k training samples nearest to x_test.

    Nearness is measured with the Euclidean distance. If several labels tie
    for the highest count, the tied label met first when going through the
    neighbours from nearest to farthest is returned.

    Args:
        x_test: Feature vector of the single sample to classify.
        x_data: Training samples, one row per sample.
        y_data: Labels; ``y_data[i]`` is the label of row ``i`` of ``x_data``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    x_data = np.asarray(x_data)
    sample_count = x_data.shape[0]
    # Reject an unusable k up front instead of failing later with IndexError.
    if not 1 <= k <= sample_count:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(sample_count, k)
        )

    # Broadcasting applies the test vector against every training row.
    offsets = np.asarray(x_test).ravel() - x_data
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    # Training-sample indices ordered from nearest to farthest.
    sorted_distance = distances.argsort()

    # Count how often each label occurs among the k nearest samples.
    class_count = {}
    for index in sorted_distance[:k]:
        label = y_data[index]
        class_count[label] = class_count.get(label, 0) + 1

    # max() returns the first label among equal counts, the same winner a
    # stable descending sort of the counts would put first.
    return max(class_count, key=class_count.get)
用sklearn调用kNN
"""
# @Time : 2020/8/8
# @Author : Jimou Chen
"""
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Load the iris dataset and take its feature matrix and target labels.
iris = load_iris()
x_data, y_data = iris.data, iris.target
# Split the data: 20% becomes the test set, the remaining 80% the training set.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)
# Build and fit the model in one step; n_neighbors is the k of kNN.
kNN_model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
predictions = kNN_model.predict(x_test)
print('origin: \n', y_test)
print('predict result:\n', predictions)
print(classification_report(y_test, predictions))
# Use the model's own scoring method to compute the accuracy on the test set.
print(kNN_model.score(x_test, y_test))