k-Nearest Neighbors Algorithm - 3. Applying the Algorithm
A concrete application of the algorithm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import datasets
Load the handwritten digits dataset
digits = datasets.load_digits()  # load the handwritten digits dataset
The original UCI handwritten digits data contains 5620 samples; the copy bundled with scikit-learn holds 1797 of them. Each sample has 64 features, the pixel values of an 8x8 image, and its target is the handwritten digit 0-9.
Sample structure:
Visualize the data and inspect one sample's features and label:
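A quick way to confirm this structure is to print the standard data, target and DESCR attributes of the Bunch object returned by load_digits:

print(digits.data.shape)    # (1797, 64): 1797 samples, 64 pixel features each
print(digits.target.shape)  # (1797,): one digit label per sample
print(digits.DESCR)         # full textual description of the dataset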
x = digits.data
y = digits.target
# the 222nd sample in the dataset
some_digit = x[222]
# a handwritten digit has 64 features; reshape the 1-D array into an 8x8 matrix
some_digit_image = some_digit.reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()
Check this sample's label:
Next, wrap the code from the previous sections into reusable modules so it can make predictions on the digits dataset.
Define the k-nearest-neighbors classifier (KNN.py):
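For example, the label of the sample plotted above can be read directly from the target array (its value is whatever digit the 8x8 image represents):

print(y[222])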
import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """A simple k-nearest-neighbors classifier."""

    def __init__(self, k):
        """Initialize the KNN classifier."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Train the KNN classifier with the training set."""
        self._x_train = x_train
        self._y_train = y_train
        # return the object itself so calls can be chained
        return self

    def predict(self, x_predict):
        """Given a set of samples x_predict, return the vector of predictions."""
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert self._x_train.shape[0] == self._y_train.shape[0], \
            "the size of x_train must equal the size of y_train"
        assert self._x_train.shape[1] == x_predict.shape[1], \
            "the feature number of x_predict must be equal to x_train"
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Given a single sample x, return its predicted label."""
        assert self._x_train.shape[1] == x.shape[0], \
            "the feature number of x must be equal to x_train"
        # Euclidean distance from x to every training sample (an unsorted list)
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self._x_train]
        # indices of the training samples sorted by distance
        nearest = np.argsort(distances)
        # labels of the k training samples closest to the new sample
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # count how many times each label appears
        votes = Counter(topK_y)
        # most_common() returns [(label, count)] sorted by count, descending
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return 'KNN(k=%d)' % self.k
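As a quick sanity check of the class above, here is a minimal sketch on two made-up clusters of 2-D points (assuming KNN.py is importable, e.g. saved under the mylib package used below):

import numpy as np
from mylib.KNN import KNNClassifier

# two well-separated clusters of toy 2-D points
x_toy = np.array([[0.0, 0.1], [0.2, 0.0], [0.1, 0.3],
                  [3.0, 3.1], [3.2, 2.9], [2.8, 3.0]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

knn = KNNClassifier(k=3)
knn.fit(x_toy, y_toy)
print(knn.predict(np.array([[0.1, 0.1], [3.0, 3.0]])))  # expected: [0 1]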
Define the model selection module (model_selection.py):
import numpy as np


def train_test_split(x, y, test_ratio=0.2, seed=None):
    """Split x and y into random train and test subsets."""
    assert x.shape[0] == y.shape[0], \
        "the size of x must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    if seed is not None:
        np.random.seed(seed)
    # shuffle the indices, then split them into test and train parts
    shuffle_index = np.random.permutation(len(x))
    test_size = int(len(x) * test_ratio)
    test_index = shuffle_index[:test_size]
    train_index = shuffle_index[test_size:]
    x_train = x[train_index]
    x_test = x[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    return x_train, x_test, y_train, y_test
Use the modules we just wrote:
from mylib.model_selection import train_test_split
from mylib.KNN import KNNClassifier
x_train, x_test, y_train, y_test = train_test_split(x, y, test_ratio=0.2)
my_clf = KNNClassifier(k=3)
my_clf.fit(x_train, y_train)
y_predict = my_clf.predict(x_test)
Verify the accuracy of the algorithm:
score = np.sum(y_predict == y_test) / len(y_test)
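As a cross-check (assuming scikit-learn is available), the same value can be obtained from sklearn's built-in metric:

from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_predict))  # should match the score computed above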
Finally, encapsulate the accuracy metric in its own module to decouple it from the classifier:
# metrics.py ("metrics" means measures of model quality)
import numpy as np


def accuracy_score(y_true, y_predict):
    """Compute the classification accuracy."""
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(y_true == y_predict) / len(y_true)
# KNN.py: add an accuracy method to the KNNClassifier class
from .metrics import accuracy_score

class KNNClassifier:
    ...
    def score(self, x_test, y_test):
        """Predict on x_test and compute the accuracy against y_test."""
        y_predict = self.predict(x_test)
        return accuracy_score(y_test, y_predict)
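With that method in place, checking the accuracy on the test set reduces to a single call, for example:

print(my_clf.score(x_test, y_test))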