09.超参数
digits 手写数据测试
import numpy as np import matplotlib import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score
# Load scikit-learn's bundled handwritten-digits dataset.
digits = datasets.load_digits()
# Feature matrix: one row per sample, one column per pixel.
X = digits.data
# Echo the matrix dimensions (the notebook displays the last expression).
X.shape
(1797, 64)
# Label vector: the digit class of every sample in X.
y = digits.target
# Echo its dimensions; it should have one entry per row of X.
y.shape
(1797,)
# Show the distinct class labels contained in the dataset.
digits.target_names
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Inspect the raw feature vector of the first sample.
X[0]
array([ 0., 0., 5., 13., 9., 1., 0., 0., 0., 0., 13., 15., 10., 15., 5., 0., 0., 3., 15., 2., 0., 11., 8., 0., 0., 4., 12., 0., 0., 8., 8., 0., 0., 5., 8., 0., 0., 9., 8., 0., 0., 4., 11., 0., 1., 12., 7., 0., 0., 2., 14., 5., 10., 12., 0., 0., 0., 0., 6., 13., 10., 0., 0., 0.])
# Label that belongs to the first sample.
y[0]
0
# Preview one sample as an image and echo its true label.
sample_index = 666
# The 64 flat features are laid out as an 8x8 pixel grid.
some_digit_image = X[sample_index].reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
# Last expression of the cell: the notebook displays the label.
y[sample_index]
# Preview a second sample as an image and echo its true label.
sample_index = 1000
# The 64 flat features are laid out as an 8x8 pixel grid.
some_digit_image = X[sample_index].reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
# Last expression of the cell: the notebook displays the label.
y[sample_index]
# Hold out 20% of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# Baseline model: a 3-nearest-neighbours classifier trained on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Predicted labels for the held-out samples.
y_predict = knn_clf.predict(X_test)
# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_test, y_predict)
0.9888888888888889
寻找最好的k
# Manual hyperparameter sweep: try k = 1..10 and remember the k that gives
# the highest accuracy on the held-out split.
best_k, best_score = -1, 0.0

for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    y_predict = knn_clf.predict(X_test)
    score = accuracy_score(y_test, y_predict)
    # Strict ">" keeps the smallest k when several values tie.
    if score > best_score:
        best_k, best_score = k, score

print("best_k:", best_k)
print("best_score:", best_score)
best_k: 4 best_score: 0.9916666666666667
考虑距离,还是不考虑距离?(weights 参数)
# Sweep the voting scheme together with k: "uniform" gives each neighbour an
# equal vote, "distance" weights each vote by the inverse of its distance.
best_method, best_k, best_score = "", -1, 0.0

for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        y_predict = knn_clf.predict(X_test)
        score = accuracy_score(y_test, y_predict)
        # Strict ">" means the first combination reaching a score wins ties.
        if score > best_score:
            best_method, best_k, best_score = method, k, score

print("best_k:", best_k)
print("best_score:", best_score)
print("best_method:", best_method)
best_k: 4 best_score: 0.9916666666666667 best_method: uniform
搜索明可夫斯基距离对应的超参数 p
# Sweep k together with the Minkowski exponent p (p=1 is Manhattan distance,
# p=2 is Euclidean distance), using distance-weighted voting throughout.
best_p = -1
# The weighting is fixed to "distance" in this sweep, so best_method is only
# reset here and never updated.
best_method = ""
best_score = 0.0
best_k = -1

for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        y_predict = knn_clf.predict(X_test)
        score = accuracy_score(y_test, y_predict)
        # Strict ">" means the first (k, p) pair reaching a score wins ties.
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p

print("best_k:", best_k)
print("best_score:", best_score)
# Report the recorded best_p. The original printed the loop variable p, which
# always holds the last value tried (5) once the loops have finished.
print("best_p:", best_p)
best_k: 3 best_score: 0.9888888888888889 best_p: 5
寻找最好的超参数 Grid Search
from sklearn.model_selection import GridSearchCV

# Search space for GridSearchCV, given as a list of independent sub-grids.
# The Minkowski exponent p is only searched together with distance weighting.
param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": list(range(1, 11)),
    },
    {
        "weights": ["distance"],
        "n_neighbors": list(range(1, 11)),
        "p": list(range(1, 6)),
    },
]

# Unconfigured base estimator; the grid search supplies the hyperparameters.
knn_clf = KNeighborsClassifier()
%%time
# Wrap the base estimator and the parameter grid in a grid search object.
# Nothing is trained here; training happens when fit() is called.
grid_search = GridSearchCV(knn_clf, param_grid)
Wall time: 0 ns
%%time
# Evaluate every parameter combination in param_grid on the training split,
# using GridSearchCV's default cross-validation settings.
grid_search.fit(X_train, y_train)
Wall time: 1min 1s
GridSearchCV(cv=None, error_score=nan, estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform'), iid='deprecated', n_jobs=None, param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'weights': ['uniform']}, {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5], 'weights': ['distance']}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)
# Estimator configured with the best parameter combination found by the search.
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=1, p=2, weights='uniform')
# Score the model selected by the grid search on the held-out split.
knn_clf = grid_search.best_estimator_
y_predict = knn_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_predict)
print(test_accuracy)
0.9833333333333333
# Repeat the grid search with n_jobs=-1 (use all available processors) and
# verbose=2 (print progress messages while the search runs).
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(
    knn_clf,
    param_grid,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Score the selected model on the held-out split.
knn_clf = grid_search.best_estimator_
y_predict = knn_clf.predict(X_test)
print(accuracy_score(y_test, y_predict))
# Estimator configured with the best parameter combination found by the parallel search.
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=1, p=2, weights='uniform')