前言:机器学习的主要任务是使用数据训练出模型,再使用训练好的模型完成分类、聚类、推荐等相关任务。
对于某类问题,可以建立多种不同的模型,也有多种评价指标对模型进行评估。
本文汇总了最常用的几种评价指标,方便快速查询使用。(本文使用鸢尾花数据集和逻辑回归模型)
1.常用评估指标及损失函数:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from sklearn import datasets

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


# 1. Load the iris dataset.
iris = datasets.load_iris()
x, y = iris.data, iris.target

# 2. Split into training and test sets (80/20, fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# 3. Train a logistic-regression classifier (L1 penalty, one-vs-rest).
lr_clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear', multi_class='ovr')
lr_clf.fit(x_train, y_train)

# 4. Predict labels and class probabilities on the test set.
y_predict = lr_clf.predict(x_test)
y_probs = lr_clf.predict_proba(x_test)  # per-class prediction scores

# 5. Evaluation metrics.
# # Cross-validation accuracy on the training set.
# cross_score = cross_val_score(lr_clf, x_train, y_train, cv=3, scoring="accuracy")
# print(cross_score)

# 5.1 Accuracy.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)
print()

# 5.2 Precision (macro-averaged over classes).
# BUG FIX: each metric now gets its own variable; the original reused
# `accuracy` for precision, recall and F1, which was misleading.
precision = precision_score(y_test, y_predict, average='macro')
print(precision)
print()

# 5.3 Recall (weighted by class support).
recall = recall_score(y_test, y_predict, average='weighted')
print(recall)
print()

# 5.4 F1 score (micro-averaged).
f1 = f1_score(y_test, y_predict, average='micro')
print(f1)
print()

# 5.5 Confusion matrix.
# BUG FIX: stored under a new name; the original assigned the result to
# `confusion_matrix`, shadowing the imported function and breaking any
# subsequent call to it.
conf_mat = confusion_matrix(y_test, y_predict)
print(conf_mat)
print()

# 5.6 Mean squared error (MSE) — treats the integer class labels as values.
mse = mean_squared_error(y_test, y_predict)
print(mse)

# 5.7 Mean squared log error (MSLE).
msle = mean_squared_log_error(y_test, y_predict)
print(msle)

# 5.8 Mean absolute error (MAE).
mae = mean_absolute_error(y_test, y_predict)
print(mae)

# 5.9 R^2 coefficient of determination.
r2 = r2_score(y_test, y_predict)
print(r2)
2.ROC曲线与P-R曲线:二分类问题使用
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import datasets
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# 1. Load the iris dataset.
iris = datasets.load_iris()
x, y = iris.data, iris.target

# 2. Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Reduce to a binary problem: class 0 vs. everything else.
y_train = (y_train == 0)
y_test = (y_test == 0)

# 3. Train a logistic-regression classifier.
lr_clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear', multi_class='ovr')
lr_clf.fit(x_train, y_train)

# 4. Out-of-fold decision scores on the training set (3-fold CV).
# FIX: removed the dead code from the original — test-set `predict` /
# `predict_proba` results and the `cross_val_score` result were assigned
# but never used anywhere in this script.
y_scores = cross_val_predict(lr_clf, x_train, y_train, cv=3, method="decision_function")


# 5. Precision and recall as a function of the decision threshold.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    precision_recall_curve returns one more precision/recall value than
    thresholds, hence the [:-1] slices to align the arrays.
    """
    plt.plot(thresholds, precisions[:-1], "r-", label="Precision", linewidth=1)
    plt.plot(thresholds, recalls[:-1], "b-", label="Recall", linewidth=1)
    plt.xlabel("Threshold", fontsize=12)
    plt.legend(loc="upper left", fontsize=12)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 6))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([0, 6])
plt.show()


# 6. P-R curve.
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision against recall."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.show()


# 7. ROC curve.
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Plot the ROC curve plus the random-classifier diagonal."""
    plt.plot(fpr, tpr, 'r--', linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # chance baseline
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()
3.常用距离计算:余弦距离 = 1 - 余弦相似度,欧氏距离
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

# Two sample vectors; every pairwise function below returns a 2x2 matrix.
vectors = [[1, 3, 2], [2, 2, 1]]

# 1. Cosine similarity.
similarity = cosine_similarity(vectors)
print(similarity)
print()


# 2. Cosine distance = 1 - cosine similarity.
cosine_dist = pairwise_distances(vectors, metric="cosine")
print(cosine_dist)
print()

# 3. Euclidean distance.
euclidean_dist = euclidean_distances(vectors)
print(euclidean_dist)
print()
4.数据集划分:holdout,cross-validation,bootstrap
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import StratifiedKFold

iris = datasets.load_iris()
x, y = iris.data, iris.target

# 1. Holdout: a single split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


# 2. Cross-validation: splits the training data into train/validation folds.

## 2.1 K-fold
# kf = KFold(n_splits=2)
# for train_idx, valid_idx in kf.split(x_train):
#     train_data = x_train[train_idx]
#     valid_data = x_train[valid_idx]
#     print("%s %s" % (train_data, valid_data))
#     print()

## 2.2 Leave-one-out
# loo = LeaveOneOut()
# for train_idx, valid_idx in loo.split(x_train):
#     train_data = x_train[train_idx]
#     valid_data = x_train[valid_idx]
#     print("%s %s" % (train_data, valid_data))
#     print()

## 2.3 Shuffle split
# ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
# for train_idx, valid_idx in ss.split(x_train):
#     train_data = x_train[train_idx]
#     valid_data = x_train[valid_idx]
#     print("%s %s" % (train_data, valid_data))
#     print()

## 2.4 Stratified K-fold
# skf = StratifiedKFold(n_splits=3)
# for train_idx, valid_idx in skf.split(x_train, y_train):
#     train_data = x_train[train_idx]
#     valid_data = x_train[valid_idx]
#     print("%s %s" % (train_data, valid_data))
#     print()


# 3. Bootstrap: draw m rows with replacement as the training set; rows that
#    were never drawn form the test set.
features = pd.DataFrame(x)
labels = pd.DataFrame(y)
dataset = pd.concat([features, labels], axis=1)
train = dataset.sample(frac=1.0, replace=True)
test = dataset.loc[dataset.index.difference(train.index)].copy()

# train holds m rows, with duplicates introduced by resampling.
print(len(train))

# test holds roughly m * 0.368 distinct rows, none of which appear in train.
print(len(test))
5.超参数调优:网格搜索,随机搜索
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from scipy.stats import randint
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

iris = datasets.load_iris()
x, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

knn = KNeighborsClassifier(n_neighbors=5)

# Tune the KNN hyper-parameter n_neighbors over 1..30 and report the best
# parameters and score.

## 1. Grid search
# k_range = range(10, 15)
# param_grid = dict(n_neighbors=k_range)
# #param_grid = [{'n_neighbors':k_range},{'algorithm':['ball_tree','kd_tree'],'leaf_size':range(29,31),'n_neighbors':k_range}]
# grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='accuracy')
# grid.fit(x, y)


## 2. Randomized search
# FIX: renamed param_grid -> param_dist (RandomizedSearchCV samples from
# distributions, not a grid) and pinned random_state so the sampled
# candidates — and therefore the printed results — are reproducible.
param_dist = {'n_neighbors': randint(low=1, high=31)}
grid = RandomizedSearchCV(estimator=knn, param_distributions=param_dist,
                          cv=10, scoring='accuracy', random_state=0)
grid.fit(x, y)


# Best parameter combination found by the search.
print(grid.best_params_)

# Best cross-validated score.
print(grid.best_score_)

# Best estimator, refit here on the training split for direct use.
best_knn = grid.best_estimator_
best_knn.fit(x_train, y_train)

pred_train = best_knn.predict(x_train)
pred_test = best_knn.predict(x_test)

train_acc = accuracy_score(y_train, pred_train)
test_acc = accuracy_score(y_test, pred_test)

# FIX: the original comment called these "validation and test scores", but
# they are the training-set and test-set accuracies.
print(train_acc, test_acc)