分类的性能评估:准确率、精确率、召回率(Recall)、F1、F2
# Spam classification on the SMS data set: TF-IDF features fed to a
# logistic-regression classifier, scored with 5-fold cross-validated accuracy.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: the private module `sklearn.linear_model.logistic` was deprecated in
# scikit-learn 0.22 and removed in 0.24; import from the public package.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Load the messages and split them into train/test partitions
# (fixed seed so the split is reproducible).
df = pd.read_csv('./sms.csv')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'], random_state=11)

# Fit the TF-IDF vocabulary on the training messages only, then reuse it
# to transform the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# cross_val_score uses the estimator's default scorer (accuracy for
# classifiers) when no `scoring` argument is given.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))
Accuracies: [ 0.95221027 0.95454545 0.96172249 0.96052632 0.95209581] Mean accuracy: 0.956220068309
# Cross-validated precision, recall and F1 for the classifier trained above.
# Each metric is evaluated over 5 folds of the training data and its mean
# is printed as soon as it has been computed.
fold_scores = {}
for metric, template in (
    ('precision', 'Precision: %s'),
    ('recall', 'Recall: %s'),
    ('f1', 'F1 score: %s'),
):
    fold_scores[metric] = cross_val_score(
        classifier, X_train, y_train, cv=5, scoring=metric)
    print(template % np.mean(fold_scores[metric]))

# Keep the per-fold arrays available under their usual names.
precisions = fold_scores['precision']
recalls = fold_scores['recall']
f1s = fold_scores['f1']
Precision: 0.992542742398 Recall: 0.683605030275
F1 score: 0.809067846627
F1是精确率和召回率的调和平均值:F1 = 2 × 精确率 × 召回率 / (精确率 + 召回率)。如果精确率为1、召回率为0,那么F1为0。此外还有F0.5和F2两种度量,分别偏重精确率和召回率。在一些场景下,召回率比精确率更重要。
常用分类器的对比
# Compare several common classifiers on one synthetic binary-classification
# problem, printing a classification report for each on the held-out test set.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 5000 samples, 100 features of which 20 are informative; fixed seeds keep
# both the data set and the split reproducible.
X, y = make_classification(
    n_samples=5000, n_features=100, n_informative=20,
    n_clusters_per_class=2, random_state=11)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

# (label printed before the report, unfitted estimator), in evaluation order.
classifiers = [
    ('决策树', DecisionTreeClassifier(random_state=11)),
    ('随机森林', RandomForestClassifier(n_estimators=10, random_state=11)),
    ('逻辑回归', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier(n_estimators=50, random_state=11)),
    ('KNN近邻', KNeighborsClassifier(n_neighbors=3)),
    ('SVM支持向量机', SVC(kernel='rbf', C=100, gamma=0.1)),
]

for name, clf in classifiers:
    print(name)
    # Every model, the SVM included, is fitted on the training partition
    # only. Fitting on the full (X, y) would leak the test samples into
    # training and inflate the reported scores.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test, predictions))
结果
决策树 precision recall f1-score support 0 0.80 0.76 0.78 634 1 0.76 0.80 0.78 616 accuracy 0.78 1250 macro avg 0.78 0.78 0.78 1250 weighted avg 0.78 0.78 0.78 1250 随机森林 precision recall f1-score support 0 0.79 0.86 0.82 634 1 0.84 0.76 0.80 616 accuracy 0.81 1250 macro avg 0.82 0.81 0.81 1250 weighted avg 0.82 0.81 0.81 1250 逻辑回归 precision recall f1-score support 0 0.82 0.85 0.84 634 1 0.84 0.81 0.83 616 accuracy 0.83 1250 macro avg 0.83 0.83 0.83 1250 weighted avg 0.83 0.83 0.83 1250
AdaBoost
precision recall f1-score support
0 0.83 0.85 0.84 634
1 0.84 0.82 0.83 616
accuracy 0.83 1250
macro avg 0.83 0.83 0.83 1250
weighted avg 0.83 0.83 0.83 1250
KNN近邻
precision recall f1-score support
0 0.93 0.93 0.93 634
1 0.93 0.93 0.93 616
accuracy 0.93 1250
macro avg 0.93 0.93 0.93 1250
weighted avg 0.93 0.93 0.93 1250
SVM支持向量机(注意:下面的满分结果来自在全部数据 X、y(包含测试集)上训练的模型,存在数据泄漏,不能代表真实的泛化能力)
precision recall f1-score support
0 1.00 1.00 1.00 634
1 1.00 1.00 1.00 616
accuracy 1.00 1250
macro avg 1.00 1.00 1.00 1250
weighted avg 1.00 1.00 1.00 1250
目前维护的开源产品:https://gitee.com/475660