Revisiting sklearn Classifiers
I have been reading the sklearn documentation these past few days and found that it offers a large number of classifiers, so here are some brief notes.
Broadly, these classifiers fall into two groups: 1) single classifiers, and 2) ensemble classifiers.
I. Single Classifiers
The following example compares the performance of several single classifiers:
# sklearn.cross_validation has been removed; use sklearn.model_selection instead
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Note: the original used the key 'SVC' twice, so the linear SVC was silently
# overwritten by the RBF one; the two entries get distinct keys here.
classifiers = {
    'KN': KNeighborsClassifier(3),
    'SVC-linear': SVC(kernel="linear", C=0.025),
    'SVC-rbf': SVC(gamma=2, C=1),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),  # clf.feature_importances_
    'ET': ExtraTreesClassifier(n_estimators=10, max_depth=None),  # clf.feature_importances_
    'AB': AdaBoostClassifier(n_estimators=100),
    'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0),  # clf.feature_importances_
    'GNB': GaussianNB(),
    'LD': LinearDiscriminantAnalysis(),
    'QD': QuadraticDiscriminantAnalysis(),
}

X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y)
    print(name, '\t--> ', scores.mean())
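The comments in the dictionary above mark the tree-based models that expose clf.feature_importances_ after fitting. As a quick illustrative sketch (reusing the X, y generated above; this snippet is my own addition, not part of the original example):

# Tree ensembles expose one importance score per feature after fitting;
# for random forests the scores are normalized to sum to 1
rf = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1)
rf.fit(X, y)
print(rf.feature_importances_)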
(Figure: mean cross-validation score of each classifier in the comparison above.)
II. Ensemble Classifiers
Four techniques are covered here: Bagging, Voting, GridSearch, and Pipeline. Strictly speaking, only the first two build ensemble classifiers; GridSearch is hyperparameter search over a base estimator, and Pipeline is a chaining (pipelining) technique, but all four are meta-estimators that wrap other estimators.
1.Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# BaggingClassifier is the meta-estimator; KNN is the base estimator.
# Each bag trains a KNN on 50% of the samples and 50% of the features.
base_clf = KNeighborsClassifier()
bg_clf = BaggingClassifier(base_clf, max_samples=0.5, max_features=0.5)
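A minimal usage sketch for the bagging ensemble above; the synthetic dataset here is an illustrative choice of mine, not part of the original post:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Any (X, y) classification dataset works here; make_classification is just an example
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
scores = cross_val_score(bg_clf, X, y, cv=5)
print('Bagged KNN accuracy: %.3f' % scores.mean())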
2.Voting
from sklearn import datasets
from sklearn.model_selection import cross_val_score  # replaces the removed sklearn.cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

# 'hard' voting = weighted majority vote over the predicted class labels
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                        voting='hard', weights=[2, 1, 2])

for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
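For comparison, here is a soft-voting variant (my own sketch, not in the original post). Instead of a majority vote over predicted labels, it averages the class probabilities of the base models, so every base estimator must implement predict_proba:

# Soft voting averages predict_proba outputs, weighted per model
eclf_soft = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                             voting='soft', weights=[2, 1, 2])
scores = cross_val_score(eclf_soft, X, y, cv=5, scoring='accuracy')
print("Soft-voting accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))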
3.GridSearch
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
# sklearn.grid_search has been removed; both searches now live in model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Load data
digits = load_digits()
X, y = digits.data, digits.target

# The estimator whose hyperparameters will be searched
clf = RandomForestClassifier(n_estimators=20)

# =================================================================
# Parameter distributions (min_samples_split must be >= 2)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run RandomizedSearchCV
n_iter_search = 20
rs_clf = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search)
start = time()
rs_clf.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
print(rs_clf.cv_results_['mean_test_score'])  # grid_scores_ was removed; use cv_results_

# =================================================================
# Parameter grid
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run GridSearchCV
gs_clf = GridSearchCV(clf, param_grid=param_grid)
start = time()
gs_clf.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(gs_clf.cv_results_['params'])))
print(gs_clf.cv_results_['mean_test_score'])
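After fitting, both search objects expose the winning configuration via best_params_ and best_score_, plus (with the default refit=True) a refit best_estimator_ that is ready to predict:

# Best hyperparameter combination and its cross-validated score
print(rs_clf.best_params_, rs_clf.best_score_)
print(gs_clf.best_params_, gs_clf.best_score_)

# best_estimator_ is a RandomForestClassifier refit on the full data
print(gs_clf.best_estimator_.predict(X[:5]))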
4.Pipeline
First example:
from sklearn import svm
# sklearn.datasets.samples_generator was removed; import generators from sklearn.datasets
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# Generate data
X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)

# Define the Pipeline: univariate (ANOVA) feature selection first, then an SVM.
# For a classification target, f_classif would be the more standard score
# function; f_regression is kept from the original example.
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear')
pipe = Pipeline([('anova', anova_filter), ('svc', clf)])

# Set anova's k=10 and svc's C=0.1 (step name and parameter name are joined
# with a double underscore "__")
pipe.set_params(anova__k=10, svc__C=.1)
pipe.fit(X, y)

prediction = pipe.predict(X)
pipe.score(X, y)

# Boolean mask of the features chosen by the anova step
s = pipe.named_steps['anova'].get_support()
print(s)
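As an aside, sklearn also provides make_pipeline, which builds a Pipeline and derives the step names automatically from the lowercased class names; a minimal sketch of mine, reusing the X, y from the example above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Step names become 'standardscaler' and 'svc' automatically
auto_pipe = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
auto_pipe.set_params(svc__C=0.1)
auto_pipe.fit(X, y)
print(auto_pipe.score(X, y))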
Second example:
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV  # replaces the removed sklearn.grid_search

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# Define the pipeline: dimensionality reduction (PCA) first, then logistic regression
pca = decomposition.PCA()
logistic = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# The whole pipeline is then passed to GridSearchCV as the estimator;
# its parameters are addressed as <step>__<parameter>
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components, logistic__C=Cs))
estimator.fit(X_digits, y_digits)
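Once fitted, the grid search reports which combination won:

# Best PCA dimensionality and regularization strength found by the search
print(estimator.best_params_)
print(estimator.best_score_)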