集成学习之随机森林分类器
随机森林使用多个决策树分类器共同进行预测：每棵树各自给出一个分类结果，得票最多的类别即为待预测数据的最终分类。完整代码如下：
# -*- coding:utf-8 -*-
# Random-forest demo, part 1: imports, loading the iris dataset and
# splitting it into a training part and a held-out test part.
import numpy as np
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score

# Example of loading a local file as the training data instead:
# data = pd.read_csv('iris.data')
# print(data.head(5))
# attributes = data[['sl', 'sw', 'pl', 'pw']]  # first four columns, abbreviated to sl, sw, pl, pw
# types = data['type']  # fifth column: the iris species

iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
feature_names = iris.feature_names
# Class names come from `target_names`; the original assigned
# `iris.feature_names` here by mistake.
target_names = iris.target_names
# print(iris_X[:4, :])

# test_size=0.3 keeps 30% of the samples out of training for evaluation.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)

print("\n训练集样本大小:", X_train.shape)
print("训练集标签大小:", y_train.shape)
print("测试集样本大小:", X_test.shape)
print("测试集标签大小:", y_test.shape)
# Random-forest demo, part 2: (optional) hyper-parameter search, model
# training, evaluation and prediction on a few hand-written samples.

# Optional cross-validation sweep, disabled by default. It runs 10-fold
# cross-validation on the training split for each candidate max_depth
# (1..19) and plots the mean accuracy so a depth can be read off the curve.
# Note that the loop varies the tree depth, not the number of trees.
#
# cv_scores = []
# depth_range = range(1, 20)
# for n in depth_range:
#     clf = ensemble.RandomForestClassifier(max_depth=n, n_estimators=1, max_features=1)
#     scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
#     cv_scores.append(scores.mean())
# plt.plot(depth_range, cv_scores)
# plt.xlabel('max_depth')
# plt.ylabel('Accuracy')
# plt.show()

# Fit on the training split only, so the test split created earlier stays
# unseen and can be used to measure accuracy (the original fitted on the
# full dataset, leaving the split unused). n_estimators=100 builds an
# actual ensemble of voting trees; the original value of 1 produced a
# single tree.
clf = ensemble.RandomForestClassifier(max_depth=9, n_estimators=100, max_features=1)
clf.fit(X_train, y_train)
print("训练集准确率:", clf.score(X_train, y_train))
print("测试集准确率:", clf.score(X_test, y_test))

# Hand-written measurements, four values per row in the same column order
# as the training features; the predicted class index of each row is printed.
samples = [
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3.0, 1.4, 0.2],
    [5.7, 3.0, 4.2, 1.2],
    [5.7, 2.9, 4.2, 1.3],
    [5.7, 2.8, 4.1, 1.3],
    [6.3, 3.3, 6.0, 2.5],
    [5.8, 2.7, 5.1, 1.9],
    [6.2, 2.9, 4.3, 1.3],
    [5.1, 2.5, 3.0, 1.1],
    [7.1, 3.0, 5.9, 2.1],
    [6.3, 2.9, 5.6, 1.8],
    [6.5, 3.0, 5.8, 2.2],
    [4.7, 3.2, 1.3, 0.2],
    [4.6, 3.1, 1.5, 0.2],
]
print(clf.predict(samples))