集成学习之随机森林分类器

随机森林使用多个决策树分类器共同进行预测,并通过投票决定最终结果:得票最多的类别即为待预测样本所属的类别。其余不再赘述,代码如下:

# -*- coding:utf-8 -*-
import numpy as np
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import accuracy_score

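# How to load a local file as the training data instead of the bundled dataset
# (assumes 'iris.data' has a header row naming the columns sl, sw, pl, pw, type
# -- TODO confirm against the actual file):
#data = pd.read_csv('iris.data')
#print(data.head(5))

#attributes=data[['sl','sw','pl','pw']]  # first four columns: the features, abbreviated sl, sw, pl, pw
#types=data['type']  # fifth column: the iris species label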

# Load the iris dataset bundled with scikit-learn and pull out the feature
# matrix, the class labels, and the names of the feature columns and classes.
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
feature_names = iris.feature_names
# Fix: class names come from iris.target_names; the original assigned
# iris.feature_names here, duplicating the column names instead.
target_names = iris.target_names

#print(iris_X[:4,:])

# Hold out 30% of the samples for testing; the rest is the training split.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.3,
)

# Print the shape of each split as a quick sanity check.
for caption, split in (
    ("\n训练集样本大小:", X_train),
    ("训练集标签大小:", y_train),
    ("测试集样本大小:", X_test),
    ("测试集标签大小:", y_test),
):
    print(caption, split.shape)

# Optional experiment (disabled): 10-fold cross-validation on the training
# split, sweeping max_depth from 1 to 19 and plotting the mean accuracy so a
# well-generalising setting can be read off the curve.
#
# cv_scores = []
# depth_range = range(1, 20)
# for n in depth_range:
#     clf = ensemble.RandomForestClassifier(max_depth=n, n_estimators=1, max_features=1)
#     scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
#     cv_scores.append(scores.mean())
# plt.plot(depth_range, cv_scores)
# plt.xlabel('max_depth')
# plt.ylabel('Accuracy')
# plt.show()

# Alternative (disabled): fit on the training split only and evaluate on the
# held-out test split.
#
# clf = ensemble.RandomForestClassifier(max_depth=5, n_estimators=1, max_features=1)
# clf.fit(X_train, y_train)
# print(clf.predict(X_test))
# print(clf.score(X_test, y_test))

# Final model, fitted on the complete iris dataset.
# Fix: in the original these statements sat on one physical line behind a '#',
# so the model was never actually built, fitted or used.
# NOTE(review): n_estimators=1 builds a single tree, so no voting between
# trees takes place -- raise it if a real ensemble is wanted.
# NOTE(review): no random_state is set, so results can differ between runs.
clf = ensemble.RandomForestClassifier(max_depth=9, n_estimators=1, max_features=1)
clf.fit(iris_X, iris_y)
# print(clf.score(X_train, y_train))
# print(clf.score(X_test, y_test))

# Hand-written measurements to classify: one row per flower, four feature
# values per row, in the same column order as iris_X.
samples = [
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3.0, 1.4, 0.2],
    [5.7, 3.0, 4.2, 1.2],
    [5.7, 2.9, 4.2, 1.3],
    [5.7, 2.8, 4.1, 1.3],
    [6.3, 3.3, 6.0, 2.5],
    [5.8, 2.7, 5.1, 1.9],
    [6.2, 2.9, 4.3, 1.3],
    [5.1, 2.5, 3.0, 1.1],
    [7.1, 3.0, 5.9, 2.1],
    [6.3, 2.9, 5.6, 1.8],
    [6.5, 3.0, 5.8, 2.2],
    [4.7, 3.2, 1.3, 0.2],
    [4.6, 3.1, 1.5, 0.2],
]
# Print the predicted class index for every sample row.
print(clf.predict(samples))

 

posted @ 2020-03-31 15:03  站在云端看世界  阅读(327)  评论(0)  编辑  收藏  举报