# Random forest implementation (Otto dataset classification) — 随机森林代码实现(奥拓数据分类)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def _tune_and_plot(param_name, candidates, fixed_params,
                   x_train, y_train, x_test, y_test):
    """Sweep one random-forest hyper-parameter and plot the results.

    For every value in ``candidates`` a ``RandomForestClassifier`` is fitted
    on the training split with ``param_name`` set to that value and the
    remaining hyper-parameters taken from ``fixed_params``. The out-of-bag
    score and the test-set log loss of each fit are recorded and drawn side
    by side against the candidate values.

    Args:
        param_name: Name of the ``RandomForestClassifier`` keyword being tuned.
        candidates: Sized iterable of values to try for ``param_name``.
        fixed_params: Dict of the other hyper-parameters, held constant.
        x_train, y_train: Training features and integer-encoded labels.
        x_test, y_test: Test features and test labels in a form accepted by
            ``log_loss`` (this script passes the one-hot encoded matrix).

    Returns:
        Tuple ``(accuracy_t, error_t)`` of numpy arrays, one entry per
        candidate: the out-of-bag score and the test log loss.
    """
    accuracy_t = np.zeros(len(candidates))
    error_t = np.zeros(len(candidates))

    for i, value in enumerate(candidates):
        params = dict(fixed_params)
        params[param_name] = value
        model = RandomForestClassifier(oob_score=True,
                                       random_state=0,
                                       n_jobs=-1,
                                       **params)
        model.fit(x_train, y_train)
        # Out-of-bag score serves as the accuracy estimate.
        accuracy_t[i] = model.oob_score_
        # Log loss is computed from class probabilities, not hard labels.
        error_t[i] = log_loss(y_test, model.predict_proba(x_test))

    # Visualise the sweep: accuracy on the left, log loss on the right.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
    axes[0].plot(candidates, accuracy_t)
    axes[1].plot(candidates, error_t)
    axes[0].set_xlabel(param_name)
    axes[0].set_ylabel("accuracy_t")
    axes[1].set_xlabel(param_name)
    axes[1].set_ylabel("error_t")
    axes[0].grid()
    axes[1].grid()
    # Needed for the figure to appear when run as a plain script.
    plt.show()

    return accuracy_t, error_t


# ---------------------------------------------------------------------------
# Load the data and inspect the class distribution
# ---------------------------------------------------------------------------
data = pd.read_csv("./data/train.csv")
print(data.head())

# Keyword form: recent seaborn releases no longer accept the data vector
# as a positional argument.
sns.countplot(x=data["target"])
plt.show()

# Random under-sampling needs the features and the labels separated first.
y = data["target"]
x = data.drop(["id", "target"], axis=1)

# NOTE: neither the sampler nor the train/test split below is seeded, so the
# numbers recorded in the comments further down vary from run to run.
rus = RandomUnderSampler()
x_resampled, y_resampled = rus.fit_resample(x, y)

sns.countplot(x=y_resampled)
plt.show()

# Convert the string labels into integer codes.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# Split the data set.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2)

# ---------------------------------------------------------------------------
# Baseline model
# ---------------------------------------------------------------------------
estimator = RandomForestClassifier(oob_score=True)
estimator.fit(x_train, y_train)
y_pre = estimator.predict(x_test)

# One-hot encode labels and hard predictions with ONE encoder fitted on every
# label, so both matrices share the same columns even if a class is absent
# from the predictions or from the test split. The default sparse output is
# densified with ``toarray()`` instead of relying on the ``sparse`` /
# ``sparse_output`` keyword, whose name differs between scikit-learn versions.
one_hot = OneHotEncoder()
one_hot.fit(y_resampled.reshape(-1, 1))
y_pre = one_hot.transform(y_pre.reshape(-1, 1)).toarray()
y_test = one_hot.transform(y_test.reshape(-1, 1)).toarray()

# Log loss on hard 0/1 predictions is heavily penalised; shown for contrast
# with the probability-based value below. ``eps``/``normalize`` are left at
# their defaults because newer scikit-learn releases dropped ``eps``.
hard_label_loss = log_loss(y_test, y_pre)
print("log loss (hard predictions):", hard_label_loss)
# 7.637713870225003 (value recorded in the original run)

y_pre_proba = estimator.predict_proba(x_test)
proba_loss = log_loss(y_test, y_pre_proba)
print("log loss (predicted probabilities):", proba_loss)
# 0.7611795612521034 (value recorded in the original run)

# ---------------------------------------------------------------------------
# Hyper-parameter tuning, one parameter at a time
# ---------------------------------------------------------------------------
# Candidate range for n_estimators.
tuned_parameters = range(10, 200, 10)
accuracy_t, error_t = _tune_and_plot(
    "n_estimators", tuned_parameters,
    {"max_depth": 10, "max_features": 10, "min_samples_leaf": 10},
    x_train, y_train, x_test, y_test)

# Candidate range for max_depth (n_estimators fixed at 175 by the author).
tuned_parameters = range(10, 100, 10)
accuracy_t, error_t = _tune_and_plot(
    "max_depth", tuned_parameters,
    {"n_estimators": 175, "max_features": 10, "min_samples_leaf": 10},
    x_train, y_train, x_test, y_test)

# Candidate range for max_features (max_depth fixed at 30 by the author).
tuned_parameters = range(5, 40, 5)
accuracy_t, error_t = _tune_and_plot(
    "max_features", tuned_parameters,
    {"n_estimators": 175, "max_depth": 30, "min_samples_leaf": 10},
    x_train, y_train, x_test, y_test)

# Candidate range for min_samples_leaf (max_features fixed at 15 by the author).
tuned_parameters = range(1, 10, 2)
accuracy_t, error_t = _tune_and_plot(
    "min_samples_leaf", tuned_parameters,
    {"n_estimators": 175, "max_depth": 30, "max_features": 15},
    x_train, y_train, x_test, y_test)

# ---------------------------------------------------------------------------
# Final model with the selected hyper-parameters
# ---------------------------------------------------------------------------
estimator = RandomForestClassifier(n_estimators=175,
                                   max_depth=30,
                                   max_features=15,
                                   min_samples_leaf=1,
                                   oob_score=True,
                                   random_state=0,
                                   n_jobs=-1)
estimator.fit(x_train, y_train)
y_pre_proba = estimator.predict_proba(x_test)
final_loss = log_loss(y_test, y_pre_proba)
print("log loss (tuned model):", final_loss)
# 0.7413651159154644 (value recorded in the original run)