随笔 - 119  文章 - 0  评论 - 5  阅读 - 57360

回归预测之随机森林——运行+调优

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
from sklearn.datasets import make_regression

随机森林

from sklearn.ensemble import RandomForestRegressor
X, y = make_regression(n_features=4, n_informative=2,
random_state=0, shuffle=False)
plt.plot(X)

image-20220218193641004

plt.plot(y)

image-20220218193702180

X_train=X[:70]
y_train=y[:70]
X_test=X[:30]
y_test=y[:30]
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
# 预测
# 测试集
y_pred=regr.predict(X_test)
print(y_pred)

[ 41.71152007 -15.51877479 18.77435453 2.4613485 -5.25163664
11.98242971 -28.99147231 67.82781115 -46.47813223 58.94403962
-44.43019803 -25.35127762 -27.46837011 -31.48276853 17.81715876
-25.42572978 -16.172543 -20.43062853 -20.84673413 -30.25425251
17.90104445 67.70073552 28.81417535 33.29761523 40.28058259
-22.61219493 34.50175346 68.835082 38.18859153 -6.48249831]

# 绘制y_test曲线
# 创建t变量
t = np.arange(len(X_test))
plt.plot(t, y_test, 'r', linewidth=2, label='真实值')

image-20220218193933860

# 绘制y_pred曲线
plt.plot(t, y_pred, 'g', linewidth=2, label='预测值')

image-20220218193955734

# 返回拟合优度the coefficient of determination
regr.score(X_test,y_test)

0.8338446596824768

# mse
# https://blog.csdn.net/xiaohutong1991/article/details/108178143?spm=1001.2101.3001.6650.11&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&utm_relevant_index=14
metrics.mean_squared_error(y_test, y_pred)

334.42748631188385

regr.set_params()

RandomForestRegressor(max_depth=2, random_state=0)

regr.feature_importances_

array([0.15597865, 0.84082089, 0. , 0.00320046])

调优——k折交叉验证,scikit-learn的网格搜索GridSearchCV

# param_grid = {"n_estimators":[5,10,50,100,200,500],"max_depth":[5,10,50,100,200,500]}
param_grid = {"n_estimators":[5,50,100],"max_depth":[8,9,10]}
# 调用scikit-learn的网格搜索,传入参数选择范围,并且制定随机森林回归算法,cv = 5表示5折交叉验证
grid_search = GridSearchCV(RandomForestRegressor(),param_grid,cv = 3)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})

y_pred=grid_search.predict(X_test)
print(y_pred)

[ 49.50191561 -0.7122897 15.26286215 17.50407347 15.87708862
-14.54908528 -13.32531612 80.64244515 -75.54860534 63.84753325
-68.76733049 -27.15074728 -34.90857798 -45.24935823 16.53953061
-25.26432862 -10.65729336 -18.79136562 -19.30815651 -38.14527267
6.93420609 88.31726657 16.87408796 34.57068077 53.79849864
-9.89424185 39.75832876 87.18227999 45.21303975 13.54728708]

plt.figure(figsize=(15, 10))
# 创建t变量
t = np.arange(len(X_test))
# 绘制y_test曲线
plt.plot(t, y_test, 'r', linewidth=2, label='真实值')
# 绘制y_hat曲线
plt.plot(t, y_pred, 'g', linewidth=2, label='预测值')
# 设置图例
plt.legend()
plt.show()

image-20220218194707744

# 拟合优度R2
print("r2:", grid_search.score(X_test, y_test))

r2: 0.9866915026043963

# 用Scikit_learn计算MSE
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

MSE: 26.786543978030636

grid_search.set_params()

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})

print(grid_search.best_params_)

{'max_depth': 9, 'n_estimators': 50}

调优——k折交叉验证+逐个参数

superpa = []
for i in range(10,200,10):
regr = RandomForestRegressor(n_estimators=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 评估指标
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*10+10)#输出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(10,200,10),superpa)#横纵坐标
plt.show()

image-20220218201944890

# max_depth的学习曲线
superpa = []
for i in range(10,30,2):
regr = RandomForestRegressor(n_estimators=170
,max_depth=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 评估指标
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*2+10)#输出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(10,30,2),superpa)#横纵坐标
plt.show()

image-20220218202016889

# min_samples_split的学习曲线,分割内部节点所需的最小样本数
superpa = []
for i in range(2,10,2):
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=i
,random_state=42
,n_jobs=-1)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 评估指标
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*2+2)#输出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(2,10,2),superpa)#横纵坐标
plt.show()

image-20220218202103612

# min_samples_leaf的学习曲线
superpa = []
for i in range(1,15,1):
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 评估指标
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*1+1)#输出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(1,15,1),superpa)#横纵坐标
plt.show()

image-20220218202133744

# max_features的学习曲线,每棵树用到的最大特征数
#from sklearn.model_selection import GridSearchCV
param_grid = {'max_features':np.arange(3, 11, 1)}
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=1
,random_state=42
)
GS = GridSearchCV(regr,param_grid,cv=10)
GS.fit(X_train, y_train)
print(GS.best_params_)
#print(GS.best_score_)

{'max_features': 4}

最终模型:

regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=1
,random_state=42
)
# 返回拟合优度the coefficient of determination
regr.score(X_test,y_test)

0.9879834206877871

# mse
metrics.mean_squared_error(y_test, y_pred)

24.275331856914285

regr.set_params()

RandomForestRegressor(max_depth=12, n_estimators=170, random_state=42)

regr.feature_importances_

array([0.21877631, 0.7564047 , 0.01129733, 0.01352166])

plt.figure(figsize=(15, 10))
# 创建t变量
t = np.arange(len(X_test))
# 绘制y_test曲线
plt.plot(t, y_test, 'r', linewidth=2, label='真实值')
# 绘制y_hat曲线
plt.plot(t, y_pred, 'g', linewidth=2, label='预测值')
# 设置图例
plt.legend()
plt.show()

image-20220218201858482

posted on   cookie的笔记簿  阅读(2803)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· Vue3状态管理终极指南:Pinia保姆级教程
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

点击右上角即可分享
微信分享提示