随机森林乳腺癌案例
随机森林在乳腺癌数据上的调参
导入需要的库#
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
导入数据集,探索数据#
cancer = load_breast_cancer()
cancer.data.shape
(569, 30)
进行一次简单的建模,看看模型本身在数据集上的效果#
rfc = RandomForestClassifier(n_estimators=100, random_state=90)
score_pre = cross_val_score(rfc, cancer.data, cancer.target, cv=10).mean()
score_pre
0.9648809523809524
随机森林调整的第一步:无论如何先来调 n_estimators#
score_ = []
for i in range(0,200,10):
rfc = RandomForestClassifier(n_estimators=i+1, random_state=90)
score = cross_val_score(rfc, cancer.data, cancer.target, cv=10).mean()
score_.append(score)
查看 n_estimators(1-200)最高得分#
print(max(score_), (score_.index(max(score_)) * 10) + 1)
0.9631265664160402 71
nestimators 学习曲线#
plt.figure(figsize=(20, 8),dpi=80)
plt.plot(range(1,201,10), score_)
plt.show()
细化 n_estimators 取值#
est_range = [*range(65,75)]
est_refinement = []
for i in est_range:
rfc = RandomForestClassifier(n_estimators=i, random_state=90)
score = cross_val_score(rfc, cancer.data, cancer.target, cv=10).mean()
est_refinement.append(score)
print(max(est_refinement), est_range[est_refinement.index(max(est_refinement))])
0.9666353383458647 73
为网格搜索做准备,书写网格搜索的参数#
"""
param_grid = {'n_estimators':np.arange(0, 200, 10)}
param_grid = {'max_depth':np.arange(1, 20, 1)}
param_grid = {'max_leaf_nodes':np.arange(25,50,1)}
对于大型数据集,可以尝试从1000来构建,先输入1000,每100个叶子一个区间,再逐渐缩小范围
有一些参数是可以找到一个范围的,或者说我们知道他们的取值和随着他们的取值,模型的整体准确率会如何变化,这
样的参数我们就可以直接跑网格搜索
param_grid = {'criterion':['gini', 'entropy']}
param_grid = {'min_samples_split':np.arange(2, 2+20, 1)}
param_grid = {'min_samples_leaf':np.arange(1, 1+10, 1)}
param_grid = {'max_features':np.arange(5,30,1)}
"""
"\nparam_grid = {'n_estimators':np.arange(0, 200, 10)}\n \nparam_grid = {'max_depth':np.arange(1, 20, 1)}\n \nparam_grid = {'max_leaf_nodes':np.arange(25,50,1)}\n 对于大型数据集,可以尝试从1000来构建,先输入1000,每100个叶子一个区间,再逐渐缩小范围\n \n有一些参数是可以找到一个范围的,或者说我们知道他们的取值和随着他们的取值,模型的整体准确率会如何变化,这\n样的参数我们就可以直接跑网格搜索\nparam_grid = {'criterion':['gini', 'entropy']}\n \nparam_grid = {'min_samples_split':np.arange(2, 2+20, 1)}\n \nparam_grid = {'min_samples_leaf':np.arange(1, 1+10, 1)}\n \nparam_grid = {'max_features':np.arange(5,30,1)} \n \n"
开始按照参数对模型整体准确率的影响程度进行调参,首先调整 max_depth#
param_grid = {'max_depth':np.arange(1, 20, 1)}
rfc = RandomForestClassifier(n_estimators=73, random_state=90)
grid = GridSearchCV(rfc, param_grid,cv=10)
grid.fit(cancer.data, cancer.target)
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'max_depth': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19])})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-1" type="checkbox" ><label for="sk-estimator-id-1" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=10,
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'max_depth': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19])})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-2" type="checkbox" ><label for="sk-estimator-id-2" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-3" type="checkbox" ><label for="sk-estimator-id-3" class="sk-toggleable__label sk-toggleable__label-arrow">RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div></div></div></div></div></div></div></div>
# 查看最好模型参数
grid.best_params_
{'max_depth': 8}
# 查看最好的分
grid.best_score_
0.9666353383458647
param_grid = {'max_features':np.arange(5,30,1)}
rfc = RandomForestClassifier(n_estimators=73, random_state=90)
grid = GridSearchCV(rfc, param_grid,cv=10)
grid.fit(cancer.data, cancer.target)
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'max_features': array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29])})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-4" type="checkbox" ><label for="sk-estimator-id-4" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=10,
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'max_features': array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29])})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-5" type="checkbox" ><label for="sk-estimator-id-5" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-6" type="checkbox" ><label for="sk-estimator-id-6" class="sk-toggleable__label sk-toggleable__label-arrow">RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div></div></div></div></div></div></div></div>
# 查看最好模型参数
grid.best_params_
{'max_features': 24}
# 查看最好的分
grid.best_score_
0.9666666666666668
param_grid = {'min_samples_leaf':np.arange(1, 1+10, 1)}
rfc = RandomForestClassifier(n_estimators=73, random_state=90)
grid = GridSearchCV(rfc, param_grid,cv=10)
grid.fit(cancer.data, cancer.target)
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'min_samples_leaf': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-7" type="checkbox" ><label for="sk-estimator-id-7" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=10,
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'min_samples_leaf': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-8" type="checkbox" ><label for="sk-estimator-id-8" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-9" type="checkbox" ><label for="sk-estimator-id-9" class="sk-toggleable__label sk-toggleable__label-arrow">RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div></div></div></div></div></div></div></div>
# 查看最好模型参数
grid.best_params_
{'min_samples_leaf': 1}
# 查看最好的分
grid.best_score_
0.9666353383458647
param_grid = {'min_samples_split':np.arange(2, 2+20, 1)}
rfc = RandomForestClassifier(n_estimators=73, random_state=90)
grid = GridSearchCV(rfc, param_grid,cv=10)
grid.fit(cancer.data, cancer.target)
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'min_samples_split': array([ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21])})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-10" type="checkbox" ><label for="sk-estimator-id-10" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=10,
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'min_samples_split': array([ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21])})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-11" type="checkbox" ><label for="sk-estimator-id-11" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-12" type="checkbox" ><label for="sk-estimator-id-12" class="sk-toggleable__label sk-toggleable__label-arrow">RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div></div></div></div></div></div></div></div>
# 查看最好模型参数
grid.best_params_
{'min_samples_split': 2}
# 查看最好的分
grid.best_score_
0.9666353383458647
param_grid = {'criterion':['gini', 'entropy']}
rfc = RandomForestClassifier(n_estimators=73, random_state=90)
grid = GridSearchCV(rfc, param_grid,cv=10)
grid.fit(cancer.data, cancer.target)
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'criterion': ['gini', 'entropy']})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-13" type="checkbox" ><label for="sk-estimator-id-13" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=10,
estimator=RandomForestClassifier(n_estimators=73, random_state=90),
param_grid={'criterion': ['gini', 'entropy']})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-14" type="checkbox" ><label for="sk-estimator-id-14" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-15" type="checkbox" ><label for="sk-estimator-id-15" class="sk-toggleable__label sk-toggleable__label-arrow">RandomForestClassifier</label><div class="sk-toggleable__content"><pre>RandomForestClassifier(n_estimators=73, random_state=90)</pre></div></div></div></div></div></div></div></div></div></div>
# 查看最好模型参数
grid.best_params_
{'criterion': 'gini'}
# 查看最好的分
grid.best_score_
0.9666353383458647
rfc = RandomForestClassifier(n_estimators=73,
random_state=90,
max_features=24)
score = cross_val_score(rfc, cancer.data, cancer.target, cv=10).mean()
score
0.9666666666666668
score - score_pre
0.0017857142857143904
作者:Hovey
出处:https://www.cnblogs.com/thankcat/p/17294505.html
版权:本作品采用「署名-非商业性使用-相同方式共享 4.0 国际」许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?