超参数调优——网格搜索、随机搜索和贝叶斯优化
在Python中,我们可以使用sklearn.model_selection.GridSearchCV进行网格搜索。这是一个用于系统地遍历多种参数组合,通过交叉验证确定最佳效果参数的方法。它的主要目的是通过网格搜索在指定的值集合中找到最优的模型参数。
以下是一个使用XGBoost分类器的网格搜索示例:
# Grid search: evaluate every XGBoost parameter combination on the iris
# data set with 3-fold cross-validation and report the best estimator.
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the data set.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with default settings.
model = xgb.XGBClassifier()

# Candidate values per hyperparameter: 3 * 3 * 3 * 3 = 81 combinations.
param_lst = {
    "max_depth": [3, 5, 7],
    "min_child_weight": [1, 3, 6],
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
}

# Build the grid-search object (n_jobs=-1 runs the fits in parallel).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_lst,
    cv=3,
    verbose=10,
    n_jobs=-1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Show the estimator refitted with the best parameter combination.
print(grid_search.best_estimator_)
输出:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=3, max_leaves=None,
min_child_weight=1, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=200, n_jobs=None,
num_parallel_tree=None, objective='multi:softprob', ...)
如果使用随机搜索,代码如下:
# Random search: sample XGBoost parameter combinations on the iris data
# set with 3-fold cross-validation and report the best estimator.
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Load the data set.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with default settings.
model = xgb.XGBClassifier()

# Candidate values per hyperparameter; combinations are sampled from these.
param_lst = {
    "max_depth": [3, 5, 7],
    "min_child_weight": [1, 3, 6],
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
}

# Build the random-search object. n_iter is not passed, so scikit-learn's
# default number of sampled candidates applies; random_state is not passed
# either, so the sampled candidates can differ between runs.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_lst,
    cv=3,
    verbose=10,
    n_jobs=-1,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)

# Show the estimator refitted with the best sampled combination.
print(random_search.best_estimator_)
输出:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=1, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=300, n_jobs=None,
num_parallel_tree=None, objective='multi:softprob', ...)
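结果和上面网格搜索(GridSearchCV)的不一样。这是正常的:网格搜索会遍历全部 3×3×3×3=81 种参数组合,而 RandomizedSearchCV 默认只随机抽取 n_iter=10 种组合进行评估,并且上面的代码没有设置 random_state,所以每次运行抽到的组合都可能不同,最终选出的“最优”参数也就可能与网格搜索不一致。如果希望结果可复现,可以给 RandomizedSearchCV 传入 random_state;如果希望结果更接近网格搜索,可以增大 n_iter。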
使用贝叶斯优化的代码:
# Bayesian optimization: search XGBoost hyperparameters on the iris data
# set with scikit-optimize's BayesSearchCV and 3-fold cross-validation.
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

# Load the data set.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with default settings.
model = xgb.XGBClassifier()

# Search space: two parameters restricted to a fixed set of choices,
# one integer range and one real-valued range.
param_lst = {
    "max_depth": Categorical([3, 5, 7]),
    "min_child_weight": Categorical([1, 3, 6]),
    "n_estimators": Integer(100, 300),
    "learning_rate": Real(0.01, 0.1),
}

# Build the Bayesian-optimization search object.
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_lst,
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split.
bayes_search.fit(X_train, y_train)

# Show the estimator refitted with the best parameters found.
print(bayes_search.best_estimator_)
上述代码中,max_depth 和 min_child_weight 取的是离散的值(Categorical)。如果希望所有参数都在连续的取值范围内搜索,可以直接用 (下界, 上界) 形式的元组来表示范围,代码修改为:
# Bayesian optimization over value ranges: every hyperparameter is given
# as a (low, high) tuple instead of an explicit list of choices.
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV

# Load the data set.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with default settings.
model = xgb.XGBClassifier()

# Search space written as (lower bound, upper bound) tuples, which
# BayesSearchCV accepts as a shorthand for a range dimension.
param_lst = {
    "max_depth": (3, 7),
    "min_child_weight": (1, 6),
    "n_estimators": (100, 300),
    "learning_rate": (0.01, 0.1),
}

# Build the Bayesian-optimization search object.
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_lst,
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split.
bayes_search.fit(X_train, y_train)

# Show the estimator refitted with the best parameters found.
print(bayes_search.best_estimator_)