建模
- 使用随机森林加网格搜索调参,调节5个参数:n_estimators max_depth min_samples_split min_samples_leaf max_features
- 第一种: 单个参数调参
- 第二种: 5个参数一起调参
- 想法:想看看哪种方法结果好些
- 几十w数据这样调参时间相当久。谨慎啊。
- 目前结果还未跑出来。以下假设已取得最佳参数,仅是为了代码完整性所添加,因为具体最优参数还未跑出结果。
# Load the feature-engineered train / test-A data produced upstream.
train_for_model = pd.read_csv('train_for_model.csv')
testa_for_model = pd.read_csv('testa_for_model.csv')
# Drop columns that take no part in training:
#   - the two raw date columns (issueDate, earliesCreditLine) — excluded for now;
#   - the row identifier (id), which carries no predictive signal.
# One consolidated drop per frame replaces the original's patch-on-patch
# drops ("forgot to drop id, adding it" in the original comments).
drop_cols = ['issueDate', 'earliesCreditLine', 'id']
train_for_model.drop(columns=drop_cols, inplace=True)
testa_for_model.drop(columns=drop_cols, inplace=True)
# Split the target label off the training frame.
y = train_for_model.isDefault
train_for_model_x = train_for_model.drop(columns=['isDefault'])
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# 处理nan:若不处理,nan 会导致后续训练报错
def nan_to_null(x):
    """Map a missing value to the sentinel -1; pass everything else through.

    Uses ``pd.isna`` instead of ``np.isnan`` so that ``None`` (and ``pd.NA``)
    are also caught — ``np.isnan(None)`` raises TypeError, whereas this
    version handles any scalar. Behavior on plain floats is unchanged.
    """
    return -1 if pd.isna(x) else x
# Replace missing employmentLength values with the sentinel -1.
# Vectorized fillna gives the same result as the element-wise apply of
# nan_to_null on this numeric column, but is far faster on 100k+ rows.
for data in [train_for_model_x, testa_for_model]:
    data['employmentLength'] = data['employmentLength'].fillna(-1)
# DataFrame.info() prints its report itself and returns None — calling it
# directly avoids the stray "None" the original print() wrapper emitted.
testa_for_model.info()
train_for_model_x.info()
# Hold out 10% of the training data as a validation split.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_for_model_x, y, test_size=0.1, random_state=10)
# Step 1: tune n_estimators alone, with the other hyper-parameters
# fixed at reasonable starting guesses.
base_rf = RandomForestClassifier(min_samples_split=100,
                                 min_samples_leaf=20,
                                 max_depth=8,
                                 max_features='sqrt',
                                 random_state=10)
param_test1 = {'n_estimators': [50, 120, 160, 200, 250]}
gsearch1 = GridSearchCV(estimator=base_rf, param_grid=param_test1,
                        scoring='roc_auc', cv=5)
gsearch1.fit(x_train, y_train)
print(gsearch1.best_params_, gsearch1.best_score_)
# Step 2: tune max_depth with n_estimators fixed (placeholder 50 until the
# step-1 search finishes).
# FIX: dropped iid=False — the iid parameter was deprecated in scikit-learn
# 0.22 and removed in 0.24; passing it raises TypeError on modern versions.
param_test2 = {'max_depth': [10, 15, 20, 30]}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=50,
                                     min_samples_leaf=20,
                                     max_features='sqrt',
                                     oob_score=True,
                                     random_state=10),
    param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(x_train, y_train)
print(gsearch2.best_params_, gsearch2.best_score_)
# Step 3: tune min_samples_split and min_samples_leaf together
# (earlier winners are placeholders until those searches finish).
# FIX: dropped iid=False — removed in scikit-learn 0.24 (TypeError if passed).
param_test3 = {'min_samples_split': [50, 100, 200, 400],
               'min_samples_leaf': [20, 30, 40, 50, 100]}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=50,
                                     max_depth=2,
                                     max_features='sqrt',
                                     oob_score=True,
                                     random_state=10),
    param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(x_train, y_train)
print(gsearch3.best_params_, gsearch3.best_score_)
# Step 4: tune max_features last, with all other parameters fixed at
# their (placeholder) step winners.
# FIX: dropped iid=False — removed in scikit-learn 0.24 (TypeError if passed).
param_test4 = {'max_features': [10, 20, 30, 35]}
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=50,
                                     max_depth=2,
                                     min_samples_split=80,
                                     min_samples_leaf=10,
                                     oob_score=True,
                                     random_state=10),
    param_grid=param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(x_train, y_train)
print(gsearch4.best_params_, gsearch4.best_score_)
# Refit a single forest with the best parameters found by the sequential
# searches above (values here are placeholders — the searches have not
# finished yet, per the notes at the top).
rf2 = RandomForestClassifier(n_estimators=50,
                             max_depth=2,
                             min_samples_split=80,
                             min_samples_leaf=10,
                             max_features=3,
                             oob_score=True,
                             random_state=10)
rf2.fit(x_train, y_train)
# Out-of-bag accuracy of the refitted model.
print(rf2.oob_score_)
# ROC-AUC on the held-out validation split (positive-class probabilities).
y_predprob = rf2.predict_proba(x_valid)[:, 1]
print("AUC Score (valid): %f" % roc_auc_score(y_valid, y_predprob))
联合调参(第二种方法:5个参数一起网格搜索)
# Joint grid search over all five hyper-parameters at once
# (5*4*4*5*4 = 1600 combinations x 5 folds — very expensive on several
# hundred thousand rows, as the warning at the top of the file notes).
# Renamed from param_test4/gsearch4 to stop shadowing the step-4 objects.
param_grid_all = {
    'n_estimators': [50, 120, 160, 200, 250],
    'max_depth': [10, 15, 20, 30],
    'min_samples_split': [50, 100, 200, 400],
    'min_samples_leaf': [20, 30, 40, 50, 100],
    'max_features': [10, 20, 30, 35],
}
# FIX: dropped iid=False (removed in scikit-learn 0.24 — TypeError if passed).
# random_state=10 added for reproducibility, matching every other estimator
# in this file; n_jobs=-1 parallelizes the large grid without changing results.
gsearch_all = GridSearchCV(
    estimator=RandomForestClassifier(random_state=10),
    param_grid=param_grid_all, scoring='roc_auc', cv=5, n_jobs=-1)
gsearch_all.fit(x_train, y_train)
print(gsearch_all.best_params_, gsearch_all.best_score_)
# ROC-AUC of the refitted best estimator on the held-out validation split
# (GridSearchCV delegates predict_proba to best_estimator_).
y_predprob = gsearch_all.predict_proba(x_valid)[:, 1]
print("AUC Score (valid): %f" % roc_auc_score(y_valid, y_predprob))