Machine Learning Notes: Model Fusion with Stacking

I've recently been working on the 2022 DCIC competition (energy big data track: identifying virtual-currency mining behavior), so I took some time to study model fusion. These notes record what I learned.

I. Principle

Stacking trains a set of diverse base models and then fits a second-level (meta) model on their predictions. To avoid label leakage, the meta-features for the training set are produced with K-fold cross-validation: each base model predicts only the held-out fold it was not trained on, so every training sample receives an out-of-fold prediction. For the test set, the predictions of the K fold models are averaged. The resulting prediction matrix, one column per base model, becomes the input to a simple top-level classifier such as logistic regression.

II. Hands-on

1. Prepare the data

## Prepare the data
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.ensemble import ExtraTreesClassifier as ET
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import AdaBoostClassifier as ADA
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 6,000 synthetic samples, binary target (no fixed seed, so exact scores will vary between runs)
x, y = make_classification(n_samples=6000)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
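make_classification produces balanced classes by default, but real competition data (like the mining-behavior set) is usually skewed. A minimal variant of the split that preserves class ratios and fixes the seed; stratify and random_state are my additions here, so it will not reproduce the exact scores quoted below:

print(np.bincount(y))  # inspect the class balance first
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42)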

2. Base models (first level)

## First-level (base) models
clfs = [GBDT(n_estimators=100),
        RF(n_estimators=100),
        ET(n_estimators=100),
        ADA(n_estimators=100),
        lgb.LGBMClassifier(n_estimators=100, metric='binary_logloss'),
        # eval_metric in the constructor requires a recent xgboost (>= 1.6)
        xgb.XGBClassifier(n_estimators=100, eval_metric=['logloss','auc','error']),
        ctb.CatBoostClassifier(n_estimators=100, verbose=100)]
# One meta-feature column per base model: out-of-fold predictions for the
# training set, fold-averaged predictions for the test set
x_train_stack = np.zeros((x_train.shape[0], len(clfs)))
x_test_stack = np.zeros((x_test.shape[0], len(clfs)))

3. Cross-validation

## 5-fold stacking
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds,
                      shuffle=True,
                      random_state=42)
for i, clf in enumerate(clfs):
    print("Classifier: {}".format(clf))
    x_stack_test_n = np.zeros((x_test.shape[0], n_folds))  # one column per fold model
    for j, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):
        train_x = x_train[train_index]
        train_y = y_train[train_index]
        val_x = x_train[val_index]
        val_y = y_train[val_index]
        clf.fit(train_x, train_y)
        # Out-of-fold prediction: each training sample is predicted by the
        # fold model that never saw it, which prevents label leakage
        x_train_stack[val_index, i] = clf.predict_proba(val_x)[:, 1]
        x_stack_test_n[:, j] = clf.predict_proba(x_test)[:, 1]
    # Average the n_folds test predictions into one meta-feature column
    x_test_stack[:, i] = x_stack_test_n.mean(axis=1)
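At this point each column of x_train_stack holds one base model's out-of-fold predictions, and the matching column of x_test_stack its fold-averaged test predictions. A quick sanity check on the shapes (the numbers follow from the 70/30 split of 6,000 samples):

print(x_train_stack.shape)  # (4200, 7): one meta-feature column per base model
print(x_test_stack.shape)   # (1800, 7)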

4. Top-level model (second level)

## Second-level model: logistic regression
clf_second = LogisticRegression(solver='lbfgs')
clf_second.fit(x_train_stack, y_train)
pred = clf_second.predict_proba(x_test_stack)[:, 1]
roc_auc_score(y_test, pred) # 0.9754351517396808
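For reference, scikit-learn also ships a built-in StackingClassifier that implements essentially this procedure: out-of-fold predictions as meta-features, a final estimator on top. A minimal sketch reusing the clfs list from above; note one design difference, sklearn refits each base model on the full training set for test-time predictions rather than averaging the five fold models:

from sklearn.ensemble import StackingClassifier

stack_clf = StackingClassifier(
    estimators=[('m{}'.format(i), c) for i, c in enumerate(clfs)],
    final_estimator=LogisticRegression(solver='lbfgs'),
    cv=5,                          # internal 5-fold CV, like the manual loop
    stack_method='predict_proba')  # pass class probabilities to the meta-model
stack_clf.fit(x_train, y_train)
roc_auc_score(y_test, stack_clf.predict_proba(x_test)[:, 1])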

5. Base-model scores

## GBDT classifier
clf_1 = clfs[0]
clf_1.fit(x_train, y_train)
pred_1 = clf_1.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_1) # 0.975638403509552

## Random forest classifier
clf_2 = clfs[1]
clf_2.fit(x_train, y_train)
pred_2 = clf_2.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_2) # 0.9734220848833361

## ExtraTrees classifier
clf_3 = clfs[2]
clf_3.fit(x_train, y_train)
pred_3 = clf_3.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_3) # 0.9757149773489643

## AdaBoost classifier
clf_4 = clfs[3]
clf_4.fit(x_train, y_train)
pred_4 = clf_4.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_4) # 0.9696699667521329

## LightGBM classifier
clf_5 = clfs[4]
clf_5.fit(x_train, y_train)
pred_5 = clf_5.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_5) # 0.9744694513908185

## XGBoost classifier
clf_6 = clfs[5]
clf_6.fit(x_train, y_train)
pred_6 = clf_6.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_6) # 0.9716007532956685

## CatBoost classifier
clf_7 = clfs[6]
clf_7.fit(x_train, y_train)
pred_7 = clf_7.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred_7) # 0.9742521070667779
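The seven blocks above differ only in the model, so the same comparison can be written as a loop; a compact equivalent sketch:

for clf in clfs:
    clf.fit(x_train, y_train)
    p = clf.predict_proba(x_test)[:, 1]
    print(type(clf).__name__, roc_auc_score(y_test, p))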

6. Visual comparison of results

  • Merge the scores
model_name = ['GBDT', 'RandomForest', 'ExtraTrees', 'AdaBoost', 'LGB', 'XGB', 'CatBoost', 'Stacking']
model_score = [roc_auc_score(y_test, pred_1), 
               roc_auc_score(y_test, pred_2), 
               roc_auc_score(y_test, pred_3),
               roc_auc_score(y_test, pred_4),
               roc_auc_score(y_test, pred_5),
               roc_auc_score(y_test, pred_6),
               roc_auc_score(y_test, pred_7),
               roc_auc_score(y_test, pred)]   
  • Vertical bar chart (not great)
## Vertical bar chart
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(x=model_name, 
       height=model_score,
       width=0.5,
       align='center',
       color='grey')
ax.set_title("Score Of Model", fontsize=15)
xticks = ax.get_xticks()
for i in range(len(model_score)): # the stacked digits look terrible!
    xy = (xticks[i], model_score[i]*1.03)
    s = str(model_score[i])
    # Pass the label positionally: the old `s=` keyword was removed in matplotlib 3.6
    ax.annotate(s, xy)
  • Horizontal bar chart
## Horizontal bar chart
fig, ax = plt.subplots(figsize=(10, 5))
b = ax.barh(model_name, 
       model_score,
       # color='#6699CC',
       height=0.5)
ax.set_title("Score Of Model", fontsize=20)
ax.set_xlim(0.95, 0.98)
for rect in b:
    w = rect.get_width()
    ax.text(w, 
            rect.get_y() + rect.get_height()/2, 
            '%.5f' % w,
            ha='left',
            va='center',
            fontsize=14)

III. Caveats

  • The base models on the first level should be strong learners; the top-level model can be a simple classifier, which guards against overfitting
  • Don't use too few base models: the number of base models is exactly the feature dimension seen by the top-level model
  • The base models must be "accurate but diverse" so they act like voters in an ensemble (see the sketch after this list)
  • Don't include any base model whose standalone performance is poor
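
One quick way to check the "accurate but diverse" point is to measure how correlated the base models' out-of-fold predictions are: near-perfectly correlated columns add little new information for the meta-model. A sketch using the x_train_stack matrix and model_name list built above:

corr = pd.DataFrame(x_train_stack, columns=model_name[:-1]).corr()
print(corr.round(3))  # values close to 1.0 flag redundant base models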

Reference: Kaggle提升模型性能的超强杀招Stacking——机器学习模型融合 (a Chinese write-up on stacking for model fusion)
