Credit Card Fraud Detection (Kaggle)
Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
Data overview
The dataset contains transactions made with credit cards in September 2013 by European cardholders.
The transactions span two days: out of 284,807 transactions there are 492 frauds. The dataset is highly imbalanced; the positive class (fraud) accounts for 0.172% of all transactions.
It contains only numerical input variables, which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, the original features and more background information about the data cannot be provided. Features V1, V2, ..., V28 are the principal components obtained with PCA; the only features not transformed with PCA are Time and Amount. Time holds the seconds elapsed between each transaction and the first transaction in the dataset. Amount is the transaction amount and can be used for example-dependent cost-sensitive learning. Class is the response variable: it takes the value 1 in case of fraud and 0 otherwise.
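A quick sanity check on those numbers (the file path here is a placeholder for wherever creditcard.csv is unpacked):

import pandas as pd

creditcard = pd.read_csv('creditcard.csv')  # placeholder path
print(creditcard['Class'].value_counts())   # 0: 284315, 1: 492
print(creditcard['Class'].mean())           # ~0.00173, i.e. 0.172% fraud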
The task: identify fraudulent credit card transactions.
Given the class imbalance ratio, accuracy is best measured with the Area Under the Precision-Recall Curve (AUPRC); plain confusion-matrix accuracy is not meaningful for imbalanced classification.
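For reference, a minimal sketch of that metric with scikit-learn; the argument names are placeholders, not variables from the listings below:

from sklearn.metrics import average_precision_score, precision_recall_curve

def auprc(y_true, y_score):
    """Area under the precision-recall curve. average_precision_score is
    scikit-learn's step-wise estimate of it; y_true are the 0/1 labels and
    y_score the predicted fraud probabilities."""
    return average_precision_score(y_true, y_score)

# precision_recall_curve(y_true, y_score) returns the points for plotting the PR curve itself.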
There is a lot more to say about why the PR curve is the right choice; I will fill that in later. For now, the code.
First, the logistic regression version:

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 18 17:22:54 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import pycard as pc  # assumption: the binning/WOE helpers below (NumBin, WoeDf, cross_woe) come from the pycard package; the original listing never shows this import
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # so CJK characters render in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')
creditcard.info()
creditcard.isnull().sum()
creditcard.corr().to_excel('tmp1.xlsx')

#%% The usual first check
creditcard.Class.mean()  # the target is extremely skewed: ~280k rows, 0.172% positives

#%% IV values of the numeric variables
num_col = list(creditcard.columns)[1:-1]  # V1..V28 plus Amount
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin()
for i in num_col:
    clf.fit(creditcard[i], creditcard.Class)
    #clf.generate_transform_fun()
    num_iv_woedf.append(clf.woe_df_)
num_iv_woedf.to_excel('tmp2')

# Drop the low-IV variables: V13 V15 V22 V24 V25 V26
num_col = [i for i in num_col if i not in ['V13', 'V15', 'V22', 'V24', 'V25', 'V26']]
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin()
for i in num_col:
    clf.fit(creditcard[i], creditcard.Class)
    creditcard[i+'_bin'] = clf.transform(creditcard[i])  # binning here saves a separate _bin conversion step later
    num_iv_woedf.append(clf.woe_df_)

#%% WOE transformation
bin_col = [i for i in list(creditcard.columns) if i[-4:] == '_bin']
cate_iv_woedf = pc.WoeDf()
for i in bin_col:
    cate_iv_woedf.append(pc.cross_woe(creditcard[i], creditcard.Class))
cate_iv_woedf.to_excel('tmp1')
cate_iv_woedf.bin2woe(creditcard, bin_col)

#%% Modeling
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

model_col = [i for i in list(creditcard.columns) if i[-4:] == '_woe']
X = creditcard[model_col]
Y = creditcard['Class']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

X1 = sm.add_constant(x_train)  # prepend a column of ones so the regression has an intercept
logit = sm.Logit(y_train.astype(float), X1.astype(float))
result = logit.fit()
result.summary()
result.params

resu_1 = result.predict(X1.astype(float))
fpr, tpr, threshold = roc_curve(y_train, resu_1)
rocauc = auc(fpr, tpr)  # 0.9693313248601317
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()

# Confusion matrix on the training set, cutting probabilities at 0.5
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
resu_1 = resu_1.apply(lambda x: 1 if x >= 0.5 else 0)
matrix = confusion_matrix(y_train, resu_1)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_train, resu_1))
print("Recall:", recall_score(y_train, resu_1))
print("F1 score:", f1_score(y_train, resu_1))
'''
Confusion matrix:
 [[198985     29]
 [    73    277]]
Precision: 0.9052287581699346
Recall: 0.7914285714285715
F1 score: 0.8445121951219513
'''

#%% Test set
X3 = sm.add_constant(x_test)
resu = result.predict(X3.astype(float))
fpr, tpr, threshold = roc_curve(y_test, resu)
rocauc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()

# Confusion matrix on the test set
resu = resu.apply(lambda x: 1 if x >= 0.5 else 0)
matrix = confusion_matrix(y_test, resu)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_test, resu))
print("Recall:", recall_score(y_test, resu))
print("F1 score:", f1_score(y_test, resu))
'''
Confusion matrix:
 [[85275    26]
 [   40   102]]
Precision: 0.796875
Recall: 0.7183098591549296
F1 score: 0.7555555555555555
'''

#%% Try a custom metric: weighted TPR at three low FPR levels
def tpr_weight_function(y_true, y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    # TPR at the points where FPR is closest to 0.1%, 0.5% and 1%
    TR1 = pCumsumPer[abs(nCumsumPer - 0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer - 0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer - 0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

tpr_weight_function(y_train, resu_1)  # 0.8754285714285714
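One caveat on the block above: the 0.5 cutoff is arbitrary, and on data this imbalanced precision and recall move a lot with the threshold. A minimal sketch of picking a cutoff by F1 on the training predictions (variable names follow the listing above; the recomputation of probabilities is needed because resu_1 was overwritten with hard labels):

import numpy as np
from sklearn.metrics import f1_score

probs = result.predict(X1.astype(float))  # raw probabilities, before the 0.5 cut
thresholds = np.linspace(0.1, 0.9, 81)
f1s = [f1_score(y_train, (probs >= t).astype(int)) for t in thresholds]
best_t = thresholds[int(np.argmax(f1s))]
print('best threshold by F1:', best_t)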
Next, the XGBoost model:

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 19:47:40 2021
@author: Administrator
"""
#%% Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  # feature-importance plotting helper
#from imblearn.ensemble import EasyEnsemble  # module not installed yet
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib  # has moved; import joblib directly now
import joblib
from sklearn.metrics import auc, roc_curve  # classification metrics
plt.rc('font', family='SimHei', size=13)  # so CJK characters render in plots
%matplotlib inline

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')

#%%
train_y = creditcard[['Class']]
train_y.columns = ['y']
train_x = creditcard.drop(['Class', 'Time'], axis=1)

file_xgboost_model = './xgboost_model'                     # model file
file_xgboost_columns = './columns.csv'                     # final features used
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'   # model AUC and KS plots
file_xgboost_model_score = './xgboost_model_score.png'     # score distribution of predictions
file_xgboost_model_prob = './xgboost_model_prob.png'       # probability distribution of predictions

#%%
def create_feature_map(features):
    """Write an XGBoost feature map: one 'index<TAB>name<TAB>type' line per
    feature, where type 'q' marks the feature as quantitative."""
    outfile = open('xgb.txt', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(train_x.columns)

#%% Run XGBoost and output the feature-importance ranking
def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values,
                                                          test_size=0.25, random_state=random_state_num)
    print('Training started')
    start = time.time()
    # Convert to XGBoost's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Parameter settings (before tuning)
    params = {
        'eta': 0.2,                      # shrinkage / learning rate, usually 0.01-0.2
        'max_depth': 3,                  # tree depth, usually 3-10; too deep overfits, too shallow underfits
        'min_child_weight': 1,           # minimum child weight; larger values help prevent overfitting
        'gamma': 0.4,                    # post-pruning control; larger is more conservative, ~0.1-0.2 typical
        'subsample': 0.8,                # row sampling ratio
        'colsample_bytree': 0.8,         # column sampling ratio per tree, default 1
        'reg_lambda': 0.8,               # L2 regularization, smooths against overfitting
        'reg_alpha': 0.6,                # L1 regularization
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic objective, outputs probabilities
        'nthread': 6,                    # max threads; uses all cores if unset
        'scale_pos_weight': 1,           # default 1; raise it (roughly neg/pos) to re-weight an imbalanced positive class
        'seed': 1234,                    # random seed
        'silent': 1,                     # 0 prints training output
        'eval_metric': 'auc'             # evaluation metric
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)  # at most 1000 rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('Best number of trees: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)  # retrain with the optimal round count
    # Feature-importance ranking: each feature's split count / total split count
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # Highest scores first; show the top 40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%% ROC curve
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# K-S curve: sort predictions descending and split into 10 groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('decile group')

# One figure with ROC/AUC and K-S for both the training and the test set
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%% Save the model, its evaluation plots and the selected variables
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)  # 1234 to match the random_state used while tuning
    joblib.dump(bst, file_xgboost_model)  # persist the model; see https://www.cnblogs.com/wzdLY/p/9630671.html for joblib usage
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Evaluation plots saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total runtime: %s' % (time.time() - start))
    resu = bst.predict(xgb.DMatrix(test_x))

if __name__ == '__main__':
    run_main(train_x, train_y)

#%% Run this block on its own to get the confusion matrix
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
bst = run_xgboost(train_x, train_y, random_state_num=1234)
train_x, test_x, train_y, test_y = train_test_split(train_x.values, train_y.values, test_size=0.25, random_state=1234)
resu = bst.predict(xgb.DMatrix(test_x))
resu = pd.DataFrame(resu)
resu.columns = ['y']
resu = resu['y'].apply(lambda x: 1 if x > 0.5 else 0)
resu = resu.values
matrix = confusion_matrix(test_y, resu)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(test_y, resu))
print("Recall:", recall_score(test_y, resu))
print("F1 score:", f1_score(test_y, resu))
Confusion matrix:
[[71078     6]
 [   32    86]]
Precision: 0.9347826086956522
Recall: 0.7288135593220338
F1 score: 0.819047619047619
As we can see, the XGBoost model already beats the logistic regression model, and I have not tuned it yet.
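Before full tuning, one cheap lever worth noting for data this skewed is re-weighting the positive class, which XGBoost exposes as scale_pos_weight (left at 1 in the listing above). A common heuristic, sketched here and not tuned or run, is the negative/positive ratio:

# Sketch only: weight positives by the negative/positive ratio.
# creditcard follows the listing above (the raw DataFrame with a Class column).
ratio = float((creditcard['Class'] == 0).sum()) / (creditcard['Class'] == 1).sum()
print(ratio)  # ~578 for this dataset; this value would replace 'scale_pos_weight': 1 in run_xgboost's params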
2021.03.12 update: adding LightGBM
The code:

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 12 14:43:16 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # so CJK characters render in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')
creditcard.info()  # 284807 rows
creditcard.isnull().sum()
creditcard.head(3)
creditcard.rename(columns={'Class': 'y'}, inplace=True)

from sklearn.model_selection import KFold
# Separate features and target for cross-validation
X_train = creditcard.iloc[:, 0:-1]
y_train = creditcard.y
# 5-fold cross-validation
folds = 5
seed = 2021
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

#%% Split the training data into a training part and a validation part
from sklearn.model_selection import train_test_split
import lightgbm as lgb

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1,
    'num_leaves': 10,
    'max_depth': 7,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'silent': True,
    'verbose': -1,
}

"""Train on the training part"""
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix,
                  num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
#[847] valid_0's auc: 0.94372

from sklearn import metrics
from sklearn.metrics import roc_auc_score

"""Predict and compute the ROC metrics"""
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('AUC of the untuned LightGBM model on the validation set: {}'.format(roc_auc))

"""Plot the ROC curve"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.show()

"""5-fold cross-validated LightGBM"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.1,
        'metric': 'auc',
        'min_child_weight': 1e-3,
        'num_leaves': 10,
        'max_depth': -1,
        'reg_lambda': 0,
        'reg_alpha': 0,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'bagging_freq': 0,
        'seed': 2021,
        'nthread': 8,
        'silent': True,
        'verbose': -1,
    }

    model = lgb.train(params, train_set=train_matrix, num_boost_round=20000,
                      valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

#%% Bayesian tuning
from sklearn.model_selection import cross_val_score

"""Objective function for the optimizer"""
def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq,
              min_data_in_leaf, min_child_weight, min_split_gain, reg_lambda, reg_alpha):
    # Build the model
    model_lgb = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='auc',
                                   learning_rate=0.1, n_estimators=5000,
                                   num_leaves=int(num_leaves), max_depth=int(max_depth),
                                   bagging_fraction=round(bagging_fraction, 2),
                                   feature_fraction=round(feature_fraction, 2),
                                   bagging_freq=int(bagging_freq),
                                   min_data_in_leaf=int(min_data_in_leaf),
                                   min_child_weight=min_child_weight,
                                   min_split_gain=min_split_gain,
                                   reg_lambda=reg_lambda, reg_alpha=reg_alpha,
                                   n_jobs=8)
    val = cross_val_score(model_lgb, X_train_split, y_train_split, cv=5, scoring='roc_auc').mean()
    return val

from bayes_opt import BayesianOptimization

"""Search space"""
bayes_lgb = BayesianOptimization(
    rf_cv_lgb,
    {
        'num_leaves': (10, 200),
        'max_depth': (3, 20),
        'bagging_fraction': (0.5, 1.0),
        'feature_fraction': (0.5, 1.0),
        'bagging_freq': (0, 100),
        'min_data_in_leaf': (10, 100),
        'min_child_weight': (0, 10),
        'min_split_gain': (0.0, 1.0),
        'reg_alpha': (0.0, 10),
        'reg_lambda': (0.0, 10),
    }
)

"""Run the optimization"""
bayes_lgb.maximize(n_iter=10)
bayes_lgb.max
'''
{'target': 0.978984093218777,
 'params': {'bagging_fraction': 0.7852426281123215,
  'bagging_freq': 42.927767267031435,
  'feature_fraction': 0.8729234124911952,
  'max_depth': 18.80072510809031,
  'min_child_weight': 8.29481722055312,
  'min_data_in_leaf': 13.261838180182071,
  'min_split_gain': 0.45972976507462127,
  'num_leaves': 154.4793280962274,
  'reg_alpha': 7.018060276190158,
  'reg_lambda': 2.1475557765094413}}
'''

#%% Lower the learning rate and let lgb.cv determine the optimal number of rounds
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 154,
    'max_depth': 18,
    'min_data_in_leaf': 21,
    'min_child_weight': 8.3,
    'bagging_fraction': 0.78,
    'feature_fraction': 0.87,
    'bagging_freq': 43,
    'reg_lambda': 2,
    'reg_alpha': 7,
    'min_split_gain': 0.5,
    'nthread': 8,
    'seed': 2021,
    'silent': True,
    'verbose': -1
}

cv_result_lgb = lgb.cv(
    train_set=train_matrix,
    early_stopping_rounds=1000,
    num_boost_round=20000,
    nfold=5,
    stratified=True,
    shuffle=True,
    params=base_params_lgb,
    metrics='auc',
    seed=0
)
print('Number of iterations: {}'.format(len(cv_result_lgb['auc-mean'])))
print('Final model AUC: {}'.format(max(cv_result_lgb['auc-mean'])))
'''
Number of iterations: 855
Final model AUC: 0.9821581751610478
'''

#%% Parameters fixed: build the final model and validate it with 5-fold CV
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'num_leaves': 154,
        'max_depth': 18,
        'min_data_in_leaf': 20,
        'min_child_weight': 8.3,
        'bagging_fraction': 0.78,
        'feature_fraction': 0.87,
        'bagging_freq': 43,
        'reg_lambda': 2,
        'reg_alpha': 7,
        'min_split_gain': 0.5,
        'nthread': 8,
        'seed': 2021,
        'silent': True,
        'verbose': -1
    }

    model = lgb.train(params, train_set=train_matrix, num_boost_round=855,
                      valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

#%% The 5-fold CV shows training stops around 750 rounds, so set the maximum round count directly and score the validation set
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 154,
    'max_depth': 18,
    'min_data_in_leaf': 20,
    'min_child_weight': 8.3,
    'bagging_fraction': 0.78,
    'feature_fraction': 0.87,
    'bagging_freq': 43,
    'reg_lambda': 2,
    'reg_alpha': 7,
    'min_split_gain': 0.5,
    'nthread': 8,
    'seed': 2021,
    'silent': True
}

"""Train the final model on the training part"""
final_model_lgb = lgb.train(base_params_lgb, train_set=train_matrix, valid_sets=valid_matrix,
                            num_boost_round=855, verbose_eval=1000, early_stopping_rounds=200)

"""Predict and compute the ROC metrics"""
val_pre_lgb = final_model_lgb.predict(X_val)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('AUC of the tuned LightGBM model on the validation set: {}'.format(roc_auc))  # 0.9765762181212846

"""Plot the ROC curve"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.show()
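One small trap with bayes_opt: it searches over floats only, which is why the integer parameters reported in bayes_lgb.max above (num_leaves 154.47..., max_depth 18.8...) had to be rounded by hand before refitting. A helper sketch for that conversion; int_keys is my own naming, not part of the listing:

best = bayes_lgb.max['params']
int_keys = {'num_leaves', 'max_depth', 'bagging_freq', 'min_data_in_leaf'}
tuned = {k: int(round(v)) if k in int_keys else round(v, 2)
         for k, v in best.items()}
print(tuned)  # these values feed base_params_lgb above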
I used Bayesian tuning here; although the final result is not as good as XGBoost, it still beats logistic regression, and it needed no variable preprocessing at all: the raw features were fed straight to the algorithm.
2021.03.15 update: another way to tune XGBoost

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 9 16:16:56 2021
@author: Administrator
"""
#%% Imports
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import auc, roc_curve, confusion_matrix
import joblib  # sklearn.externals.joblib has moved; import joblib directly
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # so CJK characters render in plots

#%% Load data
creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')
creditcard.info()
train_y = creditcard[['Class']]
train_y.columns = ['y']
train_x = creditcard.drop(['Class', 'Time'], axis=1)

def create_feature_map(features):
    """Write an XGBoost feature map: one 'index<TAB>name<TAB>type' line per
    feature, where type 'q' marks the feature as quantitative."""
    outfile = open('xgb.txt', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(train_x.columns)

file_xgboost_model = './xgboost_model'                     # model file
file_xgboost_columns = './columns.csv'                     # final features used
file_xgboost_model_auc_ks = './xgboost_model_auc_ks.png'   # model AUC and KS plots
file_xgboost_model_score = './xgboost_model_score.png'     # score distribution of predictions
file_xgboost_model_prob = './xgboost_model_prob.png'       # probability distribution of predictions

X = creditcard.iloc[:, 0:-1]
y = creditcard.Class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#%% Step 1: fix the number of trees
def tun_parameters(train_x, train_y):
    xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1,
                         gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic', scale_pos_weight=1, seed=27)
    modelfit(xgb1, train_x, train_y)

def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, label=y)
        # Note: these callbacks follow the pre-1.6 xgboost API
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds,
                          callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                                     xgb.callback.early_stop(early_stopping_rounds)])
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    # Predict the training set
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]
    # Print the model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))
    print('n_estimators=', cvresult.shape[0])

tun_parameters(X_train, y_train)
'''
Accuracy : 0.9998
AUC Score (Train): 0.999886
n_estimators= 100
'''

#%% Step 2: tune max_depth and min_child_weight
param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(2, 9, 1)
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=1,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)  # iid was removed in newer scikit-learn; drop it there
gsearch1.fit(X_train, y_train)
gsearch1.best_params_, gsearch1.best_score_
#({'max_depth': 3, 'min_child_weight': 5}, 0.9851612149724902)
#({'max_depth': 5, 'min_child_weight': 8}, 0.9860796809303931)

#%% Step 3: tune gamma
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=8,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test3, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.best_params_, gsearch3.best_score_
#({'gamma': 0.0}, 0.9860796809303931)

#%% Step 4: tune subsample and colsample_bytree
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, min_child_weight=8,
                            gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

#%% Step 5: tune the regularization parameters reg_alpha and reg_lambda (only reg_alpha here), then re-fix the tree count
def tun_parameters2(train_x, train_y):
    xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=5000, max_depth=5, min_child_weight=8,
                         gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic', nthread=8, booster='gbtree',
                         reg_alpha=0.6, reg_lambda=0.8, scale_pos_weight=1, seed=2021)
    modelfit(xgb1, train_x, train_y)

tun_parameters2(X_train, y_train)
'''
Model Report
Accuracy : 0.9997
AUC Score (Train): 0.998747
n_estimators= 134
'''

model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                      colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
                      importance_type='gain', interaction_constraints='',
                      learning_rate=0.1, max_delta_step=0, max_depth=5,
                      min_child_weight=8, missing=np.nan, monotone_constraints='()',
                      n_estimators=134, n_jobs=8, nthread=8, num_parallel_tree=1,
                      random_state=27, reg_alpha=0.6, reg_lambda=0.8,
                      scale_pos_weight=1, seed=27, subsample=0.8,
                      tree_method='exact', validate_parameters=1, verbosity=None)
model.fit(X_train, y_train)

#%% Test set
def plot_roc(test_x, test_y):
    # Use predicted probabilities for the ROC curve; feeding hard 0/1 labels
    # from model.predict into roc_curve degenerates the curve
    predictions = model.predict_proba(test_x)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(test_y, predictions)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

plot_roc(X_test, y_test)

def run_xgboost(data_x, data_y, random_state_num):
    train_x, valid_x, train_y, valid_y = train_test_split(data_x.values, data_y.values,
                                                          test_size=0.25, random_state=random_state_num)
    print('Training started')
    start = time.time()
    # Convert to XGBoost's DMatrix format
    d_train = xgb.DMatrix(train_x, train_y)
    d_valid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Tuned parameters
    params = {
        'eta': 0.1,                      # shrinkage / learning rate, usually 0.01-0.2
        'max_depth': 5,                  # tree depth, usually 3-10
        'min_child_weight': 8,           # minimum child weight; larger values help prevent overfitting
        'gamma': 0.0,                    # post-pruning control; larger is more conservative
        'subsample': 0.8,                # row sampling ratio
        'colsample_bytree': 0.8,         # column sampling ratio per tree, default 1
        'lambda': 0.8,                   # L2 regularization, smooths against overfitting
        'alpha': 0.6,                    # L1 regularization
        'booster': 'gbtree',             # tree booster
        'objective': 'binary:logistic',  # logistic objective, outputs probabilities
        'nthread': 6,                    # max threads; uses all cores if unset
        'scale_pos_weight': 10,          # up-weights the positive class to counter the imbalance
        'seed': 1234,                    # random seed
        'silent': 1,                     # 0 prints training output
        'eval_metric': 'auc'             # evaluation metric
    }
    bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=5)  # at most 1000 rounds
    print(time.time() - start)
    tree_nums = bst.best_ntree_limit
    print('Best number of trees: %s, best iteration: %s, auc: %s' % (bst.best_ntree_limit, bst.best_iteration, bst.best_score))
    bst = xgb.train(params, d_train, tree_nums, watchlist, early_stopping_rounds=100, verbose_eval=10)  # retrain with the optimal round count
    # Feature-importance ranking: each feature's split count / total split count
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    # Highest scores first; show the top 40 features
    df = df.sort_values(by='fscore', ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar', x='feature', y='fscore', legend=True, figsize=(32, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%% ROC curve (booster version; redefines the classifier version above)
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# K-S curve: sort predictions descending and split into 10 groups
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  # descending order, as the K-S plot requires
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S curve')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('cumulative proportion')
    plt.xlabel('decile group')

# One figure with ROC/AUC and K-S for both the training and the test set
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%% Save the model, its evaluation plots and the selected variables
def run_main(data_x, data_y):
    global bst
    start = time.time()
    bst = run_xgboost(data_x, data_y, random_state_num=1234)  # 1234 to match the random_state used while tuning
    joblib.dump(bst, file_xgboost_model)  # persist the model; see https://www.cnblogs.com/wzdLY/p/9630671.html for joblib usage
    print('Model saved to %s' % (file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('Evaluation plots saved to: %s' % (file_xgboost_model_auc_ks))
    print('Total runtime: %s' % (time.time() - start))
    resu = bst.predict(xgb.DMatrix(test_x))

if __name__ == '__main__':
    run_main(train_x, train_y)
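Finally, to score the tuned booster under the AUPRC metric recommended at the top of the post, the run_main split can be rebuilt outside the function. A sketch, assuming bst, train_x and train_y are still in scope from the listing above:

from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Rebuild the same 75/25 split run_main used (random_state=1234) and score it.
tr_x, te_x, tr_y, te_y = train_test_split(train_x.values, train_y.values,
                                          test_size=0.25, random_state=1234)
print('AUPRC:', average_precision_score(te_y, bst.predict(xgb.DMatrix(te_x))))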
Category: kaggle