DF平台:消费金融场景下的用户购买预测
赛题链接:https://www.datafountain.cn/competitions/287/details
赛题任务:利用招商银行客户的个人属性、信用卡消费数据,以及部分客户在掌上生活APP上的一个月的操作行为日志,设计合理的特征工程与模型算法方案,预测客户在未来一周内(4月1日-7日),是否会购买掌上生活APP上的优惠券(包括饭票、影票等)。考虑到客户隐私,客户的个人属性数据与信用卡消费数据,采用脱敏并标准化处理为V1,V2,…,V30数值型属性。客户在APP上的行为日志,一些字段也进行了相应加密。
由于比赛后期时间紧张,排名一直往下掉,最终B榜排名75名,本博客记录在此比赛中的一些过程和心得。
赛题数据介绍
比赛提供训练集和测试集两部分数据,其中训练集包含以下几部分:
(1)个人属性与信用卡消费数据:包含80000名信用卡客户的个人属性与信用卡消费数据,其中包含枚举型特征和数值型特征,均已转为数值并进行了脱敏和标准化处理。
(2)APP操作行为日志:上述信用卡客户中,部分已绑定掌上生活APP的客户,在近一个月时间窗口内的所有点击行为日志。
(3)标注数据:包括客户号及标签。其中,标签数据为用户是否会在未来一周,购买掌上生活APP上的优惠券。
评分方式:AUC
(一)数据EDA分析
在这部分我并没有过多的花心思,主要是先load一下数据,看下缺失值,看下数据类型,老套路,用pandas_profiling.ProfileReport看一下每个表的各种信息,十分方便。然后就是各个表的merge,由于用的Jupyter Notebook,这部分代码比较散,整个代码可以参考github。
(二)特征工程
这部分包括各个表的处理,尤其是log表的一些操作行为日志的挖掘,包括一些滑动时间窗口操作
import time
from datetime import datetime
try:
    from sklearn.externals import joblib  # location in older scikit-learn releases
except ImportError:
    import joblib  # standalone package used by newer scikit-learn releases
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import pickle  # serialization
import os
import missingno as msno
import pandas_profiling
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import math

# Every intermediate table is cached as a pickle under ./tmp.
os.makedirs('tmp', exist_ok=True)

# log table (APP click-behaviour log)
train_log_path = '../data/train/train_log.csv'
test_log_path = '../data/test/test_log.csv'


def _load_pickle(path):
    """Load one pickled object from *path*, closing the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def _read_full_log():
    """Return the train and test log CSVs (tab separated) stacked into one frame."""
    train_log = pd.read_csv(train_log_path, sep='\t')
    test_log = pd.read_csv(test_log_path, sep='\t')
    return pd.concat([train_log, test_log], copy=False)


def gen_log_feat():
    """Build one row of log-derived features per USRID.

    The result combines
      * the per-user mean of the numeric log columns (OCC_TIM in seconds and
        the gap to the user's next event),
      * min/mean/max/std/median of the gap between consecutive events,
      * per-user counts of every one-hot encoded EVT_LBL value.

    The frame is cached in ./tmp/log_feat.pkl and reloaded when present.
    """
    dump_path = './tmp/log_feat.pkl'
    if os.path.exists(dump_path):
        return _load_pickle(dump_path)

    log = _read_full_log()

    # One-hot encode EVT_LBL and sum per user -> number of events of each label.
    evt_dummies = pd.get_dummies(log[['USRID', 'EVT_LBL']])
    evt_counts = evt_dummies.groupby('USRID', as_index=False).sum()

    # Convert timestamps to seconds, then measure the gap to the user's next event.
    log['OCC_TIM'] = log['OCC_TIM'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    log = log.sort_values(['USRID', 'OCC_TIM'])
    log['next_time'] = log.groupby(['USRID'])['OCC_TIM'].diff(-1).abs()

    stat_feat = ['min', 'mean', 'max', 'std', 'median']
    # Group with USRID as the index and reset it afterwards, so the frame has
    # exactly 1 + len(stat_feat) columns before they are renamed.
    gap_stats = log.groupby('USRID')['next_time'].agg(stat_feat).reset_index()
    gap_stats.columns = ['USRID'] + ['next_time_' + col for col in stat_feat]

    log_copy = pd.merge(log, gap_stats, how='outer', on='USRID')
    # EVT_LBL is still a string column here; restrict the mean to numeric columns.
    log_copy = log_copy.groupby(['USRID'], as_index=False).mean(numeric_only=True)
    log_copy = pd.merge(log_copy, evt_counts, how='left', on='USRID')

    _dump_pickle(log_copy, dump_path)
    return log_copy


log_copy = gen_log_feat()

# agg table (personal attributes and credit-card consumption data)
train_agg_path = '../data/train/train_agg.csv'
test_agg_path = '../data/test/test_agg.csv'


def get_stat_feat(df, values, action, days1, days2):
    """Aggregate column *values* per USRID over the window days1 < day <= days2.

    Args:
        df: frame with 'USRID', 'day' and the *values* column; USRID is not
            unique before aggregation.
        values: name of the column to aggregate.
        action: tag embedded in the generated column names.
        days1: exclusive lower bound of the day window.
        days2: inclusive upper bound of the day window.

    Returns:
        A frame with 'USRID' plus one '<values>_<action>_<days2>_<stat>'
        column per statistic, e.g. 'V28_agg_7_min'.
    """
    in_window = df[(df['day'] > days1) & (df['day'] <= days2)]
    stat_feat = ['min', 'mean', 'max', 'median', 'count', 'sum', 'std', 'var']
    stats = in_window.groupby('USRID')[values].agg(stat_feat).reset_index()
    stats.columns = ['USRID'] + ['%s_%s_%s_' % (values, action, days2) + col
                                 for col in stat_feat]
    return stats


def _windowed_agg_stats(agg, log_days, column, day_list):
    """Return per-USRID window statistics of agg[*column*] over the log days.

    Each log row is joined with the user's *column* value, then get_stat_feat
    is applied to every consecutive pair of bounds in *day_list*. Users with
    no log rows in a window get 0 for that window's statistics.
    """
    per_event = pd.merge(log_days, agg[['USRID', column]],
                         on=['USRID'], how='left', copy=False)
    feats = pd.DataFrame({'USRID': per_event['USRID'].unique()})
    for days1, days2 in zip(day_list[:-1], day_list[1:]):
        window = get_stat_feat(per_event, column, 'agg', days1, days2)
        feats = pd.merge(feats, window, how='left', on='USRID')
    return feats.fillna(0.)


def gen_filter_agg_feat():
    """Return the agg table extended with windowed statistics of V28, V25, V20.

    Windows are the day-of-month ranges (0,3], (3,7], (7,14], (14,21],
    (21,28], (28,31] taken from the log's OCC_TIM. Missing values are filled
    with 0. The frame is cached in ./tmp/filter_agg_feat.pkl.
    """
    dump_path = './tmp/filter_agg_feat.pkl'
    if os.path.exists(dump_path):
        return _load_pickle(dump_path)

    train_agg = pd.read_csv(train_agg_path, sep='\t')
    test_agg = pd.read_csv(test_agg_path, sep='\t')
    agg = pd.concat([train_agg, test_agg], copy=False)

    log = _read_full_log()
    # Day of month parsed from the 'YYYY-MM-DD HH:MM:SS' timestamp string.
    log['day'] = log['OCC_TIM'].map(lambda x: int(x.split('-')[2].split(' ')[0]))
    log_days = log[['USRID', 'day']]

    day_list = [0, 3, 7, 14, 21, 28, 31]
    filter_agg = agg
    # V19 and V18 were also tried by the author and left disabled.
    for column in ('V28', 'V25', 'V20'):
        column_stats = _windowed_agg_stats(agg, log_days, column, day_list)
        filter_agg = pd.merge(filter_agg, column_stats, how='left', on='USRID')

    # fillna returns a new frame; the result has to be assigned to take effect.
    filter_agg = filter_agg.fillna(0.)

    _dump_pickle(filter_agg, dump_path)
    return filter_agg


# V20, V25, V28
filter_agg = gen_filter_agg_feat()
print("sucessful!!!")

# flg table (labels)
train_flg_path = '../data/train/train_flg.csv'
test_flg_path = '../data/submit_sample.csv'


def gen_flg():
    """Return train labels stacked with the test users, whose FLAG is set to -1.

    The test users come from the submission sample; its RST column is dropped.
    The frame is cached in ./tmp/flg.pkl.
    """
    dump_path = './tmp/flg.pkl'
    if os.path.exists(dump_path):
        return _load_pickle(dump_path)

    train_flg = pd.read_csv(train_flg_path, sep='\t')
    test_flg = pd.read_csv(test_flg_path, sep='\t')
    test_flg['FLAG'] = -1  # sentinel marking rows that belong to the test set
    del test_flg['RST']
    flg = pd.concat([train_flg, test_flg], copy=False)

    _dump_pickle(flg, dump_path)
    return flg


# merge all tables
def make_data():
    """Left-join agg features, labels and log features on USRID.

    The merged frame is cached in ./tmp/data.pkl.
    """
    dump_path = './tmp/data.pkl'
    if os.path.exists(dump_path):
        return _load_pickle(dump_path)

    log_copy = gen_log_feat()
    filter_agg = gen_filter_agg_feat()
    flg = gen_flg()
    data = pd.merge(filter_agg, flg, how='left', on='USRID')
    data = pd.merge(data, log_copy, how='left', on='USRID')

    _dump_pickle(data, dump_path)
    return data


data = make_data()
(三)XGBoost模型搭建
import pickle  # serialization
import os
import missingno as msno
import pandas_profiling
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import lightgbm as lgb
import time
import math
from sklearn.model_selection import KFold
import numpy as np  # np is used below but was not imported in this cell

# Load the merged feature table written by the feature-engineering step.
data_path = './tmp/data.pkl'
with open(data_path, 'rb') as fh:
    data = pickle.load(fh)
data = data.fillna(0.)

# Rows with FLAG == -1 are the test users; everything else is labelled.
train = data[data['FLAG'] != -1].copy()
test = data[data['FLAG'] == -1]
y = train.pop('FLAG')
col = train.columns
X = train[col].values
test = test.drop(['FLAG'], axis=1)

folds = KFold(n_splits=6, shuffle=True, random_state=546799)
oof_preds = np.zeros(train.shape[0])  # out-of-fold predictions on train
sub_preds = np.zeros(test.shape[0])   # fold-averaged predictions on test
print("oof_preds.shape:", oof_preds.shape)
print("sub_preds.shape:", sub_preds.shape)

ignore_features = ['USRID']
features = [f for f in train.columns if f not in ignore_features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("trn_idx:", trn_idx)
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    clf = XGBClassifier(
        objective='binary:logistic',  # was misspelled `object`
        booster="gbtree",
        eval_metric='auc',
        gamma=0,                  # minimum loss reduction for a split; larger is more conservative
        max_depth=6,
        subsample=0.8,            # row subsampling per tree
        colsample_bytree=0.632,   # column subsampling per tree
        colsample_bylevel=0.8,    # was misspelled `colsample_bylecel`
        min_child_weight=19,      # minimum hessian sum per leaf; smaller overfits more easily
        reg_alpha=0,              # wrapper name for the L1 term (was passed as `alpha`)
        scale_pos_weight=1,
        seed=4396,
        # The original call also passed eta=0.025 and nrounds=8000.
        # `eta` is an alias of `learning_rate`, so the two values conflicted,
        # and `nrounds` is not a parameter of the scikit-learn wrapper.
        # NOTE(review): learning_rate=0.1 / n_estimators=1000 were kept as the
        # intended pair - confirm against the submitted run.
        n_estimators=1000,
        learning_rate=0.1,
    )
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            verbose=10, early_stopping_rounds=30)

    oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
    sub_preds += clf.predict_proba(test[features])[:, 1] / folds.n_splits
    print('Fold %2d AUC: %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y

full_auc = roc_auc_score(y, oof_preds)
print('Full AUC score %.6f' % full_auc)

test['RST'] = sub_preds
time_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
# The file name carries the date and the decimals of the out-of-fold AUC.
os.makedirs('../submit', exist_ok=True)
test[['USRID', 'RST']].to_csv(
    '../submit/%s_%s.csv' % (str(time_date), str(full_auc).split('.')[1]),
    index=False, sep='\t')
(四)网格搜索,模型调参
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
import gc
import matplotlib as mpl
from pandas.core.frame import DataFrame
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)
from missingno import missingno
import missingno as msno
import pandas_profiling
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import cross_validation, metrics
from sklearn.model_selection import GridSearchCV
import pickle
import time
import os
import math
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import pickle

# Load the merged feature table; rows with FLAG == -1 are the test users.
data_path = 'G:/xjl_docunment/DataFountain/code/tmp/data.pkl'
with open(data_path, 'rb') as fh:
    data = pickle.load(fh)
train = data[data['FLAG'] != -1]
test = data[data['FLAG'] == -1]
col = train.columns
X = train[col].values
test = test.drop(['FLAG'], axis=1)
print('Sucessful')


def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50, target='FLAG'):
    """Fit *alg* on *dtrain*, print a training report and plot feature scores.

    Args:
        alg: an XGBClassifier (scikit-learn wrapper).
        dtrain: frame holding the predictor columns and the label column.
        predictors: list of feature column names.
        useTrainCV: when true, run xgb.cv with early stopping first and set
            n_estimators to the number of rounds it kept.
        cv_folds: number of folds for xgb.cv.
        early_stopping_rounds: patience passed to xgb.cv.
        target: name of the label column. It used to be read partly from a
            module-level global and partly hard-coded as 'FLAG'.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgbtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgbtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          show_stdv=False)
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report (both figures are measured on the training data)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    print('sucessful')

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')


from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

target = 'FLAG'
IDcol = 'USRID'
# Choose all predictors except target & ID columns
predictors = [x for x in train.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, predictors)
print('Sucessful')
1)max_depth 和 min_child_weight 参数调优:先对这两个参数调优,因为它们对最终的结果有很大的影响。先大范围地粗调参数,然后在小范围内微调。
# Coarse grid over max_depth and min_child_weight, 1000 trees.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 20, 3),
)
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=8,
    iid=False,
    cv=5,
)
gsearch1.fit(train[predictors], train['FLAG'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Same grid, but with 140 trees; this overwrites gsearch1.
param_test11 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 20, 3),
)
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test11,
    scoring='roc_auc',
    n_jobs=8,
    iid=False,
    cv=5,
)
gsearch1.fit(train[predictors], train['FLAG'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Narrower grid over the same two parameters, colsample_bytree=0.632.
params_test2 = dict(
    max_depth=[3, 5, 6],
    min_child_weight=[1, 5, 6],
)
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.632,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=params_test2,
    scoring='roc_auc',
    iid=False,
    cv=5,
)
gsearch2.fit(train[predictors], train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

# min_child_weight alone at max_depth=3, then refit the winner via modelfit.
param_test2b = dict(
    min_child_weight=[1, 5, 8, 10, 12, 13, 19, 21],
)
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.632,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2b,
    scoring='roc_auc',
    n_jobs=16,
    iid=False,
    cv=5,
)
gsearch2b.fit(train[predictors], train[target])
modelfit(gsearch2b.best_estimator_, train, predictors)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_
2)gamma参数调优:在已经调整好其他参数的基础上我们可以进行gamma参数调优,gamma参数取值的范围可以很大
# Grid over gamma in {0.0, 0.1, 0.2, 0.3, 0.4}; the other parameters are fixed.
param_test3 = dict(
    gamma=[i/10.0 for i in range(0, 5)],
)
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=19,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    iid=False,
    cv=5,
)
gsearch3.fit(train[predictors], train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
3)调整 subsample 和 colsample_bytree 参数:分两个阶段来进行这个步骤,第一阶段两个参数都取0.6,0.7,0.8,0.9作为起始值,第二阶段在最优值附近以0.05为步长微调
# Stage 1: subsample and colsample_bytree on a 0.1 step grid (0.6 .. 0.9).
param_test4 = dict(
    subsample=[i/10.0 for i in range(6, 10)],
    colsample_bytree=[i/10.0 for i in range(6, 10)],
)
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=19,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=16,
    iid=False,
    cv=5,
)
gsearch4.fit(train[predictors], train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

# Stage 2: the same two parameters on a 0.05 step grid.
param_test5 = dict(
    subsample=[i/100.0 for i in range(75, 90, 5)],
    colsample_bytree=[i/100.0 for i in range(65, 90, 5)],
)
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=19,
        gamma=0,
        subsample=0.9,
        colsample_bytree=0.7,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=16,
    iid=False,
    cv=5,
)
gsearch5.fit(train[predictors], train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
4)正则化参数调优:应用正则化参数来降低过拟合。由于gamma参数已经提供了一种更加有效的降低过拟合的方法,正则化参数通常用得较少,但仍然值得尝试
# Coarse grid over reg_alpha, spanning several orders of magnitude.
param_test6 = dict(
    reg_alpha=[1e-5, 1e-2, 0.1, 1, 100],
)
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=19,
        gamma=0.2,
        subsample=0.85,
        colsample_bytree=0.65,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=16,
    iid=False,
    cv=5,
)
gsearch6.fit(train[predictors], train[target])
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

# Second grid over reg_alpha, from 100 upwards.
param_test7 = dict(
    reg_alpha=[100, 150, 200, 500],
)
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=19,
        gamma=0.2,
        subsample=0.85,
        colsample_bytree=0.65,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=16,
    iid=False,
    cv=5,
)
gsearch7.fit(train[predictors], train[target])
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_
5)降低学习速率:最后,我们使用较低的学习速率以及使用更多的决策树,我们可以使用XGBoost中的CV函数来进行这一步的工作
# Final model: learning rate lowered to 0.01 with reg_alpha=100.
# modelfit picks the number of trees through xgb.cv (its useTrainCV default).
xgb4 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=19,
    gamma=0.2,
    subsample=0.85,
    colsample_bytree=0.65,
    objective='binary:logistic',
    reg_alpha=100,
    nthread=16,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb4, train, predictors)
其实,感觉网格搜索调参,在这里提升的效果并没有特别明显,不如特征工程提升明显,但还是加上了
(五)模型融合
这里采用stacking的形式,融合RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier(代码中还创建了SVC,但并未参与最终的融合)
import os
import time

import numpy as np
import xgboost as xgb
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.model_selection import KFold
from sklearn.svm import SVC

# This section relies on `train`, `test` and `y` prepared by the XGBoost
# section (labelled rows, test rows, and the FLAG labels of `train`).

# NOTE(review): SEED, NFOLDS and kf were never defined in the original code.
# 5 folds follows the original "5-fold" comment and 0 is SklearnHelper's
# default seed - confirm these match the submitted run.
SEED = 0
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)


# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Thin wrapper giving every scikit-learn classifier the same interface."""

    def __init__(self, clf, seed=0, params=None):
        """Instantiate *clf* with *params* plus random_state=*seed*.

        *params* is copied, so the caller's dict is not modified, and may be
        None to use the estimator's defaults.
        """
        params = dict(params or {})
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return hard class predictions."""
        return self.clf.predict(x)

    def predict_score(self, x):
        """Return the positive-class probability when the estimator offers it.

        Falls back to hard class predictions for estimators without
        predict_proba.
        """
        if hasattr(self.clf, 'predict_proba'):
            return self.clf.predict_proba(x)[:, 1]
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return it."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on (x, y) and print the estimator's feature importances."""
        print(self.clf.fit(x, y).feature_importances_)


def get_oof(clf, x_train, y_train, x_test):
    """Produce out-of-fold predictions of *clf* for stacking.

    For each fold of the module-level `kf`, the model is trained on the
    training part, predicts the held-out part of x_train, and predicts all of
    x_test. The test predictions are averaged over the folds.

    Args:
        clf: a SklearnHelper.
        x_train: 2-D array of training features.
        y_train: 1-D array of training labels.
        x_test: 2-D array of test features with the same columns as x_train.

    Returns:
        (oof_train, oof_test) as column vectors of shape (n_train, 1) and
        (n_test, 1).
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((NFOLDS, n_test))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_index], y_train[train_index])
        # Scores rather than hard labels: the meta-model is evaluated by AUC,
        # and 0/1 votes discard the ranking information it needs.
        oof_train[test_index] = clf.predict_score(x_train[test_index])
        oof_test_skf[i, :] = clf.predict_score(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start was True in the original. With an unchanged n_estimators a
    # warm-started forest grows no new trees on refit, so every fold after
    # the first would reuse the first fold's model.
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0,
}

# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0,
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75,
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025,
}

# Create the five model wrappers; svc is built but not used in the stack below.
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# Build numpy arrays of train, test and target to feed into the models.
# The id, the label and any earlier prediction column are excluded so that
# train and test expose exactly the same feature columns.
stack_features = [c for c in train.columns if c not in ('USRID', 'FLAG', 'RST')]
y_train = np.asarray(y).ravel()
# Missing values are filled with 0 before fitting, as in the XGBoost section.
x_train = train[stack_features].fillna(0.).values
x_test = test[stack_features].fillna(0.).values

# Create the OOF train and test predictions; these become the new features.
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)     # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test)     # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)  # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test)     # Gradient Boost

x_train = np.concatenate((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

# Second-level model trained on the four base-model columns.
gbm = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict_proba(x_test)[:, 1]
print("sucessful")

USRID = test['USRID']
test['RST'] = predictions
time_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
os.makedirs('../submit', exist_ok=True)
test[['USRID', 'RST']].to_csv('../submit/StackingSubmission.csv', index=False, sep='\t')