【集成学习】LightGBM 使用案例

GitHub 地址

  1 #!/usr/bin/env python2
  2 # -*- coding: utf-8 -*-
  3 """
  4 Created on Sat Mar 31 21:19:09 2018
  5 
  6 @author: hello4720
  7 """
  8 import numpy as np
  9 import pandas as pd
 10 import lightgbm as lgb
 11 from sklearn import metrics
 12 from sklearn.model_selection import train_test_split
 13 
 14 ### 读取数据
 15 print("载入数据")
 16 dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
 17 dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
 18 dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
 19 dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
 20 dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')
 21 
 22 dataset1.drop_duplicates(inplace=True)
 23 dataset2.drop_duplicates(inplace=True)
 24 dataset3.drop_duplicates(inplace=True)
 25 dataset4.drop_duplicates(inplace=True)
 26 dataset5.drop_duplicates(inplace=True)
 27 
 28 ### 数据合并
 29 print("数据合并")
 30 trains = pd.concat([dataset1,dataset2],axis=0)
 31 trains = pd.concat([trains,dataset3],axis=0)
 32 trains = pd.concat([trains,dataset4],axis=0)
 33 
 34 online_test = dataset5
 35 
 36 ### 数据拆分
 37 print("数据拆分")
 38 train_xy,offline_test = train_test_split(trains, test_size = 0.2,random_state=21)
 39 train,val = train_test_split(train_xy, test_size = 0.2,random_state=21)
 40 
 41 print("训练集")
 42 y = train.is_trade                                                  # 训练集标签
 43 X = train.drop(['instance_id','is_trade'],axis=1)                   # 训练集特征矩阵
 44 
 45 print("验证集")
 46 val_y = val.is_trade                                                # 验证集标签
 47 val_X = val.drop(['instance_id','is_trade'],axis=1)                 # 验证集特征矩阵
 48 
 49 print("测试集")
 50 offline_test_X=offline_test.drop(['instance_id','is_trade'],axis=1) # 线下测试特征矩阵
 51 online_test_X=online_test.drop(['instance_id'],axis=1)              # 线上测试特征矩阵
 52 
 53 ### 数据转换
 54 lgb_train = lgb.Dataset(X, y, free_raw_data=False)
 55 lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train,free_raw_data=False)
 56 
 57 ### 开始训练
 58 print('设置参数')
 59 params = {
 60             'boosting_type': 'gbdt',
 61             'boosting': 'dart',
 62             'objective': 'binary',
 63             'metric': 'binary_logloss',
 64 
 65             'learning_rate': 0.01,
 66             'num_leaves':25,
 67             'max_depth':3,
 68 
 69             'max_bin':10,
 70             'min_data_in_leaf':8,
 71 
 72             'feature_fraction': 0.6,
 73             'bagging_fraction': 1,
 74             'bagging_freq':0,
 75 
 76             'lambda_l1': 0,
 77             'lambda_l2': 0,
 78             'min_split_gain': 0
 79 }
 80 
 81 print("开始训练")
 82 gbm = lgb.train(params,                     # 参数字典
 83                 lgb_train,                  # 训练集
 84                 num_boost_round=2000,       # 迭代次数
 85                 valid_sets=lgb_eval,        # 验证集
 86                 early_stopping_rounds=30)   # 早停系数
 87 ### 线下预测
 88 print ("线下预测")
 89 preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # 输出概率
 90 offline=offline_test[['instance_id','is_trade']]
 91 offline['preds']=preds_offline
 92 offline.is_trade = offline['is_trade'].astype(np.float64)
 93 print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))
 94 
 95 ### 线上预测
 96 print("线上预测")
 97 preds_online =  gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # 输出概率
 98 online=online_test[['instance_id']]
 99 online['preds']=preds_online
100 online.rename(columns={'preds':'predicted_score'},inplace=True)
101 online.to_csv("./data/20180405.txt",index=None,sep=' ')
102 
103 ### 保存模型
104 from sklearn.externals import joblib
105 joblib.dump(gbm,'gbm.pkl')
106 
107 ### 特征选择
108 df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
109 df['importance']=list(gbm.feature_importance())
110 df = df.sort_values(by='importance',ascending=False)
111 df.to_csv("./data/feature_score_20180405.csv",index=None,encoding='gbk')
posted @ 2018-03-26 21:22  wanglei5205  阅读(16330)  评论(0编辑  收藏  举报
levels of contents