Steps for a baseline LightGBM model
### Basic tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer  # Imputer was removed in scikit-learn 0.22

warnings.filterwarnings("ignore")

# scoring sample (overwritten below by the training sample)
os.chdir("C:/Users/my/Desktop/记录/网约车/流失模型")
driver = pd.read_excel('样本.xlsx')
y_test = driver["target"]
x_test = driver.drop(['a.driver_id', 'target'], axis=1)

# Feature engineering
# Step 1: fill missing values and encode categorical variables
str_encoder = LabelEncoder()  # encode the categorical variable
str_encoder.fit(driver["a.contract_company"])
driver["a.contract_company"] = str_encoder.transform(driver["a.contract_company"])

# Step 2: preliminary feature screening — drop variables whose share of
# informative values is too low, or where a single value dominates
# ValueLess = []
# for i in x_train.columns:
#     ValuePct = driver[driver[i] > 0][i].count() / driver[i].count()
#     if ValuePct < 0.05:
#         ValueLess.append(i)
# print(ValueLess, ValuePct)
#
# SameValue = []
# for i in x_train.columns:
#     SameValuePct = driver[i].value_counts().max() / driver[i].count()
#     if SameValuePct > 0.9:  # one value dominates the column
#         SameValue.append(i)
# print(SameValue, SameValuePct)
# driver = driver.drop(ValueLess, axis=1)
# driver = driver.drop(SameValue, axis=1)

select_col = ['vehicle_level', 'max_days', 'min_days', 'min_score', 'tendcy']

os.chdir("C:/Users/my/Desktop/模型/第四版/")
driver = pd.read_excel('8.8训练样本.xlsx')
y = driver["target"]
x = driver.drop(['a.driver_id', 'target'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)  # split the dataset

# impute missing values with the column mean (fit on the training split only);
# wrap the results back into DataFrames so column names survive for later use
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x_train)
x_train = pd.DataFrame(imp.transform(x_train), columns=x.columns)
x_test = pd.DataFrame(imp.transform(x_test), columns=x.columns)

# Step 3: variable analysis
fig = plt.figure()
fig.set(alpha=0.2)
# make matplotlib render Chinese labels correctly
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# look at variable distributions
plt.subplot2grid((2, 2), (0, 0))
x_train['max_days'].plot(kind="kde", grid=True)
plt.title('max_days')
plt.show()

plt.subplot2grid((2, 2), (0, 1))
x_train['rest_rate'].plot(kind="kde", grid=True)
plt.title('rest_rate')
plt.show()
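The commented-out screening loops above implement a low-information filter by hand; scikit-learn's VarianceThreshold captures the same idea for near-constant columns. A minimal sketch (the 0.01 cutoff is an assumption to tune, and the fillna is only there because the selector rejects NaN):

from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)      # drop columns with variance <= 0.01 (hypothetical cutoff)
x_reduced = selector.fit_transform(x.fillna(0))   # VarianceThreshold does not accept NaN
kept_cols = x.columns[selector.get_support()]
print(kept_cols)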
Variable distribution
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "distributions")))
# example call on an illustrative dataset (df_train.Age is not from the driver data)
sns.distplot(df_train.Age, kde=True, bins=20, rug=True)
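Heads-up: `distplot` was deprecated in seaborn 0.11 and removed in recent releases, so on a current install the equivalent is roughly:

sns.histplot(df_train.Age, kde=True, bins=20)  # histplot + kde replaces distplot
sns.rugplot(df_train.Age)                      # the rug is now a separate call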
# Pearson correlation of each feature with the target, ascending
driver.corr()['target'].sort_values(ascending=True).to_frame('value')
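Beyond correlation with the target, a quick pairwise heatmap over the shortlisted columns helps spot collinear pairs before the VIF check below; a sketch reusing `select_col`:

plt.figure(figsize=(8, 6))
sns.heatmap(driver[select_col + ['target']].corr(), annot=True, cmap='coolwarm', center=0)
plt.show()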
# Multicollinearity check (VIF over the shortlisted columns)
vif_x = np.asarray(x[select_col])
vif_list = [variance_inflation_factor(vif_x, i) for i in range(len(select_col))]  # iterate over the selected columns, not all of x
print(max(vif_list))

# Convenience evaluation function
def model_evaluate(model, x, y):
    y_prob = model.predict_proba(x)[:, 1]  # ROC-based metrics need scores, not hard labels
    fpr, tpr, _ = roc_curve(y, y_prob)
    ks = abs(fpr - tpr).max()
    auc = roc_auc_score(y, y_prob)
    print('ks:', ks)
    print('auc:', auc)

# Step 4: modeling
# GBDT for comparison
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(learning_rate=0.05, subsample=0.6, min_samples_split=90,
                                n_estimators=50, min_samples_leaf=10,
                                max_depth=15, max_features=15, random_state=10)
gb_model = gb.fit(x_train, y_train)
model_evaluate(gb_model, x_train, y_train)
model_evaluate(gb_model, x_test, y_test)

# XGBoost
import xgboost as xgb
xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1,
                                   subsample=0.7, colsample_bytree=0.7)
xgb_classifier.fit(x_train, y_train)
model_evaluate(xgb_classifier, x_train, y_train)
model_evaluate(xgb_classifier, x_test, y_test)

xgb_y_train_prob = xgb_classifier.predict_proba(x_train)[:, 1]
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
print("train_ks:", xgb_train_ks)
print("train_auc:", xgb_train_auc)

# LightGBM (named lgb_clf to avoid shadowing the lgb module)
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                             max_depth=2, n_estimators=800, objective='binary',
                             subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                             learning_rate=0.05, min_child_weight=50,
                             random_state=None, n_jobs=-1)  # n_estimators already sets the iteration count
lgb_clf.fit(x_train, y_train)
model_evaluate(lgb_clf, x_train, y_train)
model_evaluate(lgb_clf, x_test, y_test)

# Randomized hyperparameter search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

gb = GradientBoostingClassifier(learning_rate=0.02, subsample=0.6, min_samples_split=70,
                                n_estimators=200, min_samples_leaf=40,
                                max_depth=4, max_features='sqrt', random_state=10)
gbParams = {'loss': ['deviance', 'exponential'],
            'n_estimators': randint(10, 500),
            'max_depth': randint(1, 20),
            'subsample': [0.5, 0.6, 0.7, 0.8],
            'min_samples_split': range(10, 101, 10),
            'min_samples_leaf': range(5, 51, 5),
            'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
            'max_features': randint(1, 20)}
randomizedSearchGB = RandomizedSearchCV(estimator=gb, param_distributions=gbParams,
                                        n_iter=10, scoring='roc_auc', cv=None,
                                        verbose=2).fit(x_train, y_train)
print(randomizedSearchGB.best_params_, randomizedSearchGB.best_score_)
bestGb = randomizedSearchGB.best_estimator_.fit(x_train, y_train)
model_evaluate(bestGb, x_train, y_train)

# Quick look at classification metrics (these take hard labels)
xgb_y_train_pred = xgb_classifier.predict(x_train)
print('Precision: %.3f' % precision_score(y_true=y_train, y_pred=xgb_y_train_pred))
print('Recall: %.3f' % recall_score(y_true=y_train, y_pred=xgb_y_train_pred))
print('F1: %.3f' % f1_score(y_true=y_train, y_pred=xgb_y_train_pred))

# Feature importance — tree models expose feature_importances_
gb_importance = pd.DataFrame({'cols': x_train.columns,
                              'gb': gb_model.feature_importances_}).sort_values('gb', ascending=False)
gb_importance

# Save the model
import pickle
folderOfData = "C:/Users/my/Desktop/模型/"
with open(folderOfData + 'bestGb.pkl', 'wb') as saveModel:
    pickle.dump(bestGb, saveModel)

# Load the model back
with open(folderOfData + 'bestGb.pkl', 'rb') as modelFile:
    gb = pickle.load(modelFile)

# Convert probability to score
def Prob2Score(prob, basePoint, PDO):
    # map a probability to a positive score: higher probability -> lower score
    y = np.log(prob / (1 - prob))
    return basePoint + PDO / np.log(2) * (-y)
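A quick sanity check of the scaling: with basePoint = 300 and PDO = 100, a probability of 0.5 (odds of 1:1) maps exactly to the base point, and every doubling of the odds lowers the score by one PDO:

for p in (0.5, 2/3, 0.8):                     # odds = 1, 2, 4
    print(p, round(Prob2Score(p, 300, 100)))  # -> 300, 200, 100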
basePoint = 300
PDO = 100
xgb_y_pred = xgb_classifier.predict_proba(x_test)[:, 1]  # test-set probabilities for scoring
prob = pd.DataFrame({'prob': xgb_y_pred, 'y_test': y_test})
prob['score'] = prob['prob'].map(lambda p: Prob2Score(p, basePoint, PDO))
plt.style.use('seaborn')
plt.hist(prob['score'], 100)
plt.xlabel('score')
plt.ylabel('freq')
plt.title('distribution')
plt.show()

# Extract rules with a decision tree
from sklearn import tree
dtree = tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=7, min_samples_split=18)
dtree = dtree.fit(x, y)

import pydotplus
from io import StringIO  # sklearn.externals.six was removed in newer scikit-learn; io.StringIO works the same
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data, feature_names=x.columns,
                     class_names=['0', '1'], filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("loss.pdf")
print('Visible tree plot saved as pdf.')
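If Graphviz is not installed, scikit-learn (0.21+) can dump the same tree rules as plain text, which is often enough for rule extraction:

from sklearn.tree import export_text
print(export_text(dtree, feature_names=list(x.columns)))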
### Bayesian-optimized model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import lightgbm as lgb
from bayes_opt import BayesianOptimization

os.chdir('C:/Users/my/Desktop/')
data = pd.read_excel('训练数据.xlsx', sheet_name='Sheet1')
print(data.columns)
y = data["label"]
x = data.drop(['passenger_id', 'label'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)  # split the dataset
print(x_train.shape)
print(y_train.shape)

# XGBoost baseline (named xgb_clf to avoid shadowing the xgb module)
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', learning_rate=0.1,
                            max_depth=3, min_child_weight=1, n_estimators=100,
                            n_jobs=1, objective='binary:logistic', random_state=0,
                            reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7)
xgb_clf.fit(x_train, y_train)
xgb_y_train_prob = xgb_clf.predict_proba(x_train)[:, 1]  # scores, not labels, for ROC-based metrics
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
print("train_ks:", xgb_train_ks)
print("train_auc:", xgb_train_auc)

def GBM_evaluate(min_child_samples, learning_rate, n_estimators, min_child_weight,
                 num_leaves, colsample_bytree, max_depth, subsample, reg_alpha, reg_lambda):
    """Custom evaluation function: mean AUC over 5-fold cross-validation.
    Note that BayesianOptimization maximizes the returned value."""
    val = cross_val_score(
        lgb.LGBMClassifier(objective='binary', metric='auc', random_state=2018,
                           learning_rate=float(learning_rate),
                           n_estimators=int(n_estimators),
                           max_depth=int(max_depth),
                           num_leaves=int(num_leaves),
                           min_child_samples=int(min_child_samples),
                           subsample=float(subsample),
                           colsample_bytree=float(colsample_bytree),
                           reg_alpha=reg_alpha,
                           reg_lambda=reg_lambda,
                           min_child_weight=min_child_weight,
                           class_weight='balanced'),
        x_train, y_train, scoring='roc_auc', cv=5).mean()
    return val

# search ranges
adj_params = {'min_child_weight': (3, 20),
              'colsample_bytree': (0.4, 1),
              'n_estimators': (100, 300),
              'learning_rate': (0.05, 0.2),
              'max_depth': (5, 15),
              'num_leaves': (10, 50),
              'subsample': (0.5, 1),
              'reg_lambda': (0.1, 1),
              'reg_alpha': (0.1, 1),
              'min_child_samples': (10, 30)}

# run the Bayesian optimization
num_iter = 25
init_points = 5
bayes = BayesianOptimization(GBM_evaluate, adj_params)
bayes.maximize(init_points=init_points, n_iter=num_iter)
params = bayes.max
print(params)
# Example results from two runs:
# {'target': 0.7452465518984774, 'params': {'colsample_bytree': 0.863774165376339,
#   'learning_rate': 0.05000062849693596, 'max_depth': 6.20154732653672,
#   'min_child_samples': 29.985852121149026, 'min_child_weight': 6.810125687159286,
#   'n_estimators': 170.32415049570488, 'num_leaves': 10.403716972233827,
#   'reg_alpha': 0.999999999999874, 'reg_lambda': 0.10000005514579893,
#   'subsample': 0.7261106692459622}}
# {'target': 0.752230340011879, 'params': {'colsample_bytree': 0.6766116352832452,
#   'learning_rate': 0.08410079723412914, 'max_depth': 6.009908969461344,
#   'min_child_samples': 10.45373385991692, 'min_child_weight': 5.299569525386938,
#   'n_estimators': 100.33382248028828, 'num_leaves': 10.861841362739199,
#   'reg_alpha': 0.7515529745843912, 'reg_lambda': 0.9773103767283371,
#   'subsample': 0.6742906352043163}}

# refit with the (rounded) best parameters from the second run;
# n_estimators already fixes the number of boosting rounds
lgbm = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                          colsample_bytree=0.67, learning_rate=0.08, max_depth=6,
                          min_child_samples=10, min_child_weight=5.3, n_estimators=100,
                          num_leaves=10, reg_alpha=0.75, subsample_freq=1, reg_lambda=0.9,
                          subsample=0.67, random_state=None, n_jobs=-1, class_weight='balanced')
lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)])

# train metrics
y_train_prob = lgbm.predict_proba(x_train)[:, 1]
y_train_pred = lgbm.predict(x_train)
fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y_train, y_train_prob)
lgb_train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
lgb_train_auc = roc_auc_score(y_train, y_train_prob)
print("train_ks:", lgb_train_ks)
print("train_auc:", lgb_train_auc)
print('precision:', precision_score(y_train, y_train_pred))
print('recall:', recall_score(y_train, y_train_pred))

# test metrics
y_test_prob = lgbm.predict_proba(x_test)[:, 1]
y_test_pred = lgbm.predict(x_test)
fpr_lgb_test, tpr_lgb_test, _ = roc_curve(y_test, y_test_prob)
lgb_test_ks = abs(fpr_lgb_test - tpr_lgb_test).max()
lgb_test_auc = roc_auc_score(y_test, y_test_prob)
print("test_ks:", lgb_test_ks)
print("test_auc:", lgb_test_auc)
print('precision:', precision_score(y_test, y_test_pred))
print('recall:', recall_score(y_test, y_test_pred))

# convert probability to score
def Prob2Score(prob, basePoint, PDO):
    # map a probability to a positive score: higher probability -> lower score
    y = np.log(prob / (1 - prob))
    return basePoint + PDO / np.log(2) * (-y)

y_pred_test = lgbm.predict_proba(x_test)[:, 1]
basePoint = 300
PDO = 100
prob = pd.DataFrame({'prob': y_pred_test, 'y_test': y_test})
prob['score'] = prob['prob'].map(lambda p: Prob2Score(p, basePoint, PDO))
plt.style.use('seaborn')
plt.hist(prob['score'], 100)
plt.xlabel('score')
plt.ylabel('freq')
plt.title('distribution')
plt.show()
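One gotcha when reusing the search result: `bayes.max['params']` returns every value as a float (see the commented output above), so integer-valued parameters have to be cast back before the final fit. A small helper, assuming the `bayes` object from the search:

best = bayes.max['params']
int_keys = {'n_estimators', 'max_depth', 'num_leaves', 'min_child_samples'}
final_params = {k: (int(round(v)) if k in int_keys else v) for k, v in best.items()}
final_model = lgb.LGBMClassifier(objective='binary', class_weight='balanced', **final_params)
final_model.fit(x_train, y_train)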
# Tune n_estimators over a search range, scoring each value by validation KS
# with a penalty for the train/validation gap (assumes train_x/train_y,
# test_x/test_y and val_x/val_y splits are already defined)
min_value = 40
max_value = 60

def lgb_test(train_x, train_y, test_x, test_y, n_estimators):
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             metric='auc',
                             learning_rate=0.1,
                             n_estimators=n_estimators,
                             max_depth=5,
                             num_leaves=20,
                             max_bin=45,
                             min_data_in_leaf=6,
                             bagging_fraction=0.6,
                             bagging_freq=0,
                             feature_fraction=0.8,
                             silent=True)
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)], eval_metric='auc')
    return clf, clf.best_score_['valid_1']['auc']

best_omd = -1   # initialize outside the loop so the best value survives across iterations
best_value = -1
best_ks = []
for value in range(min_value, max_value + 1):
    lgb_model, lgb_auc = lgb_test(train_x, train_y, test_x, test_y, value)

    y_pred = lgb_model.predict_proba(x)[:, 1]
    fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y, y_pred)
    train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()

    y_pred = lgb_model.predict_proba(val_x)[:, 1]
    fpr_lgb, tpr_lgb, _ = roc_curve(val_y, y_pred)
    val_ks = abs(fpr_lgb - tpr_lgb).max()

    omd = val_ks + 0.8 * (val_ks - train_ks)  # validation KS minus a penalty for the train/validation gap
    if omd > best_omd:
        best_omd = omd
        best_value = value
        best_ks = [train_ks, val_ks]

print('best_value:', best_value)
print('best_ks:', best_ks)
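An alternative to hand-rolling this search over n_estimators is LightGBM's built-in early stopping, which keeps adding trees until the validation AUC stops improving; a minimal sketch (the 50-round patience is an assumption; on older lightgbm versions pass early_stopping_rounds=50 to fit instead of the callback):

clf = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.1, objective='binary')
clf.fit(train_x, train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=50)])  # stop once validation AUC stalls
print('best iteration:', clf.best_iteration_)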
Genius is one percent inspiration and ninety-nine percent perspiration, but that one percent of inspiration is often more important than the ninety-nine percent of perspiration.