LightGBM and a Loan Default Prediction Project

LightGBM


  1. Histogram algorithm
    • Discretize continuous floating-point values into k discrete bins and build a histogram of width k; split points are then searched over the k bins instead of over every raw value (see the sketch after this list)
  2. Leaf-wise growth strategy
    • At each step, split the leaf with the largest split gain among all current leaves, which is usually also the one holding the most data
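
    A minimal numpy sketch of the binning idea (illustrative only; the bin edges here are simple quantiles, not LightGBM's actual binning logic):

	import numpy as np

	values=np.random.rand(1000)                      # a continuous feature
	k=255                                            # number of bins (LightGBM's default max_bin is 255)
	edges=np.quantile(values,np.linspace(0,1,k+1))   # bin boundaries
	bins=np.searchsorted(edges[1:-1],values)         # map each value to a bin index in [0, k)
	# split finding now scans k candidate thresholds instead of every distinct raw value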

  1. Parameters

    • num_leaves: number of leaf nodes, <=2^(max_depth); e.g. with max_depth=6 the cap is 2^6 = 64 leaves, so num_leaves=31 stays safely below it
    • max_depth: maximum tree depth [1]
  2. Parameter tuning

    1. hyperopt: automatically searches for the best hyperparameters.
      pip install hyperopt
    	import lightgbm as lgb
    	import hyperopt
    
    	def hyperopt_objective(params):
    	    # binary classification task, so use LGBMClassifier;
    	    # the evaluation metric is passed to lgb.cv via metrics="auc"
    	    model=lgb.LGBMClassifier(
    	        num_leaves=31,learning_rate=0.1,n_estimators=int(params["n_estimators"]),
    	        max_depth=int(params["max_depth"]),objective="binary")
    	    res=lgb.cv(model.get_params(),train_matrix,nfold=5,early_stopping_rounds=10,metrics="auc")
    	    return -max(res["auc-mean"])
    
    

    Define an objective function hyperopt_objective; since fmin searches for a minimum, return -auc so that maximizing AUC becomes minimizing the objective.

    
    	params_space={
    	    "n_estimators":hyperopt.hp.randint("n_estimators",300),
    	    "max_depth":hyperopt.hp.randint("max_depth",8)
    	}
    

    Define the search space:

     hp.uniform(label,low,high): the parameter is uniformly distributed between low and high;
    
     hp.quniform(label,low,high,q): the parameter takes the value round(uniform(low,high)/q)*q, suited to discrete-valued parameters;
    
     hp.randint(label,upper): returns a random integer in the half-open interval [0, upper).
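
     Before running fmin, you can draw random samples from a space to see what it generates; a small sketch using hyperopt's stochastic sampler (the example output is illustrative):

    	from hyperopt.pyll import stochastic
    	print(stochastic.sample(params_space))  # e.g. {'max_depth': 5, 'n_estimators': 112}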
    
    
    	trials=hyperopt.Trials()
    	best=hyperopt.fmin(hyperopt_objective,space=params_space,algo=hyperopt.tpe.suggest,
    	                   max_evals=10,trials=trials)
    

    Run the search over the search space.
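
    fmin returns the best point found as a dict of raw values; since hp.randint already yields integers, best can be plugged straight into the final model. For spaces built from expressions such as hp.choice, hyperopt.space_eval(params_space, best) maps the raw result back to the actual parameter values.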

  3. Alibaba Tianchi competition: Financial Risk Control - Loan Default Prediction

    1. Import packages

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from IPython.core.interactiveshell import InteractiveShell
	InteractiveShell.ast_node_interactivity = "all"  # display the output of every expression, not just the last one


    2. Import data
	
	train=pd.read_csv("/风控/train (1).csv")
	train.head()  # shows the first five rows by default
	train.shape
	train.columns  # inspect the field names
    3. Distinguish discrete and continuous variables

	numerical_columns=['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership',
	    'annualIncome', 'verificationStatus',
	    'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
	    'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
	    'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
	    'initialListStatus', 'applicationType', 'title',
	    'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
	    'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

	def featype(data,feature):
	    numerical_continus=[]
	    numerical_discrete=[]
	    for fea in feature:
	        count=data[fea].nunique()  # number of unique values
	        if count>10:  # more than 10 distinct values -> treat as continuous
	            numerical_continus.append(fea)
	        else:
	            numerical_discrete.append(fea)
	    return numerical_continus,numerical_discrete

	numerical_continus,numerical_discrete=featype(train,numerical_columns)
	numerical_continus
	numerical_discrete
    4. Handle string variables with apply

	employmentlength_dict={"10+ years":10,"2 years":2,"< 1 year":0,"1 year":1,"5 years":5,"4 years":4,"6 years":6,"8 years":8,"7 years":7,"9 years":9,"3 years":3}
	def func4(m):
	    m["employmentlength_dict"]=m["employmentLength"].apply(lambda x:x if x not in employmentlength_dict else employmentlength_dict[x])
	    return m
	train=func4(train)

	def func(x):
	    # "Aug-2001" -> a single month count: year*12 + month
	    month,year=x.split("-")
	    month_dict={"Aug":8,"Nov":11,"Feb":2,"Jan":1,"Mar":3,"Jul":7,"Oct":10,"Jun":6,"Apr":4,"Sep":9,"May":5,"Dec":12}
	    month_num=month_dict[month]
	    earlistdate=int(year)*12+int(month_num)
	    return earlistdate

	train["earlistdate"]=train["earliesCreditLine"].apply(lambda x:func(x))
	
	def func2(x):
	    # "2014-07-01" -> year*12 + month (the day is dropped)
	    year,month,day=x.split("-")
	    final_date=int(year)*12+int(month)
	    return final_date

	train["issueDate_dict"]=train["issueDate"].apply(lambda x:func2(x))
	
	train[["subGrade","interestRate","grade"]].corr()#subgrade grade 为object
	
	def func3(x):
	    # ordinal-encode subGrade and grade by their sorted order
	    tmp=x[["subGrade"]].sort_values(["subGrade"]).drop_duplicates()
	    tmp["subgrade_dict"]=range(len(tmp))
	    x=x.merge(tmp,on="subGrade",how="left")
	    tmp1=x[["grade"]].sort_values(["grade"]).drop_duplicates()
	    tmp1["grade_dict"]=range(len(tmp1))
	    x=x.merge(tmp1,on="grade",how="left")
	    return x

	train=func3(train)
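
As a side note, the same sorted ordinal encoding can be written more compactly with pandas' Categorical codes; a minimal equivalent sketch (one difference: cat.codes maps NaN to -1, while the merge above leaves NaN):

	train["subgrade_dict"]=train["subGrade"].astype("category").cat.codes  # categories are sorted lexicographically
	train["grade_dict"]=train["grade"].astype("category").cat.codes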

    5. Fill missing values: mode for discrete variables, median for continuous ones

	# proportion of missing values per column
	d=(train.isnull().sum()/train.shape[0]).to_dict()
	d
	(train.isnull().sum()/train.shape[0]).plot.bar()

	train[numerical_continus]=train[numerical_continus].fillna(value=train[numerical_continus].median())
	# mode() returns a DataFrame (there may be ties), so take its first row to get one value per column
	train[numerical_discrete]=train[numerical_discrete].fillna(value=train[numerical_discrete].mode().iloc[0])

	category=["employmentlength_dict","subgrade_dict","grade_dict","issueDate_dict",'earlistdate']
	train[category]=train[category].fillna(train[category].mode().iloc[0])
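
A quick sanity check that the fills covered everything (all of the listed columns should now be complete):

	train[numerical_continus+numerical_discrete+category].isnull().sum().sum()  # expect 0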
    6. Hyperparameter optimization
	
	import lightgbm as lgb
	from sklearn.model_selection import train_test_split

	x=train.drop(["grade","subGrade","employmentLength","issueDate","earliesCreditLine","isDefault"],axis=1)
	y=train["isDefault"]

	# train/validation split
	x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
	train_matrix = lgb.Dataset(x_train, label=y_train)
	valid_matrix = lgb.Dataset(x_val, label=y_val)

	import hyperopt
	def hyperopt_objective(params):
	    # binary classification task, so use LGBMClassifier; the metric goes to lgb.cv via metrics="auc"
	    model=lgb.LGBMClassifier(num_leaves=31,learning_rate=0.1,n_estimators=int(params["n_estimators"]),max_depth=int(params["max_depth"]),objective="binary")
	    res=lgb.cv(model.get_params(),train_matrix,nfold=5,early_stopping_rounds=10,metrics="auc")
	    return -max(res["auc-mean"])

	params_space={
	    "n_estimators":hyperopt.hp.randint("n_estimators",300),
	    "max_depth":hyperopt.hp.randint("max_depth",8)
	}
	trials=hyperopt.Trials()
	best=hyperopt.fmin(hyperopt_objective,space=params_space,algo=hyperopt.tpe.suggest,max_evals=10,trials=trials)

	print(best)
	# {'max_depth': 6, 'n_estimators': 237}

    7. Train the model

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,     # a small learning rate paired with a large number of trees
    'n_estimators': 237,      # alias of num_iterations, so it overrides num_boost_round below
    'metric': 'auc',
    'num_leaves': 31,
    'max_depth': 6,           # cap the tree depth to curb overfitting
    'feature_fraction': 1,    # use all features when growing each tree
    'bagging_fraction': 1,    # use all rows in each iteration
}



"""使用训练集数据进行模型训练"""

model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
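
To sanity-check the fit, the held-out validation split can be scored directly; a small sketch (model.best_iteration is set because early stopping is enabled):

	from sklearn.metrics import roc_auc_score

	val_pred = model.predict(x_val, num_iteration=model.best_iteration)  # predicted default probabilities
	print("validation AUC:", roc_auc_score(y_val, val_pred))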

    8. Prediction

test=pd.read_csv("C:/Users/廖言/Desktop/新建文件夹/努力学习天天向上/风控/testA.csv")
test.head()
# apply the same preprocessing as on the training set
test=func4(test)
test["earlistdate"]=test["earliesCreditLine"].apply(lambda x:func(x))
test["issueDate_dict"]=test["issueDate"].apply(lambda x:func2(x))
test=func3(test)
# fill missing values
test[numerical_continus]=test[numerical_continus].fillna(value=test[numerical_continus].median())
test[numerical_discrete]=test[numerical_discrete].fillna(value=test[numerical_discrete].mode().iloc[0])
test[category]=test[category].fillna(test[category].mode().iloc[0])
test2=test.drop(["grade","subGrade","employmentLength","issueDate","earliesCreditLine"],axis=1)
# predict default probabilities
test["isDefault"]=model.predict(test2)
# write the results
test.to_csv("C:/Users/廖言/Desktop/新建文件夹/努力学习天天向上/风控/output3.csv")
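
Note that to_csv above writes every column plus the index; the Tianchi submission format typically needs only id and isDefault, along these lines (the output file name here is hypothetical):

	test[["id","isDefault"]].to_csv("submission.csv",index=False)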

  1. Tree depth is the number of nodes on the path from the root node to a leaf node; a leaf node is one with no left or right child.
    - n_estimators: number of boosting iterations = number of trees
    - bagging_fraction: fraction of the data used in each iteration; a smaller fraction speeds up training and reduces overfitting
    - feature_fraction: fraction of the features used in each iteration
    - min_data_in_leaf: minimum number of samples in a leaf; setting a larger value helps counter overfitting ↩︎
