基于遗传算法对lightGBM调参优化

#coding=utf-8
from __future__ import division
import numpy as np
import pandas as pd
import random
import math
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
 
import lightgbm as lgb
from random import randint
# from xgboost.sklearn import XGBClassifiers
 
 
'''
群体大小,一般取20~100;终止进化代数,一般取100~500;交叉概率,一般取0.4~0.99;变异概率,一般取0.0001~0.1。
'''
# generations = 400   # 繁殖代数 100
pop_size = 500      # 种群数量  500
# max_value = 10      # 基因中允许出现的最大值 (可防止离散变量数目达不到2的幂的情况出现,限制最大值,此处不用) 
chrom_length = 15    # 染色体长度  
pc = 0.6            # 交配概率  
pm = 0.01           # 变异概率  
results = []      # 存储每一代的最优解,N个三元组(auc最高值, n_estimators, max_depth)  
fit_value = []      # 个体适应度  
fit_mean = []       # 平均适应度 
# pop = [[0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] for i in range(pop_size)] # 初始化种群中所有个体的基因初始序列
 
random_seed = 20
cons_value = 0.19 / 31 # (0.20-0.01)/ (32 - 1)
 
 
 
'''要调试的参数有:(参考:http://xgboost.readthedocs.io/en/latest/parameter.html)
   tree_num:基树的棵数   ----------------(要调的参数)
   eta: 学习率(learning_rate),默认值为0.3,范围[0,1]  ----------------(要调的参数)
   max_depth: 最大树深,默认值为6   ----------------(要调的参数)
   min_child_weight:默认值为1,范围[0, 正无穷],该参数值越小,越容易 overfitting,当它的值较大时,可以避免模型学习到局部的特殊样本。 ----------(要调的参数)
   gamma:默认值为0,min_split_loss,范围[0, 正无穷]
   subsample:选择数据集百分之多少来训练,可以防止过拟合。默认值1,范围(0, 1],理想值0.8
   colsample_bytree:subsample ratio of columns when constructing each tree,默认值1,范围(0, 1],理想值0.8,太小的值会造成欠拟合
   lambda:L2 regularization term on weights, increase this value will make model more conservative.参数值越大,模型越不容易过拟合
   alpha:L1 regularization term on weights, increase this value will make model more conservative.参数值越大,模型越不容易过拟合
   上述参数,要调的有4个,其他的采用理想值就可以
   tree_num: [10、 20、 30、......150、160] 用4位二进制, 0000代表10
   eta: [0.01, 0.02, 0.03, 0.04, 0.05, ...... 0.19, 0.20]   0.2/0.01=20份,用5位二进制表示足够(2的4次方<20<2的5次方)
       00000 -----> 0.01
       11111 -----> 0.20
       0.01 + 对应十进制*(0.20-0.01)/ (2的5次方-1)
   max_depth:[3、4、5、6、7、8、9、10]   用3位二进制
   min_child_weight: [1, 2, 3, 4, 5, 6, 7, 8]  用3位二进制
   示例:   0010,         01001,               010,      110  (共15位)
         tree_num         eta               max_depth  min_child_weight
        (1+2)*10=30  0.01+9*0.005939=0.06       3+2=5      1+6=7
'''
#定义评价函数rmspe    均方根对数误差
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))
 
def xgboostModel(tree_num, eta, max_depth, min_child_weight, random_seed):
 
    #---------------------------数据准备------------------------
    data = pd.read_csv('UnFeature.csv',low_memory=False)
    #因为销售额为0的记录不计入评分,所以只采用店铺为开,且销售额大于0的数据进行训练
    data = data[data["Open"] != 0]
    data = data[data["Sale"] > 0]
 
    data = data[:500000]
    
    data.fillna(0, inplace=True)
 
    #不带特征工程
    data.drop(['Date','Open','PromoInterval','monthStr','Customers'],axis=1,inplace =True)
 
    #不带特征工程
    
    #data.drop(['Date','Customers','Open','PromoInterval','monthStr'],axis=1,inplace =True)
    target = np.log1p(data.Sale)
    data.drop(['Sale'],axis=1,inplace =True)
    X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
    #--------------------------------------------------------------
 
    # 创建成lgb特征的数据集格式
    lgb_train = lgb.Dataset(X_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # 创建验证数据
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',  # 设置提升类型
        'objective': 'regression', # 目标函数
        'early_stopping_rounds': 100,
        'eval_metric': 'rmse',  # 评估函数
        #'num_leaves': 31,   # 叶子节点数
        'learning_rate': eta,  # 学习速率
        'feature_fraction': 0.8, # 建树的特征选择比例
        'bagging_fraction': 0.8, # 建树的样本采样比例
        #'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
        'verbose': 1, # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
        'max_depth': max_depth, 
        'min_child_weight': min_child_weight,
        'seed': randint(1,10)
    }
    print('Start training...')
    # 训练 cv and train              
    model = lgb.train(params,lgb_train,num_boost_round=tree_num,valid_sets=lgb_eval) # 训练数据需要参数列表和数据集
    yhat = model.predict(X_test, num_iteration=model.best_iteration)
    error =  rmspe( np.expm1(yhat), np.expm1(y_test))
 
    return error
 
 
 
def loadFile(filePath):
    fileData = pd.read_csv(filePath)
    return fileData
 
 
# Step 1 : 对参数进行编码(用于初始化基因序列,可以选择初始化基因序列,本函数省略)
def geneEncoding(pop_size, chrom_length):  
    pop = [[]]
    for i in range(pop_size):
        temp = []
        for j in range(chrom_length):
            temp.append(random.randint(0, 1))
        pop.append(temp)
    return pop[1:]
 
# Step 2 : 计算个体的目标函数值
def cal_obj_value(pop):
    objvalue = []
    variable = decodechrom(pop)
    for i in range(len(variable)):
        tempVar = variable[i]
 
        tree_num_value = (tempVar[0] + 1)* 70   #*10
        eta_value = 0.01 + tempVar[1] * cons_value
        max_depth_value = 3 + tempVar[2]
        min_child_weight_value = 1 + tempVar[3]
 
        #aucValue = xgboostModel(tree_num_value, eta_value, max_depth_value, min_child_weight_value, random_seed)
        error = xgboostModel(tree_num_value, eta_value, max_depth_value, min_child_weight_value, random_seed)
       
        #objvalue.append(aucValue)
        objvalue.append(error)
    return objvalue #目标函数值objvalue[m] 与个体基因 pop[m] 对应 
 
 
# 对每个个体进行解码,并拆分成单个变量,返回 tree_num(4)、eta(5)、max_depth(3)、min_child_weight(3)
def decodechrom(pop):
    variable = []
    for i in range(len(pop)):
        res = []
        
        # 计算第一个变量值,即 0101->10(逆转)
        temp1 = pop[i][0:4]
        v1 = 0
        for i1 in range(4):
            v1 += temp1[i1] * (math.pow(2, i1))
        res.append(int(v1))
        
        # 计算第二个变量值
        temp2 = pop[i][4:9]
        v2 = 0
        for i2 in range(5):
            v2 += temp2[i2] * (math.pow(2, i2))
        res.append(int(v2))
 
        # 计算第三个变量值
        temp3 = pop[i][9:12]
        v3 = 0
        for i3 in range(3):
            v3 += temp3[i3] * (math.pow(2, i3))
        res.append(int(v3))
 
        # 计算第四个变量值
        temp4 = pop[i][12:15]
        v4 = 0
        for i4 in range(3):
            v4 += temp4[i4] * (math.pow(2, i4))
        res.append(int(v4))
 
        variable.append(res)
    return variable
 
 
# Step 3: 计算个体的适应值(计算最大值,于是就淘汰负值就好了)
def calfitvalue(obj_value):
    fit_value = []
    temp = 1.0  #0.0
    Cmin = 0
    for i in range(len(obj_value)):
        if(obj_value[i] + Cmin < 1): #>0
            temp = Cmin + obj_value[i]
        else:
            temp = 1.0
        fit_value.append(temp)
    return fit_value
 
 
# Step 4: 找出适应函数值中最大值,和对应的个体
def best(pop, fit_value):
    best_individual = pop[0]
    best_fit = fit_value[0]
    for i in range(1, len(pop)):
        #if(fit_value[i] > best_fit):
        if(fit_value[i] < best_fit):
            best_fit = fit_value[i]
            best_individual = pop[i]
    return [best_individual, best_fit]
 
 
# Step 5: 每次繁殖,将最好的结果记录下来(将二进制转化为十进制)
def b2d(best_individual):
    # 计算第一个变量值
    temp1 = best_individual[0:4]
    v1 = 0
    for i1 in range(4):
        v1 += temp1[i1] * (math.pow(2, i1))
    v1 = (v1 + 1) * 70    #*10
    
    # 计算第二个变量值
    temp2 = best_individual[4:9]
    v2 = 0
    for i2 in range(5):
        v2 += temp2[i2] * (math.pow(2, i2))
    v2 = 0.01 + v2 * cons_value
 
    # 计算第三个变量值
    temp3 = best_individual[9:12]
    v3 = 0
    for i3 in range(3):
        v3 += temp3[i3] * (math.pow(2, i3))
    v3 = 3 + v3
 
    # 计算第四个变量值
    temp4 = best_individual[12:15]
    v4 = 0
    for i4 in range(3):
        v4 += temp4[i4] * (math.pow(2, i4))
    v4 = 1 + v4
 
    return int(v1), float(v2), int(v3), int(v4)
 
 
# Step 6: 自然选择(轮盘赌算法)
def selection(pop, fit_value):
    # 计算每个适应值的概率
    new_fit_value = []
    total_fit = sum(fit_value)
    for i in range(len(fit_value)):
        new_fit_value.append(fit_value[i] / total_fit)
    # 计算每个适应值的累积概率
    cumsum(new_fit_value)
    # 生成随机浮点数序列
    ms = []
    pop_len = len(pop)
    for i in range(pop_len):
        ms.append(random.random())
    # 对生成的随机浮点数序列进行排序
    ms.sort()
    # 轮盘赌算法(选中的个体成为下一轮,没有被选中的直接淘汰,被选中的个体代替)
    fitin = 0
    newin = 0
    newpop = pop
    while newin < pop_len:
        if(ms[newin] < new_fit_value[fitin]):
            newpop[newin] = pop[fitin]
            newin = newin + 1
        else:
            fitin = fitin + 1
    pop = newpop
 
# 求适应值的总和
def sum(fit_value):
    total = 0
    for i in range(len(fit_value)):
        total += fit_value[i]
    return total
 
# 计算累积概率
def cumsum(fit_value):
    temp=[]
    for i in range(len(fit_value)):
        t = 0
        j = 0
        while(j <= i):
            t += fit_value[j]
            j = j + 1
        temp.append(t)
    for i in range(len(fit_value)):
        fit_value[i]=temp[i]
 
# Step 7: 交叉繁殖
def crossover(pop, pc): #个体间交叉,实现基因交换
    poplen = len(pop)
    for i in range(poplen - 1):
        if(random.random() < pc):
            cpoint = random.randint(0,len(pop[0]))
            temp1 = []
            temp2 = []
            temp1.extend(pop[i][0 : cpoint])
            temp1.extend(pop[i+1][cpoint : len(pop[i])])
            temp2.extend(pop[i+1][0 : cpoint])
            temp2.extend(pop[i][cpoint : len(pop[i])])
            pop[i] = temp1
            pop[i+1] = temp2
 
 
# Step 8: 基因突变
def mutation(pop, pm):
    px = len(pop)
    py = len(pop[0])
    for i in range(px):
        if(random.random() < pm):
            mpoint = random.randint(0, py-1)
            if(pop[i][mpoint] == 1):
                pop[i][mpoint] = 0
            else:
                pop[i][mpoint] = 1
 
 
def writeToFile(var, w_path):
    f=open(w_path,"a+")
    for item in var:
        f.write(str(item) + "\r\n")
    f.close()
 
 
def generAlgo(generations):
    pop = geneEncoding(pop_size, chrom_length)
    print(str(generations)+" start...")
    for i in range(generations):
        # print("第 " + str(i) + " 代开始繁殖......")
        obj_value = cal_obj_value(pop) # 计算目标函数值
        # print(obj_value)
        fit_value = calfitvalue(obj_value) #计算个体的适应值
        # print(fit_value)
        [best_individual, best_fit] = best(pop, fit_value) #选出最好的个体和最好的函数值
        # print("best_individual: "+ str(best_individual))
        v1, v2, v3, v4 = b2d(best_individual)
        results.append([best_fit, v1, v2, v3, v4]) #每次繁殖,将最好的结果记录下来
        # print(str(best_individual) + " " + str(best_fit))
        selection(pop, fit_value) #自然选择,淘汰掉一部分适应性低的个体
        crossover(pop, pc) #交叉繁殖
        mutation(pop, pc) #基因突变
    results.sort()
 
    writeToFile(results, "generation_" + str(generations) + ".txt")
    print(results[-1])
 
 
 
if __name__ == '__main__':
    gen = [100]   #gen = [100, 200, 300, 400, 500]
    for g in gen:
        generAlgo(int(g))
 

 

posted @ 2022-09-23 13:50  cup_leo  阅读(304)  评论(0编辑  收藏  举报