lightgbm
1.
# coding=utf-8
"""Train, evaluate and explain a LightGBM regressor on an Excel data set.

Workflow:
  1. Read the spreadsheet; the first ``N_FEATURES`` columns are the features
     and the following column is the regression target.
  2. Grid-search an ``LGBMRegressor`` with 10-fold cross-validation.
  3. Plot residuals, truth-vs-prediction, a 2-component PCA projection and
     feature importances; compute MSE / RMSE / MAE / R2 / Pearson r on both
     the held-out test split and the training split.
  4. Plot a SHAP summary, persist the best model with joblib, reload it,
     predict every row of the spreadsheet and write the predictions back
     to the same file as an extra column.
"""
import re
import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import shap
from lightgbm import LGBMRegressor
from lightgbm import plot_importance
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

# Spreadsheet that is both the training source and the prediction target.
DATA_PATH = r"E:\Desktop\data.xlsx"
# File the best estimator is pickled to (and reloaded from).
MODEL_PATH = 'best_lgb5.pkl'
# Columns 0 .. N_FEATURES-1 are features; column N_FEATURES is the target.
N_FEATURES = 13


def regression_metrics(y_true, y_hat):
    """Return ``(mse, rmse, mae, r2, pcc)`` for the given truth/prediction pair.

    ``pcc`` is the Pearson correlation coefficient taken from the
    off-diagonal entry of ``np.corrcoef``.
    """
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    pcc = np.corrcoef(y_true, y_hat)[0, 1]
    return mse, rmse, mae, r2, pcc


def plot_residual_histogram(residuals, title):
    """Show a 20-bin histogram (with KDE overlay) of ``residuals``."""
    seaborn.histplot(residuals, bins=20, color="orange", kde=True)
    plt.xlabel("Residuals")
    plt.ylabel("Count")
    plt.title(title)
    plt.show()


def plot_residuals_vs_truth(y_true, residuals, title):
    """Show residuals scattered against the true values with a zero line."""
    plt.scatter(y_true, residuals)
    plt.hlines(y=0, xmin=y_true.min(), xmax=y_true.max())
    plt.title(title)
    plt.xlabel('True value')
    plt.ylabel('Residual')
    plt.show()


def main():
    """Run the full train / evaluate / explain / export pipeline."""
    data = pd.read_excel(DATA_PATH)
    X = data.iloc[:, 0:N_FEATURES]  # feature columns
    y = data.iloc[:, N_FEATURES]    # target column
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )

    # NOTE: device="gpu" needs a GPU-enabled LightGBM build.
    lgb = LGBMRegressor(random_state=0, device="gpu", boosting="gbdt")
    param_grid = {
        'n_estimators': [100],
        'max_depth': [5],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'num_leaves': [15, 31],
        'n_jobs': [-1],
        'device': ["gpu"],
    }
    grid = GridSearchCV(lgb, param_grid, cv=10, scoring="neg_mean_squared_error")

    # Time only the grid search itself: the figures shown later block until
    # their windows are closed, so timing the whole script would not measure
    # training.
    start = time.perf_counter()
    grid.fit(X_train, y_train)
    training_seconds = time.perf_counter() - start

    best_lgb = grid.best_estimator_
    y_pred = best_lgb.predict(X_test)

    # Test-split residual diagnostics.
    residual_test = y_test - y_pred
    plot_residual_histogram(residual_test, "y_test Residuals Distribution")
    plot_residuals_vs_truth(y_test, residual_test, 'Residual plot for y_pred')

    mse, rmse, mae, r2, pcc = regression_metrics(y_test, y_pred)

    # Truth vs. prediction scatter plot.
    plt.scatter(y_test, y_pred, c="blue")
    plt.xlabel("Truth")
    plt.ylabel("predict")
    plt.title("Truth vs predict")
    plt.show()

    # Two-component PCA projection of the features, coloured by the target.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="rainbow")
    plt.xlabel("1st_PCA")
    plt.ylabel("2rd_PCA")
    plt.title("PCA result")
    plt.colorbar()
    plt.show()

    # Training-split residual diagnostics.
    y_train_pred = best_lgb.predict(X_train)
    residual_train = y_train - y_train_pred
    plot_residual_histogram(residual_train, "y_train Residuals Distribution")
    plot_residuals_vs_truth(
        y_train, residual_train, 'Residual plot for y_train_pred'
    )

    # LightGBM's built-in feature-importance plot.
    plot_importance(best_lgb)
    plt.show()

    # Bar chart of the importance score of every feature.
    feature_names = X.columns
    feature_importances = best_lgb.feature_importances_
    plt.bar(feature_names, feature_importances)
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.title("Feature importance")
    plt.show()

    mse_train, rmse_train, mae_train, r2_train, pcc_train = regression_metrics(
        y_train, y_train_pred
    )

    # SHAP values of every feature, shown as a bar summary plot.
    explainer = shap.TreeExplainer(best_lgb)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X, plot_type="bar")

    # Persist the best model, then reload it from disk for the export step.
    joblib.dump(best_lgb, MODEL_PATH)
    model = joblib.load(MODEL_PATH)

    data = pd.read_excel(DATA_PATH, header=0)
    cols = data.shape[1]
    # Predict all rows in one call instead of one predict() per row; this
    # keeps the per-column dtypes and feature names, and no longer rebinds
    # the target variable. The new column keeps the integer label cols + 1.
    data[cols + 1] = model.predict(data.iloc[:, :N_FEATURES])
    # Save the data (with the prediction column) back to the Excel file.
    data.to_excel(DATA_PATH, index=False)

    print("best_params:", grid.best_params_)
    print("mse:", mse)
    print("rmse:", rmse)
    print("mae:", mae)
    print("r2:", r2)
    print("pcc:", pcc)
    print("mae_train:", mae_train)
    print("mse_train:", mse_train)
    print("rmse_train:", rmse_train)
    print("r2_train:", r2_train)
    print("pcc_train:", pcc_train)
    # Report the grid-search (training) time measured above.
    print("Training time: {:.2f} seconds".format(training_seconds))


if __name__ == "__main__":
    main()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· winform 绘制太阳,地球,月球 运作规律
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
2018-11-18 Discovery studio 添加Database