lightgbm

# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
import re
from sklearn.decomposition import PCA
import joblib
import shap
import time
from lightgbm import plot_importance
import seaborn
import warnings
warnings.filterwarnings("ignore")
 
#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
 
data = pd.read_excel(r"E:\Desktop\data.xlsx")
X = data.iloc[:, 0:13]  # columns 0-12 as features
y = data.iloc[:, 13]  # column 13 as the target
 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# device="gpu" requires a GPU-enabled LightGBM build; drop it to run on the CPU
lgb = LGBMRegressor(random_state=0, device="gpu", boosting_type="gbdt", n_jobs=-1)
param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'num_leaves': [15, 31],
}
grid = GridSearchCV(lgb, param_grid, cv=10, scoring="neg_mean_squared_error")
 
start = time.time()
grid.fit(X_train, y_train)
best_lgb = grid.best_estimator_
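# Optional: inspect the full cross-validation table to compare candidates,
# not just the single winner. A minimal sketch; the cv_results_ keys used here
# are standard scikit-learn GridSearchCV output:
cv_df = pd.DataFrame(grid.cv_results_)[["params", "mean_test_score", "std_test_score"]]
print(cv_df.sort_values("mean_test_score", ascending=False))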
y_pred = best_lgb.predict(X_test)
 
# Residual distribution for y_pred
residuals = y_test - y_pred
seaborn.histplot(residuals, bins=20, color="orange", kde=True)
plt.xlabel("Residuals")
plt.ylabel("Count")
plt.title("y_test Residuals Distribution")
plt.show()
 
 
# Residuals vs. true values for y_pred
residual_test = y_test - y_pred
plt.scatter(y_test, residual_test)
plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
plt.title('Residual plot for y_pred')
plt.xlabel('True value')
plt.ylabel('Residual')
plt.show()
 
 
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
pcc = np.corrcoef(y_test, y_pred)[0, 1]
 
# Scatter plot: truth vs. prediction
plt.scatter(y_test, y_pred, c="blue")
plt.xlabel("Truth")
plt.ylabel("Predicted")
plt.title("Truth vs. Predicted")
plt.show()
 
# PCA projection of the features, colored by the target value
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="rainbow")
plt.xlabel("1st_PCA")
plt.ylabel("2rd_PCA")
plt.title("PCA result")
plt.colorbar()
plt.show()
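# Optional: report how much variance the two components actually capture
# (explained_variance_ratio_ is standard scikit-learn PCA API):
print("Explained variance ratio:", pca.explained_variance_ratio_)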
 
y_train_pred = best_lgb.predict(X_train)
 
# Residual distribution for y_train_pred
residuals = y_train - y_train_pred
seaborn.histplot(residuals, bins=20, color="orange", kde=True)
plt.xlabel("Residuals")
plt.ylabel("Count")
plt.title("y_train Residuals Distribution")
plt.show()
 
# Residuals vs. true values for y_train_pred
residual_train = y_train - y_train_pred
plt.scatter(y_train, residual_train)
plt.hlines(y=0, xmin=y_train.min(), xmax=y_train.max())
plt.title('Residual plot for y_train_pred')
plt.xlabel('True value')
plt.ylabel('Residual')
plt.show()
 
 
# Plot feature importance; max_num_features controls how many features are shown
plot_importance(best_lgb)
plt.show()
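# Optional: limit the chart to the most important features, e.g. the top ten
# (max_num_features is a documented lightgbm.plot_importance parameter):
# plot_importance(best_lgb, max_num_features=10)
# plt.show()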
 
# Bar chart of each feature's importance score
feature_names = X.columns  # feature names
feature_importances = best_lgb.feature_importances_  # importance scores
plt.bar(feature_names, feature_importances)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature importance")
plt.show()
 
mae_train = mean_absolute_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)
pcc_train = np.corrcoef(y_train, y_train_pred)[0, 1]
 
# Compute per-feature SHAP values with the shap library
explainer = shap.TreeExplainer(best_lgb)  # build the explainer
shap_values = explainer.shap_values(X)  # SHAP values for every sample
# Summary bar chart of mean |SHAP| per feature
shap.summary_plot(shap_values, X, plot_type="bar")
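# Optional: the default beeswarm summary plot also shows the direction of each
# feature's effect, not just its magnitude (standard shap API):
shap.summary_plot(shap_values, X)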
 
joblib.dump(best_lgb, 'best_lgb5.pkl')
 
# Reload the saved best_lgb5.pkl model
model = joblib.load('best_lgb5.pkl')
data = pd.read_excel(r"E:\Desktop\data.xlsx",header=0)
# Number of rows and columns in the sheet
rows, cols = data.shape

# Feed each row's features to the model and write the prediction to a new column
for i, row in data.iterrows():
    # features as a 2-D array of shape (1, 13)
    x = row.iloc[:13].values.reshape(1, -1)
    # predicted y as a scalar
    y = model.predict(x)[0]
    # write the prediction into a new "y_pred" column
    data.loc[i, "y_pred"] = y
# Save back to the Excel file
data.to_excel(r"E:\Desktop\data.xlsx", index=False)
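# Note: the row-by-row loop above is easy to follow but slow on large files;
# an equivalent vectorized sketch (assuming the same 13 feature columns):
# data["y_pred"] = model.predict(data.iloc[:, 0:13])
# data.to_excel(r"E:\Desktop\data.xlsx", index=False)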
 
print("best_params:", grid.best_params_)
print("mse:", mse)
print("rmse:", rmse)
print("mae:", mae)
print("r2:", r2)
print("pcc:", pcc)
print("mae_train:", mae_train)
print("mse_train:", mse_train)
print("rmse_train:", rmse_train)
print("r2_train:", r2_train)
print("pcc_train:", pcc_train)
# Record the end time
end = time.time()
# Total elapsed time (grid search, plotting, and batch prediction)
print("Total time: {:.2f} seconds".format(end - start))

  
