lightgbm
1.
# coding=utf-8
"""Grid-search a LightGBM regressor, plot diagnostics and export predictions.

Workflow, in execution order:

1. Read the workbook at ``DATA_PATH``; the first ``N_FEATURES`` columns are
   the features and the next column is the regression target.
2. Hold out 20% of the rows and tune ``LGBMRegressor`` with a 10-fold
   ``GridSearchCV`` scored by negative mean squared error.
3. Show residual, truth-vs-prediction, PCA, feature-importance and SHAP plots.
4. Persist the best estimator with joblib, reload it, predict every row of the
   workbook and write the predictions back as the ``PREDICTION_COLUMN`` column.
5. Print the best parameters, test/train metrics and the grid-search duration.
"""
import re
import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import shap
from lightgbm import LGBMRegressor
from lightgbm import plot_importance
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

DATA_PATH = r"E:\Desktop\data.xlsx"
MODEL_PATH = 'best_lgb5.pkl'
# Columns 0..12 are the features; column 13 is the target.
N_FEATURES = 13
# Fixed label for the exported predictions. The previous label was derived
# from the current column count, so each re-run appended one more column to
# the workbook instead of refreshing the existing predictions.
PREDICTION_COLUMN = "y_pred"


def plot_residual_histogram(y_true, y_hat, title):
    """Show a 20-bin histogram with a KDE curve of ``y_true - y_hat``."""
    seaborn.histplot(y_true - y_hat, bins=20, color="orange", kde=True)
    plt.xlabel("Residuals")
    plt.ylabel("Count")
    plt.title(title)
    plt.show()


def plot_residual_scatter(y_true, y_hat, title):
    """Scatter the residuals ``y_true - y_hat`` against the true values."""
    plt.scatter(y_true, y_true - y_hat)
    # Zero line spanning the observed range of true values.
    plt.hlines(y=0, xmin=y_true.min(), xmax=y_true.max())
    plt.title(title)
    plt.xlabel('True value')
    plt.ylabel('Residual')
    plt.show()


def regression_metrics(y_true, y_hat):
    """Return MSE, RMSE, MAE, R2 and the Pearson correlation as a dict."""
    mse = mean_squared_error(y_true, y_hat)
    return {
        "mse": mse,
        "rmse": np.sqrt(mse),
        "mae": mean_absolute_error(y_true, y_hat),
        "r2": r2_score(y_true, y_hat),
        "pcc": np.corrcoef(y_true, y_hat)[0, 1],
    }


data = pd.read_excel(DATA_PATH)
X = data.iloc[:, 0:N_FEATURES]  # columns 0-12 are the features
y = data.iloc[:, N_FEATURES]  # column 13 is the target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

lgb = LGBMRegressor(random_state=0, device="gpu", boosting="gbdt")
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0 (its default is 0), so `subsample=0.8` is currently
# a no-op. Left unchanged here because enabling it alters the fitted model.
param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'num_leaves': [15, 31],
    'n_jobs': [-1],
    'device': ["gpu"],
}
grid = GridSearchCV(lgb, param_grid, cv=10, scoring="neg_mean_squared_error")

# Time only the grid search. The end timestamp used to be taken at the very
# end of the script, so the reported "training time" also counted every
# blocking plot window, the SHAP computation and the Excel round trip.
start = time.time()
grid.fit(X_train, y_train)
end = time.time()

best_lgb = grid.best_estimator_
y_pred = best_lgb.predict(X_test)

# Test-set residual distribution and residual plot.
plot_residual_histogram(y_test, y_pred, "y_test Residuals Distribution")
plot_residual_scatter(y_test, y_pred, 'Residual plot for y_pred')

test_metrics = regression_metrics(y_test, y_pred)

# Truth vs. prediction scatter plot.
plt.scatter(y_test, y_pred, c="blue")
plt.xlabel("Truth")
plt.ylabel("predict")
plt.title("Truth vs predict")
plt.show()

# Two-component PCA projection of all feature rows, coloured by the target.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="rainbow")
plt.xlabel("1st_PCA")
plt.ylabel("2rd_PCA")
plt.title("PCA result")
plt.colorbar()
plt.show()

y_train_pred = best_lgb.predict(X_train)

# Training-set residual distribution and residual plot.
plot_residual_histogram(y_train, y_train_pred, "y_train Residuals Distribution")
plot_residual_scatter(y_train, y_train_pred, 'Residual plot for y_train_pred')

# LightGBM's built-in importance plot (max_num_features can limit how many
# features are shown).
plot_importance(best_lgb)
plt.show()

# Bar chart of the importance score of every feature.
feature_names = X.columns
feature_importances = best_lgb.feature_importances_
plt.bar(feature_names, feature_importances)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature importance")
plt.show()

train_metrics = regression_metrics(y_train, y_train_pred)

# SHAP values for every feature, summarised as a bar chart.
explainer = shap.TreeExplainer(best_lgb)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")

# Persist the tuned model, then reload it from disk for the export step.
joblib.dump(best_lgb, MODEL_PATH)
model = joblib.load(MODEL_PATH)

# Predict every row in one vectorised call (instead of one predict() per row)
# and store the result without rebinding the target variable `y`.
# NOTE: this overwrites the input workbook in place, as the script always did.
data = pd.read_excel(DATA_PATH, header=0)
data[PREDICTION_COLUMN] = model.predict(data.iloc[:, :N_FEATURES])
data.to_excel(DATA_PATH, index=False)

print("best_params:", grid.best_params_)
for metric_name in ("mse", "rmse", "mae", "r2", "pcc"):
    print(f"{metric_name}:", test_metrics[metric_name])
for metric_name in ("mae", "mse", "rmse", "r2", "pcc"):
    print(f"{metric_name}_train:", train_metrics[metric_name])

# Duration of the grid search only.
print("Training time: {:.2f} seconds".format(end - start))