Machine Learning Algorithm Principles and Implementation: GBDT
A previous post covered the implementation of decision trees. Building on that decision tree implementation (https://www.cnblogs.com/bonelee/p/17691555.html), let's write a GBDT:
```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


class TreeNode:
    def __init__(self, mse, num_samples, predicted_value):
        self.mse = mse
        self.num_samples = num_samples
        self.predicted_value = predicted_value
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


def mse(y):
    if len(y) == 0:
        return 0
    return np.mean((y - np.mean(y)) ** 2)


def grow_tree(X, y, depth=0, max_depth=None):
    num_samples = len(y)
    predicted_value = np.mean(y)
    node = TreeNode(
        mse=mse(y),
        num_samples=num_samples,
        predicted_value=predicted_value,
    )
    # max_depth=None means "grow until no further split is possible"
    if max_depth is None or depth < max_depth:
        idx, thr = best_split(X, y)
        if idx is not None:
            indices_left = X[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = grow_tree(X_left, y_left, depth + 1, max_depth)
            node.right = grow_tree(X_right, y_right, depth + 1, max_depth)
    return node


def best_split(X, y):
    n_samples, n_features = X.shape
    if n_samples <= 1:
        return None, None
    best = {}
    min_mse = float('inf')
    for feature_idx in range(n_features):
        thresholds = np.unique(X[:, feature_idx])
        for threshold in thresholds:
            left_mask = X[:, feature_idx] < threshold
            right_mask = ~left_mask
            # Skip degenerate splits that leave one side empty
            if not np.any(left_mask) or not np.any(right_mask):
                continue
            mse_left = mse(y[left_mask])
            mse_right = mse(y[right_mask])
            weighted_mse = (len(y[left_mask]) / n_samples * mse_left
                            + len(y[right_mask]) / n_samples * mse_right)
            if weighted_mse < min_mse:
                best = {
                    'feature_index': feature_idx,
                    'threshold': threshold,
                    'left_values': y[left_mask],
                    'right_values': y[right_mask],
                    'mse': weighted_mse
                }
                min_mse = weighted_mse
    if not best:
        return None, None
    return best['feature_index'], best['threshold']


def predict_tree(node, x):
    if node.left is None and node.right is None:
        return node.predicted_value
    if x[node.feature_index] < node.threshold:
        return predict_tree(node.left, x)
    else:
        return predict_tree(node.right, x)


class CARTRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree_ = grow_tree(X, y, max_depth=self.max_depth)

    def predict(self, X):
        # Return an array so the boosting code can do vectorized arithmetic
        return np.array([predict_tree(self.tree_, x) for x in X])


### Squared loss
class SquareLoss:
    # Squared loss function
    def loss(self, y, y_pred):
        return 0.5 * np.power((y - y_pred), 2)

    # First derivative of the squared loss (the negative residual)
    def gradient(self, y, y_pred):
        return -(y - y_pred)


### GBDT definition
class GBDT(object):
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_gini_impurity, max_depth, regression):
        ### Basic hyperparameters
        # Number of trees
        self.n_estimators = n_estimators
        # Learning rate
        self.learning_rate = learning_rate
        # Minimum number of samples required to split a node
        self.min_samples_split = min_samples_split  # TODO: not yet used by the tree
        # Minimum Gini impurity of a node
        self.min_gini_impurity = min_gini_impurity  # TODO: not yet used by the tree
        # Maximum depth
        self.max_depth = max_depth
        # Regression tree by default
        self.regression = regression
        # Use the squared loss
        self.loss = SquareLoss()
        # A classification tree needs a classification loss function;
        # omitted here, define your own if you need it
        if not self.regression:
            self.loss = None
        # Stack of base trees
        self.estimators = []
        for i in range(self.n_estimators):
            self.estimators.append(CARTRegressor(max_depth=self.max_depth))

    # Fitting
    def fit(self, X, y):
        # Forward stagewise initialization: the first tree fits the raw targets
        self.estimators[0].fit(X, y)
        # Predictions of the first tree
        y_pred = self.estimators[0].predict(X)
        # Forward stagewise iterative training
        for i in range(1, self.n_estimators):
            gradient = self.loss.gradient(y, y_pred)
            self.estimators[i].fit(X, gradient)
            # gradient = -(y - y_pred), so subtracting moves y_pred toward y
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))

    # Prediction
    def predict(self, X):
        # Regression prediction
        y_pred = self.estimators[0].predict(X)
        for i in range(1, self.n_estimators):
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))
        # Classification prediction
        if not self.regression:
            # Turn the raw scores into probabilities
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
            # Turn the probabilities into class labels
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred


### GBDT classification tree
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=300, learning_rate=.5,
                 min_samples_split=2, min_info_gain=1e-6, max_depth=2):
        super(GBDTClassifier, self).__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_gini_impurity=min_info_gain,
            max_depth=max_depth,
            regression=False)


### GBDT regression tree
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=300, learning_rate=0.1,
                 min_samples_split=2, min_var_reduction=1e-6, max_depth=3):
        super(GBDTRegressor, self).__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_gini_impurity=min_var_reduction,
            max_depth=max_depth,
            regression=True)


### GBRT regression demo
# Import the datasets module
from sklearn import datasets

# Load the iris dataset
iris = datasets.load_iris()
# X = iris.data
# Use only the first two features
X = iris.data[:, :2]
y = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a GBRT instance
model = GBDTRegressor()
# Train the model
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Mean squared error of the predictions
# (named mse_value to avoid shadowing the mse() helper above)
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error of NumPy GBRT:", mse_value)

# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Create a model instance
reg = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=0)
# Fit the model
reg.fit(X_train, y_train)
# Predict
y_pred = reg.predict(X_test)
# Mean squared error of the predictions
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error of sklearn GBDT:", mse_value)
```
Output:
Mean Squared Error of NumPy GBRT: 0.24127739600520248
Mean Squared Error of sklearn GBDT: 0.20883609477073248
As you can see, the result of our implementation is fairly close to sklearn's.
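The two numbers will not match exactly; among other differences, sklearn initializes the ensemble with a constant prediction rather than a full first tree, and its trees pick split points differently. One way to see how the sklearn ensemble converges as trees are added is staged_predict, which yields the ensemble's predictions after each boosting stage. A short sketch, assuming the comparison script above has just been run so that reg, X_test, y_test, np, and mean_squared_error are still in scope:

```python
# Test MSE after each boosting stage of the fitted sklearn model `reg`
# (assumes the comparison script above has just been run).
stage_mse = [mean_squared_error(y_test, y_stage)
             for y_stage in reg.staged_predict(X_test)]
print("best number of trees:", int(np.argmin(stage_mse)) + 1,
      "with test MSE:", min(stage_mse))
```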
Note that a learning rate is at work here as well: in essence, each boosting iteration is a step in a process that continually shrinks the residuals!
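To make that concrete, here is a minimal sketch, assuming the CARTRegressor and SquareLoss classes defined above are in scope, that replays the forward stagewise loop by hand on toy data and logs the training residual sum of squares after each round. For the squared loss this quantity can only go down (for a learning rate below 2), since each tree is fit to the negative residual and a learning_rate fraction of its prediction is folded back in:

```python
# Replay GBDT's forward stagewise loop by hand and watch the residuals shrink.
# Assumes CARTRegressor and SquareLoss from the implementation above; the toy
# data and hyperparameters here are illustrative, not from the original post.
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.uniform(0, 10, size=(80, 1))
y_toy = np.sin(X_toy[:, 0]) + rng.normal(0, 0.1, size=80)

loss, lr = SquareLoss(), 0.1
trees = [CARTRegressor(max_depth=3) for _ in range(10)]

trees[0].fit(X_toy, y_toy)            # the first tree fits the raw targets
y_pred = trees[0].predict(X_toy)
for i, tree in enumerate(trees[1:], start=1):
    tree.fit(X_toy, loss.gradient(y_toy, y_pred))  # fit the negative residual
    y_pred -= lr * tree.predict(X_toy)             # fold a fraction back in
    print(f"round {i:2d}  residual SS = {np.sum((y_toy - y_pred) ** 2):.4f}")
```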
Tags: machine learning