Implementing Machine Learning Algorithms from Scratch: GBDT

 

 

An earlier post walked through a decision tree implementation (https://www.cnblogs.com/bonelee/p/17691555.html). Building on that decision tree code, let's write a GBDT:
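Before the code, a quick recap of the idea (my own summary, not part of the original post): GBDT builds an additive model in a forward stagewise fashion. With squared loss, the negative gradient is exactly the residual, so each new tree is trained on the current residuals and its output is added with a shrinkage factor, the learning rate:

F_0(x): initial fit (the code below simply fits a first regression tree to y)
r_m = y - F_{m-1}(x)                (residual, i.e. the negative gradient of the squared loss)
F_m(x) = F_{m-1}(x) + lr * h_m(x)   (h_m is a regression tree fit to r_m)

The implementation below writes the same update with the sign flipped: each tree is fit to the gradient y_pred - y and its shrunken prediction is subtracted.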

 

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
 
 
# A node of the regression tree: stores the node's MSE, sample count,
# predicted value (mean of y at the node), and the split used to grow children
class TreeNode:
    def __init__(self, mse, num_samples, predicted_value):
        self.mse = mse
        self.num_samples = num_samples
        self.predicted_value = predicted_value
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None
  
# Node impurity for regression: mean squared error of y around its mean
def mse(y):
    if len(y) == 0:
        return 0
    return np.mean((y - np.mean(y)) ** 2)
  
# Recursively grow a CART regression tree up to max_depth
def grow_tree(X, y, depth=0, max_depth=None):
    num_samples = len(y)
    predicted_value = np.mean(y)
    node = TreeNode(
        mse=mse(y),
        num_samples=num_samples,
        predicted_value=predicted_value,
    )
  
    if max_depth is None or depth < max_depth:
        idx, thr = best_split(X, y)
        if idx is not None:
            indices_left = X[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = grow_tree(X_left, y_left, depth + 1, max_depth)
            node.right = grow_tree(X_right, y_right, depth + 1, max_depth)
    return node
  
# Exhaustively search all features and thresholds for the split that
# minimizes the weighted MSE of the two children
def best_split(X, y):
    n_samples, n_features = X.shape
    if n_samples <= 1:
        return None, None
  
    best = {}
    min_mse = float('inf')
  
    for feature_idx in range(n_features):
        thresholds = np.unique(X[:, feature_idx])
        for threshold in thresholds:
            left_mask = X[:, feature_idx] < threshold
            right_mask = ~left_mask
  
            mse_left = mse(y[left_mask])
            mse_right = mse(y[right_mask])
  
            weighted_mse = len(y[left_mask]) / n_samples * mse_left + len(y[right_mask]) / n_samples * mse_right
            if weighted_mse < min_mse:
                best = {
                    'feature_index': feature_idx,
                    'threshold': threshold,
                    'left_values': y[left_mask],
                    'right_values': y[right_mask],
                    'mse': weighted_mse
                }
                min_mse = weighted_mse
  
    if not best:
        return None, None
    return best['feature_index'], best['threshold']
  
  
# Walk the tree for a single sample X and return the leaf's predicted value
def predict_tree(node, X):
    if node.left is None and node.right is None:
        return node.predicted_value
    if X[node.feature_index] < node.threshold:
        return predict_tree(node.left, X)
    else:
        return predict_tree(node.right, X)
  
# Thin fit/predict wrapper around the CART regression tree above
class CARTRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
  
    def fit(self, X, y):
        self.tree_ = grow_tree(X, y, max_depth=self.max_depth)
  
    def predict(self, X):
        return [predict_tree(self.tree_, x) for x in X]
     
 
### Squared loss
class SquareLoss:
    # Squared loss function
    def loss(self, y, y_pred):
        return 0.5 * np.power((y - y_pred), 2)
    # First derivative (gradient) of the squared loss
    def gradient(self, y, y_pred):
        return -(y - y_pred)
 
### GBDT definition
class GBDT(object):
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_gini_impurity, max_depth, regression):
        ### Basic hyperparameters
        # Number of trees
        self.n_estimators = n_estimators
        # Learning rate (shrinkage)
        self.learning_rate = learning_rate
        # Minimum number of samples required to split a node
        self.min_samples_split = min_samples_split # todo: not used by CARTRegressor yet
        # Minimum Gini impurity of a node
        self.min_gini_impurity = min_gini_impurity # todo: not used by CARTRegressor yet
        # Maximum tree depth
        self.max_depth = max_depth
        # Regression tree by default
        self.regression = regression
        # Squared loss
        self.loss = SquareLoss()
        # A classification tree needs its own loss function;
        # it is omitted here, so supply a custom classification loss if needed
        if not self.regression:
            self.loss = None
        # The ensemble: one CART regression tree per boosting round
        self.estimators = []
        for i in range(self.n_estimators):
            self.estimators.append(CARTRegressor(max_depth=self.max_depth))
 
    # Fitting
    def fit(self, X, y):
        # Forward stagewise initialization: the first tree is fit to y directly
        self.estimators[0].fit(X, y)
        # Predictions of the first tree (as an array for the in-place updates below)
        y_pred = np.array(self.estimators[0].predict(X))
        # Forward stagewise iterative training
        for i in range(1, self.n_estimators):
            # Gradient of the squared loss: y_pred - y, i.e. the negative residual
            gradient = self.loss.gradient(y, y_pred)
            # Fit the next tree to the gradient ...
            self.estimators[i].fit(X, gradient)
            # ... and subtract its shrunken prediction, which adds learning_rate * residual
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))
 
    # Prediction
    def predict(self, X):
        # Regression prediction: accumulate the shrunken contributions of all trees
        y_pred = np.array(self.estimators[0].predict(X))
        for i in range(1, self.n_estimators):
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))
        # Classification prediction
        if not self.regression:
            # Turn the raw scores into probabilities with a softmax
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
            # Convert the probabilities into predicted labels
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
 
 
 
### GBDT classification tree
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=300, learning_rate=.5,
                 min_samples_split=2, min_info_gain=1e-6, max_depth=2):
        super(GBDTClassifier,self).__init__(
              n_estimators=n_estimators,
              learning_rate=learning_rate,
              min_samples_split=min_samples_split,
              min_gini_impurity=min_info_gain,
              max_depth=max_depth,
              regression=False)
         
 
 
### GBDT regression tree
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=300, learning_rate=0.1, min_samples_split=2,
                 min_var_reduction=1e-6, max_depth=3):
        super(GBDTRegressor, self).__init__(
              n_estimators=n_estimators,
              learning_rate=learning_rate,
              min_samples_split=min_samples_split,
              min_gini_impurity=min_var_reduction,
              max_depth=max_depth,
              regression=True)
         
 
### GBRT regression example
# Import the datasets module
from sklearn import datasets
# Load the iris dataset
iris = datasets.load_iris()
# Load the data
# X = iris.data
# Use only the first two features
X = iris.data[:, :2]
y = iris.target
 
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a GBRT instance
model = GBDTRegressor()
# Train the model
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Mean squared error of the predictions (new name, to avoid shadowing the mse() helper above)
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error of NumPy GBRT:", mse_value)
 
 
 
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Create the model instance
reg = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1,
                                max_depth=3, random_state=0)
# Fit the model
reg.fit(X_train, y_train)
# Predict on the test set
y_pred = reg.predict(X_test)
# Mean squared error of the predictions
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error of sklearn GBDT:", mse_value)

 

Output:

Mean Squared Error of NumPy GBRT: 0.24127739600520248
Mean Squared Error of sklearn GBDT: 0.20883609477073248
 
As you can see, our implementation comes fairly close to sklearn's.
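As a quick sanity check (a minimal sketch, not from the original post; it reuses the model, reg, X_test and y_test objects created above), you can also print the two models' predictions side by side on a few test samples:

import numpy as np

# Compare the hand-rolled GBDT with sklearn's on the first few test rows
ours = np.asarray(model.predict(X_test[:5]))
theirs = reg.predict(X_test[:5])
for i, (a, b, t) in enumerate(zip(ours, theirs, y_test[:5])):
    print(f"sample {i}: ours={a:.3f}  sklearn={b:.3f}  true={t}")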

  

 

Note that the model also uses a learning rate: each tree's contribution is shrunk by learning_rate before being added to the ensemble.

In essence, the boosting iterations are a process of continually reducing the residuals!
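To make the residual-reduction point concrete, here is a tiny self-contained sketch (my own illustration, not from the original post). If each round's tree reproduced the current residual exactly, the update F <- F + lr * residual would shrink the residual by a factor of (1 - lr) every round, so it decays geometrically toward zero; the GBDT above does the same thing, only with an imperfect tree in place of the exact residual and with the sign flipped (it fits the gradient and subtracts):

import numpy as np

y = np.array([3.0, -1.0, 2.0, 0.5])   # toy targets
F = np.full_like(y, y.mean())         # initial prediction: the mean of y
lr = 0.5                              # learning rate (shrinkage)

for m in range(5):
    residual = y - F                  # negative gradient of the squared loss
    h = residual                      # idealized tree that fits the residual perfectly
    F = F + lr * h                    # forward stagewise update
    print(f"round {m}: mean |residual| = {np.abs(y - F).mean():.4f}")
# each round multiplies the remaining residual by (1 - lr) = 0.5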

 
