Implementing Machine Learning Algorithms: Random Forest (the core is row and column sampling, and the trees can be trained in parallel)
This builds on an earlier post in the series: Implementing Machine Learning Algorithms: CART Decision Trees (Classification & Regression).
Random forest training steps:

1. Row sampling (bagging): for each of the n_estimators trees, draw n_samples rows from the training set with replacement (a bootstrap sample).
2. Column sampling: for each tree, draw a random subset of max_features feature columns (by default the square root of the number of features).
3. Train an independent CART decision tree on each doubly-sampled subset, choosing splits by the weighted Gini index.
4. At prediction time, aggregate the trees' outputs by majority vote.
Code implementation (the decision tree reuses the depth-limited CART from the earlier post):
```python
# Import numpy and the accuracy metric
import numpy as np
from sklearn.metrics import accuracy_score


class TreeNode:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


def gini(y):
    # Gini impurity: 1 - sum of squared class proportions
    m = len(y)
    return 1.0 - sum([(np.sum(y == c) / m) ** 2 for c in np.unique(y)])


def grow_tree(X, y, depth=0, max_depth=None):
    classes = np.unique(y)
    num_samples_per_class = [np.sum(y == c) for c in classes]
    predicted_class = classes[np.argmax(num_samples_per_class)]
    node = TreeNode(
        gini=gini(y),
        num_samples=len(y),
        num_samples_per_class=num_samples_per_class,
        predicted_class=predicted_class,
    )
    # Treat max_depth=None as "no depth limit"
    if max_depth is None or depth < max_depth:
        idx, thr = best_split(X, y)
        if idx is not None:
            indices_left = X[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = grow_tree(X_left, y_left, depth + 1, max_depth)
            node.right = grow_tree(X_right, y_right, depth + 1, max_depth)
    return node


def best_split(X, y):
    """NumPy implementation of best_split; see best_split2 below for a plain-Python version."""
    n_samples, n_features = X.shape
    # A pure node cannot be split further
    if len(np.unique(y)) == 1:
        return None, None

    best = {}
    min_gini = float('inf')

    for feature_idx in range(n_features):
        thresholds = np.unique(X[:, feature_idx])
        for threshold in thresholds:
            left_mask = X[:, feature_idx] < threshold
            right_mask = ~left_mask
            gini_left = gini(y[left_mask])
            gini_right = gini(y[right_mask])
            weighted_gini = (len(y[left_mask]) / n_samples * gini_left
                             + len(y[right_mask]) / n_samples * gini_right)
            if weighted_gini < min_gini:
                best = {
                    'feature_index': feature_idx,
                    'threshold': threshold,
                    'left_labels': y[left_mask],
                    'right_labels': y[right_mask],
                    'gini': weighted_gini
                }
                min_gini = weighted_gini

    return best['feature_index'], best['threshold']


def best_split2(X, y):
    """Plain-Python implementation of best_split (no numpy)."""
    n_samples, n_features = len(X), len(X[0])
    # If the samples carry only one label, there is nothing to split
    if len(set(y)) == 1:
        return None, None

    # Track the best split found so far
    best = {}
    min_gini = float('inf')

    # Iterate over every feature
    for feature_idx in range(n_features):
        # Collect and sort the unique values of the current feature
        unique_values = sorted(set(row[feature_idx] for row in X))
        # Consider each unique value as a candidate threshold
        for value in unique_values:
            left_y, right_y = [], []
            # Route each sample left or right by comparing to the threshold
            for i, row in enumerate(X):
                if row[feature_idx] < value:
                    left_y.append(y[i])
                else:
                    right_y.append(y[i])
            # Gini index of each subset (an empty subset gets zero weight below)
            gini_left = 1.0 - sum([(left_y.count(label) / len(left_y)) ** 2
                                   for label in set(left_y)])
            gini_right = 1.0 - sum([(right_y.count(label) / len(right_y)) ** 2
                                    for label in set(right_y)])
            # Weighted Gini index
            weighted_gini = (len(left_y) / len(y) * gini_left
                             + len(right_y) / len(y) * gini_right)
            # Update the best split if this one is better
            if weighted_gini < min_gini:
                best = {
                    'feature_index': feature_idx,
                    'threshold': value,
                    'left_labels': left_y,
                    'right_labels': right_y,
                    'gini': weighted_gini
                }
                min_gini = weighted_gini

    return best['feature_index'], best['threshold']


def predict_tree(node, X):
    if node.left is None and node.right is None:
        return node.predicted_class
    if X[node.feature_index] < node.threshold:
        return predict_tree(node.left, X)
    else:
        return predict_tree(node.right, X)


class CARTClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree_ = grow_tree(X, y, max_depth=self.max_depth)

    def predict(self, X):
        return [predict_tree(self.tree_, x) for x in X]


### Random forest class
class RandomForest:
    def __init__(self, n_estimators=100, max_depth=float("inf"), max_features=None):
        # Number of trees
        self.n_estimators = n_estimators
        # Maximum tree depth
        self.max_depth = max_depth
        # Maximum number of features used per tree
        self.max_features = max_features
        self.trees = []
        # Build the forest from decision trees
        for _ in range(self.n_estimators):
            tree = CARTClassifier(max_depth=self.max_depth)
            self.trees.append(tree)

    # Bootstrap sampling
    def bootstrap_sampling(self, X, y):
        X_y = np.concatenate([X, y.reshape(-1, 1)], axis=1)
        np.random.shuffle(X_y)
        n_samples = X.shape[0]
        sampling_subsets = []
        for _ in range(self.n_estimators):
            # First source of randomness: row sampling — draw n_samples
            # indices from 0..n_samples-1 with replacement
            idx1 = np.random.choice(n_samples, n_samples, replace=True)
            bootstrap_Xy = X_y[idx1, :]
            bootstrap_X = bootstrap_Xy[:, :-1]  # last column is the label, the rest is data
            bootstrap_y = bootstrap_Xy[:, -1]   # label
            sampling_subsets.append([bootstrap_X, bootstrap_y])
        return sampling_subsets

    # Random forest training
    def fit(self, X, y):
        # Draw a doubly-random subsample for every tree in the forest
        sub_sets = self.bootstrap_sampling(X, y)
        n_features = X.shape[1]
        # Default max_features to sqrt(n_features)
        if self.max_features is None:
            self.max_features = int(np.sqrt(n_features))
        for i in range(self.n_estimators):
            # Second source of randomness: column sampling, without
            # replacement so a tree never sees a duplicated feature
            sub_X, sub_y = sub_sets[i]
            idx2 = np.random.choice(n_features, self.max_features, replace=False)
            sub_X = sub_X[:, idx2]
            self.trees[i].fit(sub_X, sub_y)
            # Remember the sampled column indices so prediction can reuse them
            self.trees[i].feature_indices = idx2
            print('Tree {} trained...'.format(i + 1))

    # Random forest prediction
    def predict(self, X):
        # Collect the prediction of every tree
        y_preds = []
        for i in range(self.n_estimators):
            idx = self.trees[i].feature_indices
            sub_X = X[:, idx]
            y_pred = self.trees[i].predict(sub_X)
            y_preds.append(y_pred)
        # Aggregate the per-tree predictions: one row per sample
        y_preds = np.array(y_preds).T
        res = []
        # Majority vote per sample
        for j in y_preds:
            res.append(np.bincount(j.astype('int')).argmax())
        return res


# Imports for the experiment
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=20, n_redundant=0,
                           n_informative=2, random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Create a random forest instance
rf = RandomForest(n_estimators=10, max_features=15)
# Train
rf.fit(X_train, y_train)
# Predict
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
# Print classification accuracy
print("Accuracy of NumPy Random Forest:", acc)

# sklearn's random forest classifier for comparison
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy of sklearn Random Forest:", acc)
```
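As the title says, the trees can be trained in parallel: each tree fits its own bootstrap subset with no dependence on the others. Here is a minimal sketch of how the fit loop above could be parallelized; joblib and the helper names `parallel_fit` / `_fit_one` are my additions, not part of the implementation above:

```python
from joblib import Parallel, delayed

def _fit_one(tree, sub_X, sub_y):
    # Fit a single tree on its pre-drawn bootstrap subset and return it
    tree.fit(sub_X, sub_y)
    return tree

def parallel_fit(forest, X, y, n_jobs=-1):
    # Same double sampling as RandomForest.fit, but the trees train concurrently
    sub_sets = forest.bootstrap_sampling(X, y)
    n_features = X.shape[1]
    if forest.max_features is None:
        forest.max_features = int(np.sqrt(n_features))
    # Draw the column subsets up front so each worker gets a fixed view
    feature_indices = [np.random.choice(n_features, forest.max_features, replace=False)
                       for _ in range(forest.n_estimators)]
    forest.trees = Parallel(n_jobs=n_jobs)(
        delayed(_fit_one)(forest.trees[i],
                          sub_sets[i][0][:, feature_indices[i]],
                          sub_sets[i][1])
        for i in range(forest.n_estimators)
    )
    # Re-attach the sampled column indices for prediction
    for tree, idx in zip(forest.trees, feature_indices):
        tree.feature_indices = idx
```

Calling `parallel_fit(rf, X_train, y_train)` would replace `rf.fit(X_train, y_train)`; sklearn's RandomForestClassifier exposes the same idea through its `n_jobs` parameter.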
Output:
Tree 1 trained...
Tree 2 trained...
Tree 3 trained...
Tree 4 trained...
Tree 5 trained...
Tree 6 trained...
Tree 7 trained...
Tree 8 trained...
Tree 9 trained...
Tree 10 trained...
Accuracy of NumPy Random Forest: 0.7666666666666667
Accuracy of sklearn Random Forest: 0.7933333333333333
The accuracy still differs somewhat from the sklearn implementation. That is expected: the sklearn run above uses its default 100 trees but is capped at max_depth=3, while our forest uses only 10 trees, and sklearn additionally resamples its candidate features at every split rather than once per tree.
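For a fairer comparison, one could configure sklearn to roughly mirror the NumPy forest's settings; a minimal sketch (the exact accuracy will still vary from run to run because the sampling is random):

```python
from sklearn.ensemble import RandomForestClassifier

# Roughly mirror the NumPy forest: 10 trees, 15 candidate features,
# unlimited depth. The models are still not identical, because sklearn
# resamples the 15 candidates at every split rather than once per tree.
clf = RandomForestClassifier(n_estimators=10, max_features=15, random_state=0)
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))
```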