2024.12.3

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


class C45Tree:
    """Binary decision-tree classifier for numeric features.

    Every internal node tests ``sample[feature] <= threshold``. Candidate
    thresholds are the distinct values observed in the training data, and
    the candidate with the highest information gain is selected.

    NOTE: despite the class name, candidate splits are ranked by plain
    information gain; the gain-ratio normalisation usually associated with
    C4.5 is not applied.

    Class labels must be non-negative integers, because class counting
    relies on ``np.bincount``.

    The fitted tree is stored in ``self.tree`` as nested dicts: a leaf is
    ``{'label': cls}`` and an internal node is
    ``{'feature': idx, 'threshold': value, 'left': node, 'right': node}``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Store the growth limits.

        Args:
            max_depth: Maximum number of split levels; ``None`` means no
                limit and ``0`` yields a single leaf.
            min_samples_split: Nodes holding fewer samples than this become
                leaves.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from the training data and return ``self``.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of non-negative integer class labels.

        Raises:
            ValueError: If ``y`` contains no samples.
        """
        # Accept plain Python sequences as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        is_pure = len(np.unique(y)) == 1
        too_small = len(y) < self.min_samples_split
        # Compare against None explicitly so that max_depth=0 means
        # "root-only tree" instead of being mistaken for "no limit".
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if is_pure or too_small or depth_reached:
            return self._create_leaf(y)

        feature, threshold = self._choose_best_split(X, y)
        if feature is None:  # no threshold separates the samples
            return self._create_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left_child = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return {
            'feature': feature,
            'threshold': threshold,
            'left': left_child,
            'right': right_child,
        }

    def _choose_best_split(self, X, y):
        """Return ``(feature, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when no candidate threshold exists, i.e.
        when every feature column holds a single distinct value.
        """
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # np.unique returns sorted values. Using the largest one as the
            # threshold would send every sample to the left child, so it is
            # dropped; otherwise an empty child could be selected and the
            # recursion would never terminate on duplicated rows.
            for threshold in np.unique(column)[:-1]:
                gain = self._information_gain(column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, feature_column, y, threshold):
        """Return the entropy reduction of splitting at ``<= threshold``.

        Returns 0 when one side of the split would be empty.
        """
        left_mask = feature_column <= threshold
        n_total = len(y)
        n_left = np.count_nonzero(left_mask)
        n_right = n_total - n_left
        if n_left == 0 or n_right == 0:
            return 0
        child_entropy = (
            (n_left / n_total) * self._entropy(y[left_mask])
            + (n_right / n_total) * self._entropy(y[~left_mask])
        )
        return self._entropy(y) - child_entropy

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        probabilities = np.bincount(y) / len(y)
        # Drop absent classes so that log2(0) is never evaluated.
        probabilities = probabilities[probabilities > 0]
        return -np.sum(probabilities * np.log2(probabilities))

    def _create_leaf(self, y):
        """Return a leaf node labelled with the most frequent class in ``y``."""
        return {'label': np.bincount(y).argmax()}

    def predict(self, X):
        """Return an array with the predicted class label for each row of ``X``."""
        return np.array([self._predict_single(sample, self.tree) for sample in np.asarray(X)])

    def _predict_single(self, sample, tree):
        """Walk ``tree`` from its root to a leaf for ``sample``; return the leaf label."""
        node = tree
        # Iterative descent: behaves like the recursive walk without
        # consuming Python stack frames on deep trees.
        while 'label' not in node:
            if sample[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['label']


# Demo: train and evaluate the C45Tree classifier on the Iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Stratified hold-out split: one third of the samples is reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=42,
    stratify=y,
)

# Fit a depth-limited tree (fit() returns the estimator) and predict the test set.
c45 = C45Tree(max_depth=3).fit(X_train, y_train)
y_pred = c45.predict(X_test)

# Report accuracy and the weighted F1 score.
print("准确率:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("F1 值:", f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))
posted @   我也不想的  阅读(3)  评论(0)  编辑  收藏  举报
(评论功能已被禁用)
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
点击右上角即可分享
微信分享提示