import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
class C45Tree:
    """Binary decision-tree classifier for numeric features.

    Every internal node tests ``sample[feature] <= threshold``.  Candidate
    thresholds are the distinct values observed in the training data, and
    the candidate with the highest information gain is chosen.

    NOTE(review): despite the class name, splits are ranked by plain
    information gain, not by the gain ratio that C4.5 normally uses.

    After ``fit`` the tree is stored in ``self.tree`` as nested dicts: an
    internal node is ``{'feature', 'threshold', 'left', 'right'}`` and a
    leaf is ``{'label': ...}``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Store the hyper-parameters; no training happens here.

        Args:
            max_depth: Maximum depth of the tree (the root is depth 0).
                ``None`` means unlimited; ``0`` yields a single leaf.
            min_samples_split: A node holding fewer samples than this is
                turned into a leaf instead of being split.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data and return ``self``.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` does not hold one label
                per row of ``X``, or the data set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X")
        if y.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being mistaken for "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(y) < self.min_samples_split or len(np.unique(y)) == 1:
            return self._create_leaf(y)

        feature, threshold = self._choose_best_split(X, y)
        if feature is None:
            # No threshold separates the samples (e.g. identical rows).
            return self._create_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        return {
            'feature': feature,
            'threshold': threshold,
            'left': self._build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def _choose_best_split(self, X, y):
        """Return the ``(feature, threshold)`` pair with the highest gain.

        Splits that would leave one side empty are never selected; when no
        usable split exists ``(None, None)`` is returned.  Ties are resolved
        in favour of the first candidate encountered.
        """
        parent_entropy = self._entropy(y)  # invariant for every candidate
        best_gain = -np.inf
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                gain = self._split_gain(column, y, threshold, parent_entropy)
                if gain is None:
                    continue  # one side would be empty
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _split_gain(self, feature_column, y, threshold, parent_entropy):
        """Return the information gain of a split, or ``None`` if a side is empty."""
        left_mask = feature_column <= threshold
        n_left = int(np.count_nonzero(left_mask))
        n = len(y)
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return None
        left_entropy = self._entropy(y[left_mask])
        right_entropy = self._entropy(y[~left_mask])
        child_entropy = (n_left / n) * left_entropy + (n_right / n) * right_entropy
        return parent_entropy - child_entropy

    def _information_gain(self, feature_column, y, threshold):
        """Return the information gain of splitting at ``threshold``.

        A split that leaves one side empty has a gain of 0.
        """
        gain = self._split_gain(feature_column, y, threshold, self._entropy(y))
        return 0 if gain is None else gain

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels ``y``."""
        if len(y) == 0:
            return 0.0
        # np.unique works for any sortable label type, not only the
        # non-negative integers that np.bincount accepts.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _create_leaf(self, y):
        """Return a leaf predicting the most frequent label in ``y``.

        Ties go to the smallest label because ``np.unique`` returns the
        labels sorted and ``argmax`` picks the first maximum.

        Raises:
            ValueError: If ``y`` is empty.
        """
        labels, counts = np.unique(y, return_counts=True)
        return {'label': labels[counts.argmax()]}

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("C45Tree must be fitted before calling predict")
        return np.array([self._predict_single(sample, self.tree) for sample in np.asarray(X)])

    def _predict_single(self, sample, tree):
        """Walk from ``tree`` down to a leaf for one sample and return its label."""
        node = tree
        while 'label' not in node:
            branch = 'left' if sample[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node['label']
# Demo: train the tree on the Iris data set and report hold-out metrics.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out one third of the samples, stratified by class, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=42,
    stratify=y,
)

# Fit a depth-limited tree (fit returns the estimator) and predict the
# held-out samples.
c45 = C45Tree(max_depth=3).fit(X_train, y_train)
y_pred = c45.predict(X_test)

# Report accuracy, then the F1 score computed with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
print("准确率:", accuracy)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 值:", weighted_f1)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人