2024.12.3

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


class C45Tree:
    """Binary decision-tree classifier for numeric features.

    Every internal node tests ``sample[feature] <= threshold``. Candidate
    thresholds are the distinct values observed in the training data, and the
    candidate with the largest information gain (entropy reduction) is chosen.

    NOTE: despite the class name, neither gain ratio nor pruning is
    implemented; plain information gain is the only split criterion.

    The fitted tree is stored in ``self.tree`` as nested dicts. An internal
    node is ``{'feature', 'threshold', 'left', 'right'}``; a leaf is
    ``{'label': <majority class>}``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Store the hyper-parameters.

        Args:
            max_depth: Maximum depth of the tree; ``None`` means unlimited and
                ``0`` yields a single leaf.
            min_samples_split: A node holding fewer samples than this becomes
                a leaf.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data and return ``self``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: If the training set is empty.
        """
        # Accept plain lists as well as ndarrays; the builder relies on
        # ndarray column slicing and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Stopping conditions. ``max_depth`` is compared against None
        # explicitly so that ``max_depth=0`` is honoured instead of being
        # mistaken for "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(y) < self.min_samples_split or len(np.unique(y)) == 1:
            return self._create_leaf(y)

        # Pick the best feature and split point.
        feature, threshold = self._choose_best_split(X, y)
        if feature is None:  # no split separates the samples any further
            return self._create_leaf(y)

        # Partition the data; both sides are guaranteed to be non-empty.
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left_child = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return {'feature': feature, 'threshold': threshold, 'left': left_child, 'right': right_child}

    def _choose_best_split(self, X, y):
        """Return the ``(feature, threshold)`` pair with the highest gain.

        Returns ``(None, None)`` when every feature is constant, i.e. when no
        threshold can divide the samples into two non-empty groups.
        """
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The largest value would put every sample on the left and leave
            # the right child empty, so it is never a usable threshold.
            for threshold in np.unique(column)[:-1]:
                gain = self._information_gain(column, y, threshold)
                # Zero-gain splits stay eligible: they are needed for
                # XOR-like data, and recursion still terminates because both
                # children are strictly smaller than the parent.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, feature_column, y, threshold):
        """Return the entropy reduction of splitting at ``<= threshold``.

        A split that leaves either side empty has a gain of 0.
        """
        left_mask = feature_column <= threshold
        n = len(y)
        n_left = int(np.count_nonzero(left_mask))
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return 0
        left_entropy = self._entropy(y[left_mask])
        right_entropy = self._entropy(y[~left_mask])
        child_entropy = (n_left / n) * left_entropy + (n_right / n) * right_entropy
        return self._entropy(y) - child_entropy

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 is always defined.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _create_leaf(self, y):
        """Return a leaf node predicting the most frequent label in ``y``.

        Ties are resolved in favour of the smallest label.
        """
        labels, counts = np.unique(y, return_counts=True)
        return {'label': labels[np.argmax(counts)]}

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict_single(sample, self.tree) for sample in np.asarray(X)])

    def _predict_single(self, sample, tree):
        """Walk ``tree`` from its root to a leaf and return that leaf's label."""
        node = tree
        while 'label' not in node:
            if sample[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['label']


# Demo: train and evaluate the tree on the Iris data set.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out one third of the samples, stratified by class, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=42,
    stratify=y,
)

# Fit a depth-limited tree and predict the held-out samples.
c45 = C45Tree(max_depth=3)
c45.fit(X_train, y_train)
y_pred = c45.predict(X_test)

# Report accuracy and the class-weighted F1 score.
accuracy = accuracy_score(y_test, y_pred)
print("准确率:", accuracy)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 值:", weighted_f1)

posted @ 2024-12-26 00:58 我也不想的阅读(3) 评论(0) 编辑收藏举报

刷新页面返回顶部

（评论功能已被禁用）

相关博文：

· 2024.11.25

· 2024.12.12

· 2024.12.2（周一）

· 2024.12.4（周三）

· 2024.12.9（周一）

公告

昵称：我也不想的
园龄： 1年11个月
粉丝： 0
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

ztydebeishanglaojia

2024.12.3

公告

搜索

常用链接

我的标签

随笔档案

阅读排行榜