

# (2) 从 scikit-learn 库中直接加载 iris 数据集;




import numpy as np

import pandas as pd

from collections import Counter


class DecisionTree:
    """ID3-style decision tree classifier that branches on exact feature values.

    The fitted tree is stored in ``self.tree``. An internal node is a dict of
    the form ``{feature_index: {feature_value: subtree}}`` with one branch per
    distinct value seen while training; a leaf is a class label.

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path, or
            ``None`` to grow until nodes are pure or cannot be split further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like holding one hashable class label per sample.

        Raises:
            ValueError: If X is not 2-D, y is not 1-D, their lengths differ,
                or there are no samples to learn from.
        """
        # Accept lists / DataFrames / Series as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1:
            raise ValueError("X must be 2-D and y must be 1-D")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a class label (leaf), a nested dict (internal node), or
        ``None`` when called with no samples.
        """
        if X.shape[0] == 0:
            # No label can be derived from an empty node.
            return None

        label_counts = Counter(y)
        majority = label_counts.most_common(1)[0][0]

        # Pure node: nothing left to separate.
        if len(label_counts) == 1:
            return majority

        # Depth limit reached. ``is not None`` keeps max_depth=0 meaningful.
        if self.max_depth is not None and depth >= self.max_depth:
            return majority

        best_feature = self._best_feature(X, y)
        if best_feature is None:
            # Every feature is constant here, so no split can make progress.
            return majority

        column = X[:, best_feature]
        branches = {}
        for value in np.unique(column):
            mask = column == value
            branches[value] = self._grow_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def _best_feature(self, X, y):
        """Return the index of the feature with the highest information gain.

        Features with a single distinct value are skipped because splitting on
        them would reproduce the same node. Returns ``None`` when no feature
        can split the samples. Ties go to the lowest feature index.
        """
        best_gain = float("-inf")
        best_feature = None

        for feature in range(X.shape[1]):
            if len(np.unique(X[:, feature])) < 2:
                continue
            gain = self._information_gain(X, y, feature)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature

        return best_feature

    @staticmethod
    def _entropy(y):
        """Return the Shannon entropy (in bits) of the labels in ``y``."""
        total = len(y)
        if total == 0:
            return 0.0
        probabilities = np.fromiter(Counter(y).values(), dtype=float) / total
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def _information_gain(self, X, y, feature):
        """Return the entropy reduction from splitting on ``feature``.

        The split mirrors ``_grow_tree``: one partition per distinct value of
        the feature column, each weighted by its share of the samples.
        """
        total = len(y)
        if total == 0:
            return 0.0

        column = X[:, feature]
        children_entropy = 0.0
        for value in np.unique(column):
            subset = y[column == value]
            children_entropy += len(subset) / total * self._entropy(subset)

        return self._entropy(y) - children_entropy

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        An entry is ``None`` when the tree has not been fitted or when a row
        contains a feature value that has no branch in the tree.
        """
        # Iterating a DataFrame would yield column names, so convert to rows.
        return [self._predict(sample, self.tree) for sample in np.asarray(X)]

    def _predict(self, sample, tree):
        """Walk ``tree`` for one sample and return the leaf it reaches.

        Returns ``None`` as soon as the sample's value for the node's feature
        was never seen during training.
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            node = node[feature].get(sample[feature])
        return node

class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers with majority voting.

    Args:
        n_estimators: Number of trees trained by ``fit``.
        max_depth: Depth limit passed to every tree (``None`` for no limit).
    """

    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Any trees from a previous call are discarded first, so fitting twice
        does not grow the ensemble.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per sample.
        """
        # _bootstrap needs ndarray fancy indexing.
        X = np.asarray(X)
        y = np.asarray(y)

        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap(X, y)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            # Keep the trained tree; without this predict() has nothing to poll.
            self.trees.append(tree)

    def _bootstrap(self, X, y):
        """Return ``(X, y)`` resampled with replacement to the original size.

        Uses NumPy's global random state, so ``np.random.seed`` controls it.
        """
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def predict(self, X):
        """Return the majority-vote label for each row of ``X`` as a list.

        Returns an empty list when the forest holds no trees. Ties go to the
        label voted first in tree order.
        """
        per_tree = [tree.predict(X) for tree in self.trees]
        # zip(*) regroups votes per sample without forcing the labels into a
        # single NumPy dtype, which could coerce mixed label types.
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*per_tree)]

# Load the iris table from the local CSV: the 'species' column is the label,
# every other column is a feature.
iris_data = pd.read_csv('iris.csv')
y = iris_data['species'].values
X = iris_data.drop(columns='species').values

# Train the hand-written random forest on the full dataset.
rf = RandomForest(max_depth=10, n_estimators=100)
rf.fit(X, y)

# Predict the same rows the forest was trained on.
predictions = rf.predict(X)

# Accuracy is the share of predictions equal to the true labels. It is
# measured on the training data, so it is an in-sample figure.
accuracy = np.mean(predictions == y)
print(f'Accuracy: {accuracy:.4f}')


from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.metrics import classification_report

import pandas as pd

from sklearn import datasets

# Load the iris table from the local CSV file.
iris_data = pd.read_csv('iris.csv')
X = iris_data.drop('species', axis=1)  # feature columns (model inputs)
y = iris_data['species']                 # target column (class labels)

# RandomForestClassifier parameters used here:
# - n_estimators: number of trees in the forest (default 100). More trees make
#   the model more complex and usually more accurate, at a higher compute cost.
# - max_depth: maximum depth of each tree (default None). None lets a tree grow
#   until all leaves are pure or hold fewer than min_samples_split samples.
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Shuffled 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold, using the estimator's default scorer.
cross_val_scores = cross_val_score(model, X, y, cv=kf)
print(f'Cross-validated accuracy: {cross_val_scores.mean():.4f}')

# Refit on the full dataset and predict the same rows, so the report below is
# an in-sample evaluation.
model.fit(X, y)
y_pred = model.predict(X)

# Build the classification report (precision, recall, F1, support per class)
# and print it; previously it was computed but never shown.
report = classification_report(y, y_pred)
print(report)


