12.20

（1）利用pandas库从本地读取iris数据集；

（2）从scikit-learn 库中直接加载iris 数据集；

（3）实现五折交叉验证进行模型训练；

（4）计算并输出模型的准确度、精度、召回率和F1值。

import numpy as np

import pandas as pd

from collections import Counter

class DecisionTree:
    """Multiway decision tree that splits on exact feature values (ID3 style).

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{feature_index: {feature_value: subtree_or_label}}``; a leaf is the
    predicted label itself.  Because branches are keyed by the exact values
    seen during training, a sample whose value has no matching branch is
    predicted as ``None``.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits along any root-to-leaf path,
                or ``None`` for no limit.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per row of ``X``.
        """
        self.tree = self._grow_tree(np.asarray(X), np.asarray(y))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``.

        Returns a label (leaf), a nested dict (internal node), or ``None``
        when there are no samples to learn from.
        """
        num_samples, num_features = X.shape
        if num_samples == 0:
            # Nothing to vote on; an empty Counter has no most-common entry.
            return None

        label_counts = Counter(y)
        majority = label_counts.most_common(1)[0][0]

        # ``is not None`` so that max_depth=0 means "leaf only" instead of
        # being mistaken for "unlimited".
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if depth_reached or num_features == 0 or len(label_counts) == 1:
            return majority

        # Select the best feature to split on
        best_feature = self._best_feature(X, y)
        if best_feature is None:
            return majority

        column = X[:, best_feature]
        values = np.unique(column)
        if len(values) < 2:
            # The best feature is constant here, so no split can shrink the
            # node; stopping guarantees the recursion terminates.
            return majority

        tree = {best_feature: {}}
        for value in values:
            mask = column == value
            tree[best_feature][value] = self._grow_tree(
                X[mask], y[mask], depth + 1
            )
        return tree

    def _best_feature(self, X, y):
        """Return the index of the feature with the highest information gain.

        Ties are resolved in favour of the lowest feature index.
        """
        best_gain = float("-inf")
        best_feature = None
        for feature in range(X.shape[1]):
            gain = self._information_gain(X, y, feature)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
        return best_feature

    @staticmethod
    def _entropy(y):
        """Return the Shannon entropy (in bits) of the labels ``y``."""
        counts = np.array(list(Counter(y).values()), dtype=float)
        probs = counts / counts.sum()
        return float(-(probs * np.log2(probs)).sum())

    def _information_gain(self, X, y, feature):
        """Return the entropy reduction from splitting on ``feature``.

        The gain is the entropy of ``y`` minus the size-weighted entropy of
        the label subsets obtained by grouping samples on each distinct value
        of the feature column.
        """
        column = X[:, feature]
        remainder = 0.0
        for value in np.unique(column):
            mask = column == value
            remainder += mask.mean() * self._entropy(y[mask])
        return self._entropy(y) - remainder

    def predict(self, X):
        """Return a list with one predicted label (or ``None``) per sample."""
        return [self._predict(sample, self.tree) for sample in X]

    def _predict(self, sample, tree):
        """Walk ``tree`` for one sample and return the leaf label.

        Returns ``None`` when the sample's value for a split feature has no
        branch in the tree.
        """
        if not isinstance(tree, dict):
            return tree
        feature = next(iter(tree))
        branch = tree[feature].get(sample[feature])
        if branch is None:
            return None
        return self._predict(sample, branch)

class RandomForest:
    """Bagging ensemble of ``DecisionTree`` models combined by majority vote."""

    def __init__(self, n_estimators=100, max_depth=None):
        """Create an unfitted forest.

        Args:
            n_estimators: Number of trees to train.
            max_depth: Depth limit passed to every tree (``None`` = no limit).
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from an empty forest so that calling fit() again replaces the
        # previous trees instead of appending to them.
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap(X, y)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap(self, X, y):
        """Return ``X`` and ``y`` resampled with replacement to the same size."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def predict(self, X):
        """Return the majority-vote label for every sample in ``X``.

        A tree that yields ``None`` for a sample is treated as abstaining; the
        result for a sample is ``None`` only when every tree abstains.  Ties
        go to the label that was voted first.
        """
        per_tree = [tree.predict(X) for tree in self.trees]
        results = []
        for votes in zip(*per_tree):
            cast = [vote for vote in votes if vote is not None]
            results.append(Counter(cast).most_common(1)[0][0] if cast else None)
        return results

# Load the iris data set from a local CSV file and split it into the label
# column ('species') and the remaining feature columns.
iris_data = pd.read_csv('iris.csv')
y = iris_data['species'].values
X = iris_data.drop(columns='species').values

# Train the hand-written random forest on the full data set.
rf = RandomForest(max_depth=10, n_estimators=100)
rf.fit(X, y)

# Predict the same samples that were used for training.
predictions = rf.predict(X)

# Accuracy is the fraction of predictions equal to the true label.
is_correct = predictions == y
accuracy = np.mean(is_correct)
print(f'Accuracy: {accuracy:.4f}')

（2）调用库方法（使用 scikit-learn 的 RandomForestClassifier）

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.metrics import classification_report

import pandas as pd

from sklearn import datasets

# Load the iris data set from a local CSV file.
iris_data = pd.read_csv('iris.csv')
y = iris_data['species']  # target labels
X = iris_data.drop(columns='species')  # input features

# Create the random forest classifier.
# RandomForestClassifier parameters:
# - n_estimators: number of trees in the forest (default 100); more trees make
#   the model more complex and usually more accurate, at a higher compute cost
# - max_depth: maximum depth of each tree (default None); None lets a tree
#   grow until every leaf is pure or holds fewer than min_samples_split samples
model = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42)

# Five-fold cross-validation with shuffled, reproducible folds.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Per-fold scores returned by cross-validation, reported as their mean.
cross_val_scores = cross_val_score(model, X, y, cv=kf)
mean_cv_score = cross_val_scores.mean()
print(f'Cross-validated accuracy: {mean_cv_score:.4f}')

# Fit on the full data set, then predict those same samples.
# NOTE(review): the report below is therefore computed on the training data,
# not on the held-out folds used above.
model.fit(X, y)
y_pred = model.predict(X)

# Print the classification report (precision, recall, F1 and accuracy).
report = classification_report(y, y_pred)
print(report)

posted @ 2024-12-21 15:28 涨涨涨张阅读(6) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 12.11

· 12.18

· 2024.11.27（周三）

· 2024.11.28（周四）

· 2024.12.9（周一）

阅读排行：
· TypeScript + Deepseek 打造卜卦网站：技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配，妙~啊~
· .NET Core 中如何实现缓存的预热？
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统

公告

昵称：涨涨涨张
园龄： 1年11个月
粉丝： 0
关注： 6

+加关注

2025年3月

日

一

二

三

四

五

六

zh-ang-zhang

12.20

公告

搜索

常用链接

随笔分类

随笔档案

阅读排行榜