12.20
(1)利用pandas库从本地读取iris数据集;
(2)从scikit-learn库中直接加载iris数据集;
(3)实现五折交叉验证进行模型训练;
(4)计算并输出模型的准确度、精度、召回率和F1值。
import numpy as np
import pandas as pd
from collections import Counter
class DecisionTree:
    """Multiway (ID3-style) decision tree classifier.

    Each internal node splits on a single feature and has one child per
    distinct value that feature takes among the node's training rows. The
    fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{feature_index: {feature_value: subtree}}``; leaves are class labels.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
                ``None`` means no limit; ``0`` yields a single leaf.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If the shapes are inconsistent or there are no rows.
        """
        # Convert up front so DataFrames / lists support X[:, j] and masks.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D: (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the rows ``X`` / labels ``y``.

        Returns a class label (leaf), a ``{feature: {value: subtree}}`` dict,
        or ``None`` when there are no rows to learn from.
        """
        if X.shape[0] == 0:
            return None

        label_counts = Counter(y)
        majority_label = label_counts.most_common(1)[0][0]

        # ``is not None`` so that max_depth=0 is honoured instead of being
        # mistaken for "no limit" by a truthiness test.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(label_counts) == 1 or depth_exhausted:
            return majority_label

        best_feature = self._best_feature(X, y)
        if best_feature is None:
            # Every feature is constant here: no split can separate the rows.
            return majority_label

        column = X[:, best_feature]
        branches = {}
        for value in np.unique(column):
            mask = column == value
            branches[value] = self._grow_tree(X[mask], y[mask], depth + 1)
        return {best_feature: branches}

    def _best_feature(self, X, y):
        """Return the index of the feature with the highest information gain.

        Features with fewer than two distinct values are skipped because
        splitting on them cannot partition the rows (and would recurse
        forever). Returns ``None`` when no feature is usable. Ties keep the
        lowest feature index.
        """
        best_gain = -1
        best_feature = None
        for feature in range(X.shape[1]):
            if np.unique(X[:, feature]).size < 2:
                continue
            gain = self._information_gain(X, y, feature)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
        return best_feature

    @staticmethod
    def _entropy(y):
        """Return the Shannon entropy (in bits) of the labels ``y``."""
        counts = np.array(list(Counter(y).values()), dtype=float)
        if counts.size == 0:
            return 0.0
        probs = counts / counts.sum()
        # Adding 0.0 normalises a possible -0.0 for pure label sets.
        return float(-(probs * np.log2(probs)).sum()) + 0.0

    def _information_gain(self, X, y, feature):
        """Return the entropy reduction from a multiway split on ``feature``.

        Computed as ``H(y) - sum_v (n_v / n) * H(y | feature == v)`` over the
        distinct values ``v`` of the feature column.
        """
        total = len(y)
        if total == 0:
            return 0.0
        column = X[:, feature]
        remainder = 0.0
        for value in np.unique(column):
            subset = y[column == value]
            remainder += len(subset) / total * self._entropy(subset)
        return self._entropy(y) - remainder

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        An entry is ``None`` when the tree has no branch for a feature value
        met on the way down (or when the tree has not been fitted).
        """
        return [self._predict(sample, self.tree) for sample in np.asarray(X)]

    def _predict(self, sample, tree):
        """Walk ``tree`` for one sample and return the label that is reached."""
        if not isinstance(tree, dict):
            return tree
        feature = next(iter(tree))
        branch = tree[feature].get(sample[feature])
        if branch is None:
            # Feature value never seen at this node during training.
            return None
        return self._predict(sample, branch)
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers.

    Every tree is trained on a bootstrap resample of the training rows and
    the forest predicts by majority vote over the trees.
    """

    def __init__(self, n_estimators=100, max_depth=None):
        """Create an unfitted forest.

        Args:
            n_estimators: Number of trees trained by ``fit``.
            max_depth: ``max_depth`` passed to every ``DecisionTree``.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
        """
        # Positional indexing in _bootstrap needs real arrays, not DataFrames.
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from scratch so a refit does not keep trees from earlier data.
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap(X, y)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap(self, X, y):
        """Return ``(X, y)`` resampled with replacement to the same length.

        Uses NumPy's global random state (``np.random``).
        """
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    @staticmethod
    def _majority_vote(votes):
        """Return the most common non-``None`` vote, or ``None`` if none exist.

        A ``None`` vote is a tree abstaining, so it must not be able to win.
        """
        cast = [vote for vote in votes if vote is not None]
        if not cast:
            return None
        return Counter(cast).most_common(1)[0][0]

    def predict(self, X):
        """Return a list with the majority-vote label for each row of ``X``.

        Returns an empty list when the forest holds no trees.
        """
        per_tree = [tree.predict(X) for tree in self.trees]
        # zip(*...) regroups the per-tree lists into per-sample vote tuples
        # without letting NumPy coerce the label types.
        return [self._majority_vote(votes) for votes in zip(*per_tree)]
# Load the iris data set from a local CSV file; the 'species' column holds
# the class label and every other column is a feature.
iris_data = pd.read_csv('iris.csv')
y = iris_data['species'].values
X = iris_data.drop(columns='species').values

# Train the hand-written random forest: 100 trees, depth capped at 10.
rf = RandomForest(max_depth=10, n_estimators=100)
rf.fit(X, y)

# Predict on the very rows the forest was trained on.
predictions = rf.predict(X)

# Accuracy is the fraction of predictions equal to the true labels.
# NOTE(review): this is training-set accuracy, not a held-out estimate.
accuracy = (predictions == y).mean()
print(f'Accuracy: {accuracy:.4f}')
(2)调用库方法
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import classification_report
import pandas as pd
from sklearn import datasets
# Load the iris data set from a local CSV file.
iris_data = pd.read_csv('iris.csv')
y = iris_data['species']  # target labels
X = iris_data.drop(columns='species')  # input features

# Random forest classifier. Parameters used here:
# - n_estimators: number of trees in the forest.
# - max_depth: maximum depth allowed for each tree.
# - random_state: fixed seed so that repeated runs give the same result.
model = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)

# Five shuffled folds, seeded for reproducibility.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold (the classifier's default score); print their mean.
cross_val_scores = cross_val_score(model, X, y, cv=kf)
print(f'Cross-validated accuracy: {cross_val_scores.mean():.4f}')

# Fit on the complete data set and predict those same rows.
y_pred = model.fit(X, y).predict(X)

# Per-class precision, recall and F1 plus overall accuracy.
# NOTE(review): the report is computed on the training rows, so it is an
# optimistic estimate compared with the cross-validated score above.
report = classification_report(y, y_pred)
print(report)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统