from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.tree import DecisionTreeClassifier
# (1) Load the iris dataset and hold out one third of it as a test set.
# The split is stratified so both parts keep the original class proportions.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)

# (2) Train a decision tree that uses both pre-pruning and post-pruning.
# Pre-pruning: max_depth and min_samples_leaf stop the tree from growing
# further while it is being built.
# Post-pruning: a positive ccp_alpha enables minimal cost-complexity pruning,
# which removes the weakest subtrees after the tree has been grown.
# The ccp_alpha value is a small illustrative choice; candidate values can be
# obtained from DecisionTreeClassifier.cost_complexity_pruning_path.
clf = DecisionTreeClassifier(
    max_depth=3, min_samples_leaf=2, ccp_alpha=0.01, random_state=42
)
clf.fit(X_train, y_train)

# (3) Evaluate the model with 5-fold cross-validation on the training set.
# cross_validate scores every metric from a single pass over the folds, so
# each fold is fitted once instead of once per metric. It works on clones of
# clf, so the model fitted above is left untouched.
cv = StratifiedKFold(n_splits=5)
cv_scoring = {
    '准确率': 'accuracy',
    '精确率': 'precision_macro',
    '召回率': 'recall_macro',
    'F1值': 'f1_macro',
}
cv_results = cross_validate(
    clf, X_train, y_train, cv=cv, scoring=list(cv_scoring.values())
)

# Print the mean cross-validated score of each metric.
for label, scorer in cv_scoring.items():
    fold_scores = cv_results['test_' + scorer]
    print(f'训练集{label}: {fold_scores.mean():.2f}')

# (4) Measure performance on the held-out test set.
# Precision, recall and F1 are macro-averaged over the three classes.
y_pred = clf.predict(X_test)
test_scores = {
    '准确率': accuracy_score(y_test, y_pred),
    '精确率': precision_score(y_test, y_pred, average='macro'),
    '召回率': recall_score(y_test, y_pred, average='macro'),
    'F1值': f1_score(y_test, y_pred, average='macro'),
}

# Print the test-set results.
for label, value in test_scores.items():
    print(f'测试集{label}: {value:.2f}')