# 2024.12.6 (Friday)
# Import the required libraries.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder


def _cluster_to_class_mapping(true_labels, cluster_labels):
    """Return a dict that maps each cluster id to a class label.

    K-means numbers its clusters arbitrarily, so cluster ids cannot be
    compared with class labels directly. A contingency table (clusters x
    classes) is built and a one-to-one assignment that maximizes the number
    of agreeing samples is computed. If there are more clusters than
    classes, the clusters left unassigned fall back to their majority class.

    Args:
        true_labels: 1-D array-like of ground-truth class labels.
        cluster_labels: 1-D array-like of cluster ids for the same samples.

    Returns:
        dict mapping every cluster id present in ``cluster_labels`` to one
        of the labels present in ``true_labels``.

    Raises:
        ValueError: if the two inputs do not have the same length.
    """
    true_labels = np.asarray(true_labels).ravel()
    cluster_labels = np.asarray(cluster_labels).ravel()
    if true_labels.shape != cluster_labels.shape:
        raise ValueError(
            "true_labels and cluster_labels must have the same length, got "
            f"{true_labels.shape[0]} and {cluster_labels.shape[0]}"
        )

    classes, class_idx = np.unique(true_labels, return_inverse=True)
    clusters, cluster_idx = np.unique(cluster_labels, return_inverse=True)

    # contingency[i, j] = number of samples in cluster i whose class is j.
    contingency = np.zeros((clusters.size, classes.size), dtype=np.int64)
    np.add.at(contingency, (cluster_idx, class_idx), 1)

    rows, cols = linear_sum_assignment(contingency, maximize=True)
    mapping = {clusters[r]: classes[c] for r, c in zip(rows, cols)}

    # Clusters not covered by the one-to-one assignment (only possible when
    # there are more clusters than classes) take their majority class.
    for i, cluster in enumerate(clusters):
        if cluster not in mapping:
            mapping[cluster] = classes[np.argmax(contingency[i])]
    return mapping


def clustering_metrics(true_labels, predicted_labels, *, mapping=None):
    """Score cluster assignments against ground-truth class labels.

    The predicted cluster ids are first translated into class labels, then
    accuracy and weighted precision / recall / F1 are computed.

    Args:
        true_labels: 1-D array-like of ground-truth class labels.
        predicted_labels: 1-D array-like of predicted cluster ids.
        mapping: optional dict from cluster id to class label, typically
            learned on training data. When omitted, the mapping is derived
            from the labels passed in, which aligns the clusters using the
            very samples being scored and is therefore optimistic.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.

    Raises:
        ValueError: if a predicted cluster id is missing from ``mapping``.
    """
    true_labels = np.asarray(true_labels).ravel()
    predicted_labels = np.asarray(predicted_labels).ravel()

    if mapping is None:
        mapping = _cluster_to_class_mapping(true_labels, predicted_labels)

    try:
        aligned_labels = np.array([mapping[c] for c in predicted_labels])
    except KeyError as err:
        raise ValueError(
            f"cluster id {err.args[0]!r} has no entry in the cluster-to-class mapping"
        ) from err

    # Compute the evaluation metrics on the aligned labels.
    accuracy = accuracy_score(true_labels, aligned_labels)
    precision = precision_score(true_labels, aligned_labels, average='weighted')
    recall = recall_score(true_labels, aligned_labels, average='weighted')
    f1 = f1_score(true_labels, aligned_labels, average='weighted')
    return accuracy, precision, recall, f1


# (1) Load the iris dataset from scikit-learn and hold out 1/3 of the
#     samples as the test set (train:test = 2:1, stratified by class).
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=42, stratify=y
)

# (2) Train K-means with 3 clusters on the training set, and learn which
#     cluster corresponds to which class from the training labels.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train)
train_mapping = _cluster_to_class_mapping(y_train, kmeans.labels_)

# (3) Evaluate with five-fold cross-validation.
# K-means is unsupervised and its cluster ids carry no class meaning, so in
# every fold the cluster-to-class mapping is learned on the fold's training
# part and then applied to the validation predictions.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, val_index in cv.split(X_train, y_train):
    X_cv_train, X_cv_val = X_train[train_index], X_train[val_index]
    y_cv_train, y_cv_val = y_train[train_index], y_train[val_index]

    # Use a fresh estimator per fold so the model fitted on the full
    # training set in step (2) is not overwritten before step (4).
    fold_kmeans = KMeans(n_clusters=3, random_state=42)
    fold_kmeans.fit(X_cv_train)
    fold_mapping = _cluster_to_class_mapping(y_cv_train, fold_kmeans.labels_)

    # Predict clusters for the validation part.
    y_pred = fold_kmeans.predict(X_cv_val)

    # Compute and record the metrics for this fold.
    accuracy, precision, recall, f1 = clustering_metrics(
        y_cv_val, y_pred, mapping=fold_mapping
    )
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Report the averaged cross-validation results.
print(f"交叉验证的平均准确率: {np.mean(accuracies):.4f}")
print(f"交叉验证的平均精度: {np.mean(precisions):.4f}")
print(f"交叉验证的平均召回率: {np.mean(recalls):.4f}")
print(f"交叉验证的平均F1值: {np.mean(f1_scores):.4f}")

# (4) Evaluate on the held-out test set with the model trained on the full
#     training set and the mapping learned from the training labels.
y_test_pred = kmeans.predict(X_test)

# Compute the metrics on the test set.
test_accuracy, test_precision, test_recall, test_f1 = clustering_metrics(
    y_test, y_test_pred, mapping=train_mapping
)

# Report the test-set results.
print(f"测试集的准确率: {test_accuracy:.4f}")
print(f"测试集的精度: {test_precision:.4f}")
print(f"测试集的召回率: {test_recall:.4f}")
print(f"测试集的F1值: {test_f1:.4f}")