# 2024.12.6(周五)

# 导入相关库
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# (1) Load the iris dataset from scikit-learn and hold out 1/3 of the samples
# as the test set (hold-out method).
iris = load_iris()
X, y = iris.data, iris.target

# Stratified 2:1 train/test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    stratify=y,
    random_state=42,
)

# (2) Fit K-means with 3 clusters on the training set.
# `fit` returns the estimator itself, so the call can be chained.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_train)


# (3)使用五折交叉验证对模型性能进行评估
# 注意:对于 KMeans 聚类模型,需要做一些处理才能用交叉验证评估
# 因为 KMeans 是无监督学习算法,它的目标是将样本分成 K 个簇,并且簇的标签是无关的
# 所以我们需要将预测的簇标签与真实的标签进行映射。

# 交叉验证评分的评估指标
def clustering_metrics(true_labels, predicted_labels):
    """Score cluster assignments against ground-truth class labels.

    Cluster ids produced by a clustering model are arbitrary: cluster ``0``
    need not correspond to class ``0``.  Comparing raw cluster ids with class
    labels therefore yields meaningless scores, so every cluster is first
    relabelled with the most frequent true class among its members (majority
    vote) and the metrics are computed on the relabelled predictions.

    The cluster-to-class mapping is derived from the very samples being
    scored, so the result is a purity-style, somewhat optimistic estimate.

    Args:
        true_labels: 1-D array-like of ground-truth class labels.
        predicted_labels: 1-D array-like of cluster ids, aligned element by
            element with ``true_labels``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.  Precision, recall and
        F1 are computed with ``average='weighted'``.
    """
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)

    # Translate each cluster id into the class label that dominates it.
    mapped_labels = np.empty_like(true_labels)
    for cluster_id in np.unique(predicted_labels):
        members = predicted_labels == cluster_id
        classes, counts = np.unique(true_labels[members], return_counts=True)
        mapped_labels[members] = classes[np.argmax(counts)]

    # Compute the evaluation metrics on the class-aligned predictions.
    accuracy = accuracy_score(true_labels, mapped_labels)
    precision = precision_score(true_labels, mapped_labels, average='weighted')
    recall = recall_score(true_labels, mapped_labels, average='weighted')
    f1 = f1_score(true_labels, mapped_labels, average='weighted')

    return accuracy, precision, recall, f1


# (3) Five-fold stratified cross-validation on the training set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, val_index in cv.split(X_train, y_train):
    X_cv_train, X_cv_val = X_train[train_index], X_train[val_index]
    y_cv_train, y_cv_val = y_train[train_index], y_train[val_index]

    # Fit a fresh estimator with the same hyper-parameters for every fold.
    # Refitting the shared `kmeans` object here would overwrite the model
    # trained on the whole training set, and the later test-set evaluation
    # would then silently use a model fitted on the last fold's subset only.
    fold_kmeans = KMeans(**kmeans.get_params())
    fold_kmeans.fit(X_cv_train)

    # Assign the validation samples of this fold to clusters.
    y_pred = fold_kmeans.predict(X_cv_val)

    # Compute and record the evaluation metrics for this fold.
    accuracy, precision, recall, f1 = clustering_metrics(y_cv_val, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Report the metrics averaged over the five folds.
print(f"交叉验证的平均准确率: {np.mean(accuracies):.4f}")
print(f"交叉验证的平均精度: {np.mean(precisions):.4f}")
print(f"交叉验证的平均召回率: {np.mean(recalls):.4f}")
print(f"交叉验证的平均F1值: {np.mean(f1_scores):.4f}")

# (4) Evaluate on the held-out test set.
# Assign every test sample to a cluster with the module-level `kmeans` model.
y_test_pred = kmeans.predict(X_test)

# Score the test-set cluster assignments against the true labels.
test_accuracy, test_precision, test_recall, test_f1 = clustering_metrics(
    true_labels=y_test,
    predicted_labels=y_test_pred,
)

# Report the test-set metrics, one per line.
print(
    f"测试集的准确率: {test_accuracy:.4f}",
    f"测试集的精度: {test_precision:.4f}",
    f"测试集的召回率: {test_recall:.4f}",
    f"测试集的F1值: {test_f1:.4f}",
    sep="\n",
)

 

# posted @ 2024-12-02 16:47  记得关月亮  阅读(1)  评论(0)  编辑  收藏  举报