12.12

(1)从scikit-learn 库中加载 iris 数据集,使用留出法留出 1/3 的样本作为测试集(注意同分布取样);

(2)使用训练集训练K均值聚类算法,类别数为3;

(3)使用五折交叉验证对模型性能(准确度、精度、召回率和 F1 值)进行评估和选择;

(4)使用测试集,测试模型的性能,对测试结果进行分析,完成实验报告中实验七的部分。

 

import numpy as np

import pandas as pd

from sklearn import datasets

from sklearn.model_selection import train_test_split, KFold

from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

 

# (1) Load the iris dataset and hold out roughly one third of the samples
#     as the test set.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold-out split: 33% goes to the test set; `stratify=y` keeps the class
# proportions the same in both parts, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# (2) Fit K-means with 3 clusters on the training portion only.
#     `fit` returns the estimator itself, so the call can be chained.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_train)

 

# (3) Evaluate the model with 5-fold cross-validation on the training set.
#
# K-means is unsupervised, so its cluster ids (0..2) are arbitrary and must
# be translated to class labels before classification metrics can be
# computed. The translation is learned from the *training* fold only and
# then applied to the validation fold; building it from the validation
# labels would leak the ground truth into the predictions being scored.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

acc_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Fit K-means on this fold's training part.
    kmeans_fold = KMeans(n_clusters=3, random_state=42)
    kmeans_fold.fit(X_train_fold)

    # Count matrix on the training fold: rows are true classes, columns
    # are cluster ids assigned during fitting.
    confusion = np.zeros((3, 3), dtype=int)
    np.add.at(confusion, (y_train_fold, kmeans_fold.labels_), 1)

    # Each cluster is mapped to the class that is most frequent inside it.
    cluster_to_class = confusion.argmax(axis=0)

    # Predict clusters for the validation part and translate them to classes.
    y_val_pred = kmeans_fold.predict(X_val_fold)
    y_pred_mapped = cluster_to_class[y_val_pred]

    acc_scores.append(accuracy_score(y_val_fold, y_pred_mapped))
    precision_scores.append(precision_score(y_val_fold, y_pred_mapped, average='macro'))
    recall_scores.append(recall_score(y_val_fold, y_pred_mapped, average='macro'))
    f1_scores.append(f1_score(y_val_fold, y_pred_mapped, average='macro'))

# Report the mean of each metric over the five folds.
print("五折交叉验证性能:")
print("平均准确度: ", np.mean(acc_scores))
print("平均精度: ", np.mean(precision_scores))
print("平均召回率: ", np.mean(recall_scores))
print("平均 F1 值: ", np.mean(f1_scores))

 

# (4) Measure the performance of the trained model on the held-out test set.
#
# The cluster-id -> class translation is derived from the training data
# (cluster assignments stored by `fit` versus the training labels). The test
# labels are used only for scoring; building the mapping from them would
# leak the ground truth and inflate the reported metrics.
train_confusion = np.zeros((3, 3), dtype=int)
np.add.at(train_confusion, (y_train, kmeans.labels_), 1)

# Each cluster is mapped to the class that is most frequent inside it.
cluster_to_class = train_confusion.argmax(axis=0)

# Assign test samples to clusters, then translate cluster ids to classes.
test_labels = kmeans.predict(X_test)
test_pred_mapped = cluster_to_class[test_labels]

# Classification metrics on the test set (macro-averaged over the classes).
test_accuracy = accuracy_score(y_test, test_pred_mapped)
test_precision = precision_score(y_test, test_pred_mapped, average='macro')
test_recall = recall_score(y_test, test_pred_mapped, average='macro')
test_f1 = f1_score(y_test, test_pred_mapped, average='macro')

print("\n测试集性能:")
print("准确度: ", test_accuracy)
print("精度: ", test_precision)
print("召回率: ", test_recall)
print("F1值: ", test_f1)

# Coarse verdict based on test accuracy thresholds (0.9 and 0.7).
print("\n模型性能分析:")
if test_accuracy > 0.9:
    print("模型表现优异!")
elif test_accuracy > 0.7:
    print("模型表现良好,存在进一步优化的空间。")
else:
    print("模型表现不佳,应考虑调整模型或其他优化方案。")

 

posted @   涨涨涨张  阅读(5)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统
点击右上角即可分享
微信分享提示