# 导入必要的库
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, make_scorer
)
import numpy as np
from collections import Counter
# Step 1: load the iris dataset and split it into training and test sets
iris = load_iris()
X, y = iris.data, iris.target  # feature matrix and class labels

# Hold-out split: one third of the samples go to the test set; stratify=y
# keeps the class proportions the same in both parts.  The split is done on
# the raw features so that the scaler below never sees the test samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)

# Standardize using statistics estimated on the training set only; fitting
# the scaler on the full dataset would leak test-set information into the
# features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so the name stays defined: the full dataset transformed with the
# training-set statistics.
X_scaled = scaler.transform(X)
# 修正后的类别映射函数
def map_clusters_to_labels(y_true, y_pred):
    """Relabel cluster ids with the majority true class of each cluster.

    Args:
        y_true: 1-D sequence of ground-truth class labels.
        y_pred: 1-D sequence of cluster ids, aligned element-wise with
            ``y_true``.

    Returns:
        A NumPy array shaped like ``y_pred`` in which every sample carries
        the most frequent ``y_true`` value among the samples of its cluster.
        The array uses the dtype of ``y_true`` so non-integer class labels
        survive the mapping.  Ties are resolved in favour of the label that
        appears first within the cluster.
    """
    # Accept plain lists as well as arrays; boolean masking needs ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # The result holds class labels, so it takes y_true's dtype, not y_pred's.
    labels = np.zeros(y_pred.shape, dtype=y_true.dtype)

    # Every id yielded by np.unique occurs at least once in y_pred, so the
    # mask is never empty and no emptiness guard is required.
    for cluster in np.unique(y_pred):
        mask = y_pred == cluster
        majority_label = Counter(y_true[mask].tolist()).most_common(1)[0][0]
        labels[mask] = majority_label
    return labels
# Step 2: build the K-means model (three clusters, one per iris species)
# and fit it on the training features in a single expression; fit() returns
# the estimator itself.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_train)

# Translate the training-set cluster ids into class labels by majority vote.
y_train_pred = map_clusters_to_labels(y_train, kmeans.labels_)
# Step 3: evaluate the model with five-fold cross-validation
# Metrics to report: plain accuracy plus macro-averaged precision, recall
# and F1 (zero_division=0 scores a class with no predictions as 0 instead of
# emitting a warning).
scoring = {
    metric_name: make_scorer(metric_fn, **metric_options)
    for metric_name, metric_fn, metric_options in (
        ('accuracy', accuracy_score, {}),
        ('precision_macro', precision_score, {'average': 'macro', 'zero_division': 0}),
        ('recall_macro', recall_score, {'average': 'macro', 'zero_division': 0}),
        ('f1_macro', f1_score, {'average': 'macro', 'zero_division': 0}),
    )
}
# 自定义交叉验证函数
from sklearn.base import ClusterMixin, BaseEstimator
class KMeansWithMapping(ClusterMixin, BaseEstimator):
    """K-means estimator whose predictions are expressed as class labels.

    When ``fit`` receives ``y``, each cluster is associated with the majority
    class of its training samples and ``predict`` returns those class labels,
    which makes the estimator usable with classification scorers.  Without
    ``y`` it behaves like plain K-means and works with raw cluster ids.

    Args:
        n_clusters: number of clusters passed to ``KMeans``.
        random_state: seed passed to ``KMeans``.
    """

    def __init__(self, n_clusters=3, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit K-means on ``X`` and, if ``y`` is given, learn cluster -> class.

        Sets ``kmeans`` (the fitted ``KMeans``), ``labels_`` (per-sample
        labels of the training data) and ``cluster_to_label_`` (dict mapping
        cluster id to class label, or ``None`` when ``y`` is not supplied).
        """
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans.fit(X)
        cluster_ids = self.kmeans.labels_

        if y is None:
            # No ground truth: expose the raw cluster ids so that labels_
            # (read by ClusterMixin.fit_predict) is always defined.
            self.cluster_to_label_ = None
            self.labels_ = cluster_ids
            return self

        self.labels_ = map_clusters_to_labels(np.asarray(y), cluster_ids)
        # All members of a cluster share one mapped label, so the first
        # member is enough to record the cluster's class.
        self.cluster_to_label_ = {
            int(cluster): self.labels_[cluster_ids == cluster][0]
            for cluster in np.unique(cluster_ids)
        }
        return self

    def predict(self, X):
        """Assign ``X`` to clusters and return the mapped class labels.

        Returns raw cluster ids when the estimator was fitted without ``y``.
        """
        cluster_ids = self.kmeans.predict(X)
        if self.cluster_to_label_ is None:
            return cluster_ids
        # Previously the raw cluster ids were returned here, so scorers
        # compared arbitrary cluster numbers against the true classes.
        # A cluster id without a recorded class falls back to the raw id.
        return np.array(
            [self.cluster_to_label_.get(int(c), c) for c in cluster_ids]
        )
# Build the label-mapping K-means estimator and score it with five-fold
# cross-validation on the training set.
kmeans_cv = KMeansWithMapping(n_clusters=3, random_state=42)
cv_results = cross_validate(kmeans_cv, X_train, y_train, cv=5, scoring=scoring)

# Report mean and standard deviation of each metric across the folds.
print("五折交叉验证结果:")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"{metric}: {mean:.4f} ± {std:.4f}")
# Step 4: evaluate the model on the held-out test set
# The cluster -> class mapping is taken from the training data only.
# Deriving it from y_test would let the test labels decide their own
# predictions and inflate every metric below.
train_clusters = kmeans.labels_
cluster_to_label = {
    int(cluster): y_train_pred[train_clusters == cluster][0]
    for cluster in np.unique(train_clusters)
}

# Assign each test sample to its nearest centroid, then translate the
# cluster id into the class learned for that cluster.
y_test_pred = np.array(
    [cluster_to_label[int(cluster)] for cluster in kmeans.predict(X_test)]
)

# Report accuracy and macro-averaged precision / recall / F1.
print("\n测试集性能报告:")
print(f"准确度: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"精度: {precision_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
print(f"召回率: {recall_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
print(f"F1 值: {f1_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人