# 2024.12.13

# 导入必要的库
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, make_scorer
)
import numpy as np
from collections import Counter

# Step 1: load the iris dataset and split it into training and test sets
iris = load_iris()
X, y = iris.data, iris.target  # feature matrix and class labels

# Hold-out split: one third of the samples form the test set, and stratify=y
# keeps the class proportions the same in both parts. The split is done on the
# raw features so that the scaler below never sees the test samples.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)

# Standardize with statistics learned from the training part only. Fitting the
# scaler on the full dataset would leak test-set mean/variance into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset on the training-set scale. Nothing below uses it; the name is
# kept only because earlier versions of this script defined it.
X_scaled = scaler.transform(X)

# 修正后的类别映射函数
def map_clusters_to_labels(y_true, y_pred):
    """Relabel cluster assignments with the majority true class of each cluster.

    Args:
        y_true: Array-like of ground-truth labels, one per sample.
        y_pred: Array-like of cluster ids, aligned element-wise with y_true.

    Returns:
        A NumPy array shaped like y_pred, with the dtype of y_true, in which
        every sample carries the most frequent true label among the samples
        that share its cluster. When several labels tie, the one that occurs
        first within the cluster wins (Counter.most_common ordering).
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # The result holds class labels, so it takes y_true's dtype (not y_pred's);
    # otherwise string or float labels could not be stored.
    labels = np.zeros(y_pred.shape, dtype=y_true.dtype)

    # np.unique only yields ids that actually occur, so no mask is ever empty.
    for cluster in np.unique(y_pred):
        mask = y_pred == cluster
        most_common_label = Counter(y_true[mask].tolist()).most_common(1)[0][0]
        labels[mask] = most_common_label
    return labels

# Step 2: build and train the K-means clustering model
# Three clusters, matching the number of classes. fit() returns the estimator
# itself, so construction and training on the training set are chained.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_train)

# Express the training-set cluster assignments as class labels
y_train_pred = map_clusters_to_labels(y_true=y_train, y_pred=kmeans.labels_)

# Step 3: evaluate the model with five-fold cross-validation
# Metrics to collect: plain accuracy, plus precision, recall and F1 with
# average='macro' and zero_division=0.
scoring = {'accuracy': make_scorer(accuracy_score)}
scoring.update(
    (f'{name}_macro', make_scorer(score_fn, average='macro', zero_division=0))
    for name, score_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
)

# 自定义交叉验证函数
from sklearn.base import ClusterMixin, BaseEstimator

class KMeansWithMapping(ClusterMixin, BaseEstimator):
    """K-means clusterer whose predictions are expressed as class labels.

    When fitted with labels, each cluster is associated with the majority
    class of its training samples, and predict() returns that class for every
    sample instead of the arbitrary cluster index. This makes the output
    directly comparable with y in supervised scorers.

    Parameters:
        n_clusters: Number of clusters passed to KMeans.
        random_state: Seed passed to KMeans.

    Attributes set by fit():
        kmeans: The fitted KMeans instance.
        labels_: Training-sample labels; mapped class labels when y was given,
            raw cluster indices otherwise.
        cluster_to_label_: Array indexed by cluster id giving the class label
            of that cluster, or None when fit() received no y.
    """

    def __init__(self, n_clusters=3, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(self, X, y=None):
        """Cluster X and, if y is given, learn the cluster -> class lookup.

        Returns:
            self, so calls can be chained.
        """
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans.fit(X)
        cluster_ids = self.kmeans.labels_

        if y is None:
            # Unsupervised use: no mapping available, expose raw assignments so
            # that labels_ (and therefore fit_predict) is always defined.
            self.cluster_to_label_ = None
            self.labels_ = cluster_ids
            return self

        self.labels_ = map_clusters_to_labels(np.asarray(y), cluster_ids)

        # All members of a cluster share one mapped label, so scattering the
        # mapped labels by cluster id yields a consistent lookup table.
        lookup = np.zeros(self.n_clusters, dtype=self.labels_.dtype)
        lookup[cluster_ids] = self.labels_
        self.cluster_to_label_ = lookup
        return self

    def predict(self, X):
        """Return the class label of the nearest cluster for each sample in X.

        Falls back to raw cluster indices when the model was fitted without y.
        """
        cluster_ids = self.kmeans.predict(X)
        if self.cluster_to_label_ is None:
            return cluster_ids
        # Translate cluster indices into the class labels learned during fit;
        # without this step scorers would compare arbitrary ids against y.
        return self.cluster_to_label_[cluster_ids]

# Estimator that combines K-means with the cluster-to-label mapping
kmeans_cv = KMeansWithMapping(n_clusters=3, random_state=42)

# Five-fold cross-validation on the training set
cv_results = cross_validate(kmeans_cv, X_train, y_train, cv=5, scoring=scoring)

# Report mean ± standard deviation across folds for every metric
print("五折交叉验证结果:")
for metric in scoring:
    fold_scores = cv_results[f"test_{metric}"]
    print(f"{metric}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

# Step 4: evaluate the model on the held-out test set
# Build the cluster -> class lookup from the TRAINING assignments only.
# Deriving it from y_test would let the test labels shape the predictions and
# inflate every metric below. All training samples of a cluster share one
# mapped label, so scattering y_train_pred by cluster id is consistent.
cluster_to_label = np.zeros(kmeans.n_clusters, dtype=y_train_pred.dtype)
cluster_to_label[kmeans.labels_] = y_train_pred

# Assign each test sample to its nearest cluster, then translate to a class
y_test_pred = cluster_to_label[kmeans.predict(X_test)]

# Compute and print the performance metrics
print("\n测试集性能报告:")
print(f"准确度: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"精度: {precision_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
print(f"召回率: {recall_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
print(f"F1 值: {f1_score(y_test, y_test_pred, average='macro', zero_division=0):.4f}")
posted @   我也不想的  阅读(4)  评论(0编辑  收藏  举报
(评论功能已被禁用)
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
点击右上角即可分享
微信分享提示