Python 聚类分析

 

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist
from sklearn import metrics

# Load the raw data. Each non-blank line is a comma-separated record whose
# first field is parsed as an integer label and whose remaining fields are
# parsed as float features.
X = []
id = []  # NOTE(review): shadows the builtin id(); name kept as module-level state

# The context manager guarantees the file handle is closed, even if a field
# fails to parse part-way through the file.
with open('data/wina.data') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise make int() raise on an empty first field.
            continue
        label, *features = line.split(",")
        id.append(int(label))
        X.append([float(item) for item in features])
# Convert to numpy arrays
X = np.array(X)
y_true = np.array(id)


# Rescale every feature column with a default-configured MinMaxScaler
# (default feature_range is (0, 1)); fitting and transforming are done as two
# explicit steps on the same data.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X = min_max_scaler.transform(X)


# Elbow method: fit K-means for each candidate k in 1..9 and record the mean
# Euclidean distance from every sample to its nearest cluster centre, then
# plot that value against k.
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # cdist yields one row per sample and one column per centre; the row-wise
    # minimum is each sample's distance to its closest centre. Averaging with
    # ndarray.mean() stays vectorised instead of iterating the array
    # element-by-element with the builtin sum().
    nearest = cdist(X, kmeans.cluster_centers_, 'euclidean').min(axis=1)
    meandistortions.append(nearest.mean())
plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('meandistortions')
plt.title('best K of the model')
plt.show()
# Number of clusters used for the final model.
n_clusters = 3


# Fit the final K-means model, then label every sample with its cluster index.
cls = KMeans(n_clusters=n_clusters)
cls.fit(X)
y_pre = cls.predict(X)

n_samples, n_features = X.shape  # total number of samples, number of features
# Fitted model's inertia_ (scikit-learn documents it as the sum of squared
# distances of samples to their closest cluster centre).
inertias = cls.inertia_
# The first two scores compare predictions against y_true; the silhouette
# score is computed from X and the predicted labels only.
adjusted_rand_s = metrics.adjusted_rand_score(y_true, y_pre)  # adjusted Rand index
homogeneity_s = metrics.homogeneity_score(y_true, y_pre)  # homogeneity score
silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')  # mean silhouette coefficient
for caption, value in (
    ("兰德指数ART", adjusted_rand_s),
    ("同质化得分homo", homogeneity_s),
    ("平均轮廓系数", silhouette_s),
):
    print(caption, value)

# Coordinates of the fitted cluster centres, one row per cluster.
centers = cls.cluster_centers_

colors = ['#ff0000', '#00ff00', '#0000ff']  # one colour per cluster
plt.figure()  # open a new figure
for i in range(n_clusters):  # draw each cluster in turn
    index_sets = np.where(y_pre == i)  # indices of the samples assigned to cluster i
    cluster = X[index_sets]  # rows of X belonging to cluster i
    # Plot feature 0 on the x axis against feature 1 on the y axis. Passing
    # column 0 for both axes would collapse every point onto the diagonal.
    # NOTE(review): assumes X has at least two feature columns.
    plt.scatter(cluster[:, 0], cluster[:, 1], c=colors[i], marker='.')
    # Mark the cluster centre using the same two features.
    plt.plot(centers[i][0], centers[i][1], '*', markerfacecolor=colors[i], markeredgecolor='k', markersize=6)
plt.show()

 

 

 

posted @ 2020-03-28 15:47  masuo  阅读(1454)  评论(0)  编辑  收藏  举报