K均值聚类

调用肘部法得到K值并送入KMeans；根据K值在中心点集合中选取前K-1个中心点。

1.聚类结果 调用肘部法返回K值送入Kmeans

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
 
 
# Compute Euclidean distances
def Distance(dataSet, centroids, k) -> np.ndarray:
    """Return the Euclidean distance from every sample to every centroid.

    Args:
        dataSet: 2-D sequence (or array) of numeric samples, one row per
            sample.
        centroids: Sequence (or array) of centroid coordinates, normally
            ``k`` rows with the same width as the samples.
        k: Number of centroids each sample is compared against.

    Returns:
        Float array of shape ``(len(dataSet), k)`` whose ``[i, j]`` entry is
        the distance between sample ``i`` and centroid ``j``.  An empty
        ``dataSet`` yields an empty ``(0, k)`` array so callers can still
        reduce along ``axis=1``.

    Raises:
        ValueError: If ``dataSet`` is not 2-D or the centroid rows cannot be
            broadcast against ``k`` copies of a sample.
    """
    # Work in float so squaring large integer coordinates cannot overflow.
    samples = np.asarray(dataSet, dtype=float)
    centers = np.asarray(centroids, dtype=float)
    if samples.size == 0:
        return np.empty((0, k), dtype=float)

    n_samples, n_features = samples.shape
    # Present every sample as k identical rows (a view, not a copy) so the
    # subtraction below follows the same broadcasting rules as tiling each
    # sample k times and subtracting the centroid table.
    replicated = np.broadcast_to(
        samples[:, np.newaxis, :], (n_samples, k, n_features)
    )
    diff = replicated - centers
    return np.sqrt(np.sum(diff ** 2, axis=2))
 
 
# Update the centroids
def Update_cen(dataSet, centroids, k):
    """Run one k-means update step.

    Every sample is assigned to its nearest centroid and each centroid is
    moved to the mean of the samples assigned to it.  A centroid that
    receives no samples (for example when two centroids coincide) keeps its
    previous position instead of disappearing from the result, so the
    returned arrays always have ``k`` rows.

    Args:
        dataSet: 2-D sequence of numeric samples, one row per sample.
        centroids: Current centroid coordinates, ``k`` rows.
        k: Number of clusters.

    Returns:
        Tuple ``(changed, newCentroids)`` of float arrays with one row per
        centroid: ``changed`` is ``newCentroids - centroids`` (all zeros
        once the centroids have stopped moving) and ``newCentroids`` holds
        the updated positions.
    """
    # Distance of every sample to every centroid, shape (n_samples, k).
    distance = Distance(dataSet, centroids, k)
    # Index of the nearest centroid for each sample.
    minIndex = np.argmin(distance, axis=1)

    # Group the samples by nearest centroid and average each group.  Only
    # clusters that actually received samples appear in the result index.
    groupMeans = pd.DataFrame(dataSet).groupby(minIndex).mean()

    oldCentroids = np.asarray(centroids, dtype=float)
    # Start from the old positions so empty clusters stay where they were,
    # then overwrite the rows of the clusters that have members.
    newCentroids = oldCentroids.copy()
    newCentroids[groupMeans.index.values] = groupMeans.values

    # How far each centroid moved in this step.
    changed = newCentroids - oldCentroids
    return changed, newCentroids
 
 
# k-means implementation
def kmeans(dataSet, k):
    """Cluster ``dataSet`` into ``k`` groups with plain k-means.

    Initial centroids are drawn at random from the *distinct* samples, so
    duplicated points in the data can never seed two identical centroids
    (which would leave one cluster empty from the start).

    Args:
        dataSet: Iterable of numeric samples, each a sequence of coordinates.
        k: Number of clusters to form.

    Returns:
        Tuple ``(centroids, cluster)`` where ``centroids`` is the list of
        final centroid coordinates in ascending order and ``cluster[i]`` is
        the list of samples whose nearest centroid is ``centroids[i]``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            distinct samples.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # (1) Pick k distinct samples as the initial centroids.  dict.fromkeys
    # removes duplicates while keeping first-seen order.
    distinct = list(dict.fromkeys(tuple(sample) for sample in dataSet))
    centroids = [list(point) for point in random.sample(distinct, k)]
    print("质心:", centroids)

    # (2) Repeat the update step until no centroid moves any more.
    changed, newCentroids = Update_cen(dataSet, centroids, k)
    while np.any(changed):
        changed, newCentroids = Update_cen(dataSet, newCentroids, k)
    centroids = sorted(newCentroids.tolist())

    # (3) Assign every sample to its nearest final centroid.  The distances
    # are recomputed against the sorted list so cluster[i] lines up with
    # centroids[i].
    dis = Distance(dataSet, centroids, k)
    minIndex = np.argmin(dis, axis=1)
    cluster = [[] for _ in range(k)]
    for sample, label in zip(dataSet, minIndex):
        cluster[label].append(sample)

    return centroids, cluster
 
# Find the best k
def train_cluster(train_vecs, model_name=None, start_k=2, end_k=15):
    """Choose a cluster count with the elbow method.

    A scikit-learn ``KMeans`` model is fitted for every k in
    ``range(start_k, end_k)`` and its SSE (``inertia_``) recorded.  The SSE
    curve is plotted, and the k at which the curve bends most sharply
    (largest second difference) is returned.

    Args:
        train_vecs: Samples to cluster, passed unchanged to ``KMeans.fit``.
        model_name: Unused; kept so existing callers keep working.
        start_k: Smallest k to try (inclusive).
        end_k: Upper bound of k (exclusive).

    Returns:
        The ``n_clusters`` value of the model at the elbow.  The first and
        last k tried cannot be returned, because a second difference needs
        a neighbour on each side.

    Raises:
        ValueError: If fewer than three k values would be tried, since the
            curve then has no interior point.
    """
    if end_k - start_k < 3:
        raise ValueError(
            "need at least three candidate k values (end_k - start_k >= 3) "
            "to locate an elbow"
        )

    print('training cluster')
    SSE = []     # SSE of the fitted model for each k
    models = []  # fitted model for each k, same order as SSE

    for n_clusters in range(start_k, end_k):
        kmeans_model = KMeans(n_clusters=n_clusters)
        kmeans_model.fit(train_vecs)
        SSE.append(kmeans_model.inertia_)
        print(SSE)
        models.append(kmeans_model)
        print(models)
    plt.figure(dpi=150)
    plt.plot(SSE, marker='o')
    plt.show()

    # First differences: how much the SSE drops from one k to the next.
    SSE_d1 = [(prev - cur) / 2 for prev, cur in zip(SSE, SSE[1:])]
    # Second differences: how much that drop shrinks.  Entry j is centred on
    # SSE[j + 1], and every interior point of the curve gets an entry.
    SSE_d2 = [(prev - cur) / 2 for prev, cur in zip(SSE_d1, SSE_d1[1:])]

    # +1 maps a second-difference index back to its position in ``models``.
    best_model = models[SSE_d2.index(max(SSE_d2)) + 1]

    print(best_model.n_clusters)
    return best_model.n_clusters
 
 
# Create the dataset
def createDataSet():
   """Return the hard-coded sample data as a list of two-element integer lists.

   The list contains repeated points (for example ``[42, 37]`` appears more
   than once), so consumers must not assume the samples are unique.
   """
   return [[9,37],[42,37],[11,50],[27,25],[27,6],[7,22],[44,9],[33,28],[43,37],[12,50],[14,48],[34,21],[1,26],[13,2],[24,11],[24,28],[48,16],[42,31],[34,20],[46,25],[31,47],[29,3],[39,50],[45,50],[48,46],[35,48],[48,35],[40,18],[31,50],[28,36],[11,48],[3,27],[30,18],[32,21],[24,37],[48,48],[48,47],[11,2],[15,48],[6,12],[45,27],[47,49],[49,43],[46,46],[13,19],[18,31],[44,47],[21,8],[36,48],[27,27],[9,9],[39,50],[50,11],[27,32],[50,14],[4,11],[40,14],[26,42],[23,48],[29,35],[6,7],[11,4],[43,3],[41,8],[24,15],[3,48],[9,6],[20,45],[27,40],[49,18],[8,39],[40,25],[28,28],[7,36],[7,38],[26,47],[49,12],[49,39],[27,26],[39,17],[21,19],[12,17],[17,50],[40,18],[31,21],[35,26],[42,11],[45,34],[6,32],[25,28],[14,21],[37,22],[50,22],[3,16],[7,3],[29,47],[25,6],[11,5],[45,15],[26,33],[37,39],[14,31],[50,48],[30,27],[31,4],[33,32],[34,17],[42,37],[22,5],[42,23],[19,38],[49,31],[46,43],[33,44],[2,12],[12,26],[27,18],[18,37],[13,26],[2,35],[7,12],[19,45],[15,45],[40,18],[43,29],[39,22],[11,44],[24,1],[13,7],[48,30],[8,42],[17,42],[4,42],[47,46],[41,41],[15,17],[9,13],[21,15],[27,22],[15,34],[21,8],[15,39],[2,27],[7,41],[26,4],[15,50],[21,7],[46,29],[31,14],[42,22],[49,25],[6,42],[4,35],[24,34],[43,5],[32,39],[3,11],[34,8],[20,10],[4,29],[36,32],[17,16],[17,26],[39,16],[16,28],[40,22],[7,7],[6,35],[36,9],[33,7],[28,17],[42,47],[9,50],[50,21],[22,12],[42,43],[35,4],[36,10],[27,6],[13,12],[41,43],[50,44],[24,34],[7,11],[49,16],[3,37],[10,48],[15,12],[36,48],[3,17],[31,16],[47,14],[6,25],[31,28],[1,46],[47,7],[41,27],[35,22],[20,46],[28,49]]
 
 
 
if __name__ == '__main__':
    # Demo: choose k with the elbow method, cluster the sample data with the
    # hand-written k-means, then print and plot the result.
    dataset = createDataSet()
    k = train_cluster(dataset, model_name=None, start_k=2, end_k=15)
    centroids, cluster = kmeans(dataset, k)
    print("共",k,"聚类")
    for i in range(k):
        print(i+1,'聚类质心为: %s' % centroids[i])
        print(i+1,'聚类集群为:%s' % cluster[i])

    color_all = ['c', 'g', 'r', 'm', 'y', 'k', 'b']

    # Centroids as red crosses.
    centroid_xy = np.array(centroids)
    plt.scatter(centroid_xy[:, 0], centroid_xy[:, 1], marker='x', color='red', label="质心")

    # One colour per cluster.  k may exceed the palette size, so the colours
    # wrap around instead of indexing past the end of the list.
    for i, members in enumerate(cluster):
        if not members:
            # An empty cluster has no coordinates to plot.
            continue
        member_xy = np.array(members)
        plt.scatter(member_xy[:, 0], member_xy[:, 1], marker='o',
                    color=color_all[i % len(color_all)], label="数据集")

    plt.show()
 
 
# NOTE: this redefines the createDataSet declared earlier in the file.  It
# comes after the ``if __name__ == '__main__':`` block, so that block has
# already run with the earlier definition by the time this one is bound.
def createDataSet():
   """Return an alternative hard-coded dataset as a list of two-element integer lists."""
   return [[18,29014],[18,77868],[19,58759],[19,37786],[20,8899],[20,46768],[20,774613],[21,488014],[21,769689],[22,359562],[22,352376],[24,111367],[24,152688],[24,109327],[24,106804],[24,137929],[25,71109],[25,113614],[25,119435],[26,154356],[26,44811],[27,166804],[28,98015],[29,21363],[29,125762],[29,93072],[29,153561],[30,39274],[30,168760],[30,24461],[30,191557],[30,61752],[32,124488],[32,170708],[33,130019],[33,99870],[33,331454],[34,190433],[34,177582],[34,300554],[34,138273],[34,314246],[35,276270],[35,37760],[35,296483],[35,377311],[35,6868],[35,9689],[36,992675],[37,830087],[37,937103],[37,40717],[38,953928],[39,13622507],[39,12048197],[40,1920635],[40,18916664],[40,3880638],[40,25428],[41,162939],[41,71320],[41,360108],[42,322575],[42,97904],[42,215271],[43,34127],[43,203836],[43,6359],[43,9408],[43,970418],[44,428948],[44,349269],[45,232780],[46,293502],[46,67138],[46,834070],[47,27633],[47,624311],[48,63117],[48,549391],[49,99226],[49,358856],[49,436160],[50,3984],[51,1799],[51,405163],[51,846203],[52,479420],[53,92465],[53,136059],[53,56145],[53,158809],[54,947448],[54,625963],[55,666463],[55,995614],[55,17004],[56,566007],[56,919124],[56,484365],[57,938506],[57,164692],[57,936188],[57,617512],[58,455966],[58,59662],[58,297124],[59,605331],[59,497737],[59,214608],[59,156849],[60,920477],[60,5894008],[60,2455622],[61,480931],[62,95628],[63,50546],[63,39582],[63,428934],[64,390934],[65,920156],[65,107724],[65,3509273],[65,7842456],[66,8909362],[66,5311804],[67,46879],[68,24905],[68,83869],[68,880292],[71,894481],[71,524879],[72,368116],[72,3865443],[72,405367],[72,8691331],[72,3060200],[73,9806966],[73,1874448],[73,5716398],[73,9629520],[73,30687],[73,59736],[74,55489],[74,318877],[74,859963],[75,633227],[75,725002],[75,3695087],[75,5982553],[75,3043470],[75,2390573],[75,45888],[76,40348],[76,88729],[77,229941],[77,70829],[77,836217],[77,717727],[78,4418],[78,75522],[79,89147],[79,49533],[80,21818],[80,90642],[80,772160],[80,781887],[81,262857],[81,768323
],[81,39781],[81,6117],[81,85550],[81,90980],[82,81310],[82,57097],[82,684607],[82,169619],[82,830320],[83,912973],[83,34105],[84,55130],[84,10447],[84,10405],[85,21196],[85,34589],[85,50499],[86,72650],[86,397602],[86,649289],[86,490891],[87,115778],[87,89786],[88,15375],[88,80349],[89,94850],[89,20309],[89,87940],[89,410890],[90,573323],[90,417892]]
posted @ 2023-09-13 17:31  孙犯困  阅读(62)  评论(0编辑  收藏  举报