K-Means Clustering

Method 1: a from-scratch implementation. K-means alternates between assigning each sample to its nearest centroid and moving each centroid to the mean of the samples assigned to it, until the assignments stop changing.


import numpy as np
import matplotlib.pyplot as plt


def loadDataSet(filename):
    '''
    Load the data set from a tab-separated text file.

    Args:
        filename: path to the data file
    Returns:
        dataMat: list of samples (one list of floats per line)
    '''
    dataMat = []
    with open(filename, 'rb') as f:
        for line in f:
            # Lines are read as bytes, so decode to utf-8 before splitting on tabs
            eles = list(map(float, line.decode('utf-8').strip().split('\t')))
            dataMat.append(eles)
    return dataMat


def distEclud(vecA, vecB):
    '''
    Compute the Euclidean distance between two vectors.

    Args:
        vecA: vector A
        vecB: vector B
    Returns:
        the Euclidean distance
    '''
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))
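
# Quick sanity check with illustrative values: for vecA = (0, 0) and vecB = (3, 4)
# the Euclidean distance is sqrt(3**2 + 4**2) = 5, so
# distEclud(np.mat([0.0, 0.0]), np.mat([3.0, 4.0])) returns 5.0.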


def randCent(dataSet, k):
    '''
    Randomly pick k samples from the data set as the initial cluster centroids.

    Args:
        dataSet: data set
        k: number of clusters
    Returns:
        centroids: matrix of initial centroids
    '''
    m, _ = dataSet.shape
    # Pick k distinct samples at random to serve as the initial centroids
    centroids = dataSet.take(np.random.choice(m, k, replace=False), axis=0)
    return centroids


def kMeans(dataSet, k, maxIter=20):
    '''
    K-Means clustering.

    Args:
        dataSet: data set
        k: number of clusters
        maxIter: maximum number of iterations
    Returns:
        centroids: final cluster centroids
        clusterAssment: assignment result (cluster index, squared distance)
        init_centroids: copy of the initial centroids, kept for plotting
    '''
    # Randomly initialize the cluster centroids
    centroids = randCent(dataSet, k)
    init_centroids = centroids.copy()

    m, n = dataSet.shape

    # Assignment result: column 0 holds each sample's cluster index, column 1 the squared distance to its centroid
    clusterAssment = np.mat(np.zeros((m, 2)))

    # Flag indicating whether any cluster assignment changed in the last pass
    clusterChanged = True

    # Iterate until the assignments no longer change (or maxIter is reached)
    iterCount = 0
    while clusterChanged and iterCount < maxIter:
        iterCount += 1
        clusterChanged = False
        # Assign each sample to its nearest centroid
        for i in range(m):
            # Find the centroid closest to sample i
            minIndex = 0
            minDist = np.inf
            for j in range(k):
                dist = distEclud(dataSet[i, :], centroids[j, :])
                if dist < minDist:
                    minIndex = j
                    minDist = dist
            # If any sample's cluster assignment changes, mark the clustering as changed
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2

        # Update step: move each centroid to the mean of the points assigned to it
        for cent in range(k):
            # Boolean filtering selects the points assigned to this cluster
            # (.A converts a numpy matrix to an ndarray)
            ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if ptsInCluster.shape[0] > 0:
                # Move the centroid to the mean of its points
                centroids[cent, :] = np.mean(ptsInCluster, axis=0)
    return centroids, clusterAssment, init_centroids


dataMat = np.mat(loadDataSet('D:/python_project/HandlePythonExample/day2/data/testSet.txt'))
m, n = np.shape(dataMat)

set_k = 3
centroids, clusterAssment, init_centroids = kMeans(dataMat, set_k)

clusterCount = np.shape(centroids)[0]
# Only four marker/color styles are defined below, so setting set_k above 4 raises an IndexError
patterns = ['o', 'D', '^', 's']
colors = ['b', 'g', 'y', 'black']

fig = plt.figure()
title = 'kmeans with k={}'.format(set_k)
ax = fig.add_subplot(111, title=title)
for k in range(clusterCount):
    # Plot the final centroid of cluster k
    ax.scatter(centroids[k, 0], centroids[k, 1], color='r', marker='+', linewidth=20)
    # Plot the initial centroid of cluster k
    ax.scatter(init_centroids[k, 0], init_centroids[k, 1], color='purple', marker='*', linewidth=10)
    # Plot the samples assigned to cluster k
    ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A == k)[0]]
    ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0],
               color=colors[k], marker=patterns[k])
plt.show()
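
The second column of clusterAssment stores each sample's squared distance to its centroid, so summing that column gives the within-cluster sum of squared errors (SSE). Below is a minimal sketch of the "elbow method" built on the functions above; the names k_values, sse_list and test_k are just illustrative, and since the centroids are initialized randomly the resulting curve can vary between runs.

# Elbow-method sketch: run kMeans for several values of k and compare the resulting SSE.
k_values = range(1, 7)
sse_list = []
for test_k in k_values:
    _, assment, _ = kMeans(dataMat, test_k)
    # Column 1 of the assignment matrix holds squared distances, so its sum is the SSE
    sse_list.append(assment[:, 1].sum())

plt.plot(list(k_values), sse_list, 'o-')
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('elbow method sketch')
plt.show()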

Data set (testSet.txt)

1.658985	4.285136
-3.453687	3.424321
4.838138	-1.151539
-5.379713	-3.362104
0.972564	2.924086
-3.567919	1.531611
0.450614	-3.302219
-3.487105	-1.724432
2.668759	1.594842
-3.156485	3.191137
3.165506	-3.999838
-2.786837	-3.099354
4.208187	2.984927
-2.123337	2.943366
0.704199	-0.479481
-0.392370	-3.963704
2.831667	1.574018
-0.790153	3.343144
2.943496	-3.357075
-3.195883	-2.283926
2.336445	2.875106
-1.786345	2.554248
2.190101	-1.906020
-3.403367	-2.778288
1.778124	3.880832
-1.688346	2.230267
2.592976	-2.054368
-4.007257	-3.207066
2.257734	3.387564
-2.679011	0.785119
0.939512	-4.023563
-3.674424	-2.261084
2.046259	2.735279
-3.189470	1.780269
4.372646	-0.822248
-2.579316	-3.497576
1.889034	5.190400
-0.798747	2.185588
2.836520	-2.658556
-3.837877	-3.253815
2.096701	3.886007
-2.709034	2.923887
3.367037	-3.184789
-2.121479	-4.232586
2.329546	3.179764
-3.284816	3.273099
3.091414	-3.815232
-3.762093	-2.432191
3.542056	2.778832
-1.736822	4.241041
2.127073	-2.983680
-4.323818	-3.938116
3.792121	5.135768
-4.786473	3.358547
2.624081	-3.260715
-4.009299	-2.978115
2.493525	1.963710
-2.513661	2.642162
1.864375	-3.176309
-3.171184	-3.572452
2.894220	2.489128
-2.562539	2.884438
3.491078	-3.947487
-2.565729	-2.012114
3.332948	3.983102
-1.616805	3.573188
2.280615	-2.559444
-2.651229	-3.103198
2.321395	3.154987
-1.685703	2.939697
3.031012	-3.620252
-4.599622	-2.185829
4.196223	1.126677
-2.133863	3.093686
4.668892	-2.562705
-2.793241	-2.149706
2.884105	3.043438
-2.967647	2.848696
4.479332	-1.764772
-4.905566	-2.911070


Method 2: using a library (scikit-learn). The same clustering can be done with sklearn.cluster.KMeans, demonstrated here on synthetic blobs generated by make_blobs.


# Author: Phil Roth <mr.phil.roth@gmail.com>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

plt.figure(figsize=(12, 12))

n_samples = 1500
random_state = 200
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Cluster the generated blobs with k = 3 (make_blobs creates 3 centers by default)
y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("k-means with k=3 (scikit-learn)")

plt.show()
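
For a closer comparison with Method 1, the same estimator can be pointed at the testSet.txt data used earlier. The sketch below assumes loadDataSet() from Method 1 is available in the same session and that the file path used there still applies; numpy and KMeans are already imported above. The attributes cluster_centers_ and inertia_ are the library's counterparts of the hand-written centroids and of clusterAssment[:, 1].sum().

# Sketch: scikit-learn KMeans on the same data as Method 1.
# Assumes loadDataSet() from Method 1 has already been defined in this session.
data = np.array(loadDataSet('D:/python_project/HandlePythonExample/day2/data/testSet.txt'))

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(data)
print('final centroids:\n', km.cluster_centers_)  # analogous to centroids in Method 1
print('SSE (inertia_):', km.inertia_)             # analogous to clusterAssment[:, 1].sum()
print('first five labels:', km.labels_[:5])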
