K均值聚类
方法1
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(filename):
    """Read a delimited numeric text file into a list of float rows.

    Args:
        filename: Path of a UTF-8 text file. Each non-blank line holds the
            numeric fields of one sample, separated by tabs or any other
            run of whitespace.

    Returns:
        dataMat: List of samples; each sample is a list of floats.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # Text mode with an explicit encoding replaces the manual per-line
    # bytes -> str decoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            # split() with no argument accepts tabs as well as spaces and
            # yields an empty list for a blank line.
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            dataMat.append([float(field) for field in fields])
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector.
        vecB: Second vector.

    Returns:
        The square root of the summed squared element-wise differences.
    """
    delta = vecA - vecB
    # np.square is element-wise for both ndarray and matrix operands.
    squared_total = np.square(delta).sum()
    return np.sqrt(squared_total)
def randCent(dataSet, k):
    """Pick k distinct samples at random to serve as initial centroids.

    Args:
        dataSet: 2-D NumPy array or matrix with one sample per row.
        k: Number of clusters (centroids) to pick.

    Returns:
        centroids: The k selected rows, as returned by ``dataSet.take``.

    Raises:
        ValueError: If k is larger than the number of samples, since the
            rows are drawn without replacement.
    """
    m, _ = dataSet.shape
    # Draw row indices from the actual sample count (not a fixed 80) and
    # without replacement, so no two initial centroids coincide.
    indices = np.random.choice(m, k, replace=False)
    centroids = dataSet.take(indices, axis=0)
    return centroids
def kMeans(dataSet, k, maxIter=20):
    """Cluster the samples with the K-Means algorithm.

    Args:
        dataSet: 2-D NumPy array or matrix with one sample per row.
        k: Number of clusters.
        maxIter: Upper bound on the number of assign/update rounds.

    Returns:
        centroids: Final cluster centres, one per row.
        clusterAssment: Matrix with one row per sample; column 0 is the
            index of the cluster the sample belongs to, column 1 is the
            squared distance from the sample to that cluster's centre.
        init_centroids: Copy of the randomly chosen starting centres.
    """
    # Random initial centres; keep a copy so the caller can inspect them.
    centroids = randCent(dataSet, k)
    init_centroids = centroids.copy()
    m = dataSet.shape[0]
    # np.asmatrix instead of np.mat: the np.mat alias is gone in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible cluster id so the first pass always counts
    # as a change, even when every sample lands in cluster 0.
    clusterAssment[:, 0] = -1
    clusterChanged = True
    iterCount = 0
    # Repeat until no sample switches cluster or the round limit is hit.
    while clusterChanged and iterCount < maxIter:
        iterCount += 1
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centre.
        for i in range(m):
            minIndex = 0
            minDist = np.inf
            for j in range(k):
                dist = distEclud(dataSet[i, :], centroids[j, :])
                if dist < minDist:
                    minIndex = j
                    minDist = dist
            # Any sample switching cluster means another round is needed.
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: move each centre to the mean of its members.
        for cent in range(k):
            # .A converts the matrix column to an ndarray for the mask.
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInCluster = dataSet[members]
            # A cluster with no members keeps its previous centre.
            if ptsInCluster.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInCluster, axis=0)
    return centroids, clusterAssment, init_centroids
# Driver for method 1: load the samples, run kMeans and plot the result.
# np.asmatrix instead of np.mat: the np.mat alias is gone in NumPy 2.0.
dataMat = np.asmatrix(loadDataSet('D:/python_project/HandlePythonExample/day2/data/testSet.txt'))
set_k = 3
centroids, clusterAssment, init_centroids = kMeans(dataMat, set_k)
clusterCount = np.shape(centroids)[0]
# Only four marker/colour styles are defined; they are reused cyclically
# (via the modulo below) when set_k is larger than four.
patterns = ['o', 'D', '^', 's']
colors = ['b', 'g', 'y', 'black']
fig = plt.figure()
title = 'kmeans with k={}'.format(set_k)
ax = fig.add_subplot(111, title=title)
for k in range(clusterCount):
    # Final cluster centre.
    ax.scatter(centroids[k, 0], centroids[k, 1], color='r', marker='+', linewidth=20)
    # Initial (randomly chosen) cluster centre.
    ax.scatter(init_centroids[k, 0], init_centroids[k, 1], color='purple', marker='*', linewidth=10)
    # Samples assigned to this cluster, drawn once with a single scatter call.
    ptsInCluster = dataMat[np.nonzero(clusterAssment[:, 0].A == k)[0]]
    ax.scatter(ptsInCluster[:, 0].flatten().A[0], ptsInCluster[:, 1].flatten().A[0],
               color=colors[k % len(colors)],
               marker=patterns[k % len(patterns)])
plt.show()
数据集
1.658985 4.285136
-3.453687 3.424321
4.838138 -1.151539
-5.379713 -3.362104
0.972564 2.924086
-3.567919 1.531611
0.450614 -3.302219
-3.487105 -1.724432
2.668759 1.594842
-3.156485 3.191137
3.165506 -3.999838
-2.786837 -3.099354
4.208187 2.984927
-2.123337 2.943366
0.704199 -0.479481
-0.392370 -3.963704
2.831667 1.574018
-0.790153 3.343144
2.943496 -3.357075
-3.195883 -2.283926
2.336445 2.875106
-1.786345 2.554248
2.190101 -1.906020
-3.403367 -2.778288
1.778124 3.880832
-1.688346 2.230267
2.592976 -2.054368
-4.007257 -3.207066
2.257734 3.387564
-2.679011 0.785119
0.939512 -4.023563
-3.674424 -2.261084
2.046259 2.735279
-3.189470 1.780269
4.372646 -0.822248
-2.579316 -3.497576
1.889034 5.190400
-0.798747 2.185588
2.836520 -2.658556
-3.837877 -3.253815
2.096701 3.886007
-2.709034 2.923887
3.367037 -3.184789
-2.121479 -4.232586
2.329546 3.179764
-3.284816 3.273099
3.091414 -3.815232
-3.762093 -2.432191
3.542056 2.778832
-1.736822 4.241041
2.127073 -2.983680
-4.323818 -3.938116
3.792121 5.135768
-4.786473 3.358547
2.624081 -3.260715
-4.009299 -2.978115
2.493525 1.963710
-2.513661 2.642162
1.864375 -3.176309
-3.171184 -3.572452
2.894220 2.489128
-2.562539 2.884438
3.491078 -3.947487
-2.565729 -2.012114
3.332948 3.983102
-1.616805 3.573188
2.280615 -2.559444
-2.651229 -3.103198
2.321395 3.154987
-1.685703 2.939697
3.031012 -3.620252
-4.599622 -2.185829
4.196223 1.126677
-2.133863 3.093686
4.668892 -2.562705
-2.793241 -2.149706
2.884105 3.043438
-2.967647 2.848696
4.479332 -1.764772
-4.905566 -2.911070
方法2:调用 scikit-learn 库实现
# Method 2: cluster synthetic blobs with scikit-learn's KMeans.
# NOTE: prints the module docstring; the output is "None" when the file
# that contains this script has no docstring.
print(__doc__)
# Author: Phil Roth <mr.phil.roth@gmail.com>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
plt.figure(figsize=(12, 12))
n_samples = 1500
random_state = 200
# make_blobs generates three centres by default, so n_clusters=3 below
# matches the number of generated blobs.
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
# n_init is passed explicitly: its default changed in scikit-learn 1.4, and
# leaving it implicit emits a FutureWarning on 1.2-1.3.
y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(X)
# Colour every sample by its predicted cluster label.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("kmeans")
plt.show()
posted on 2019-07-23 09:20 Indian_Mysore 阅读(214) 评论(0) 编辑 收藏 举报