k-means算法的Python实现
1 #coding=utf-8 2 import codecs 3 import numpy 4 from numpy import * 5 import pylab 6 7 def loadDataSet(fileName): 8 dataMat = [] 9 fr = codecs.open(fileName) 10 for line in fr.readlines(): 11 curLine = line.strip().split('\t') 12 fltLine = map(float, curLine) 13 dataMat.append(fltLine) 14 return dataMat 15 16 def distMeasure(vecA, vecB): 17 #print vecA 18 dist = sqrt(sum(power(vecA - vecB, 2))) 19 return dist 20 21 def kMeansInitCentroids(X, K): 22 """ 23 KMEANSINITCENTROIDS This function initializes K centroids that are to be 24 used in K-Means on the dataset X 25 centroids = KMEANSINITCENTROIDS(X, K) returns K initial centroids to be 26 used with the K-Means on the dataset X. 27 """ 28 n = shape(X)[1] 29 centroids = mat(zeros((K,n))) 30 for j in range(n): 31 #print X[:,j] 32 minJ = min(X[:,j]) 33 rangeJ = float(max(array(X)[:,j]) - minJ) 34 centroids[:,j] = minJ + rangeJ * random.rand(K,1) 35 return centroids 36 37 def findClosestCentroids(X, centroids): 38 """ 39 FINDCLOSESTCENTROIDS computes the centroid memberships for every example 40 idx = FINDCLOSESTCENTROIDS (X, centroids) returns the closest centroids 41 in idx for a dataset X where each row is a single example. idx = m x 1 42 vector of centroid assignments (i.e. each entry in range [1..K]) 43 """ 44 # 数据总量 45 m = shape(X)[0] 46 K = shape(centroids)[0] 47 clusterAssment = mat(zeros((m,2)))#create mat to assign data points 48 #to a centroid, also holds SE of each point 49 #centroids = createCent(dataSet, k) 50 clusterChanged = True 51 while clusterChanged: 52 clusterChanged = False 53 for i in range(m):#for each data point assign it to the closest centroid 54 minDist = inf; minIndex = -1 55 # k个中间数据(质心)都与数据i进行欧氏比较,选择距离最近的第minIndex类 56 for j in range(K): 57 distJI = distMeasure(centroids[j,:],X[i,:]) 58 if distJI < minDist: 59 minDist = distJI; minIndex = j 60 if clusterAssment[i,0] != minIndex: clusterChanged = True 61 clusterAssment[i,:] = minIndex,minDist**2 62 return clusterAssment 63 64 def computeCentroids(X, clusterAssment, K): 65 """ 66 COMPUTECENTROIDS returs the new centroids by computing the means of the 67 data points assigned to each centroid. 68 centroids = COMPUTECENTROIDS(X, idx, K) returns the new centroids by 69 computing the means of the data points assigned to each centroid. It is 70 given a dataset X where each row is a single data point, a vector 71 idx of centroid assignments (i.e. each entry in range [1..K]) for each 72 example, and K, the number of centroids. You should return a matrix 73 centroids, where each row of centroids is the mean of the data points 74 assigned to it. 75 """ 76 n = shape(X)[1] 77 centroids = mat(zeros((K,n))) 78 for centroid in range(K):#recalculate centroids 79 # nonzero会产生两个array,第一个非零的为序号列表 80 ptsInClust = X[nonzero(clusterAssment[:,0].A==centroid)[0]]#get all the point in this cluster 81 #print 'ererer:',ptsInClust,'dfdf' 82 centroids[centroid,:] = mean(ptsInClust, axis=0) #assign centroid to mean 83 return centroids 84 85 def show(dataSet, k, centroids, clusterAssment): 86 from matplotlib import pyplot as plt 87 numSamples, dim = dataSet.shape 88 mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr'] 89 print type(dataSet) 90 for i in xrange(numSamples): 91 markIndex = int(clusterAssment[i, 0]) 92 plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex]) 93 mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb'] 94 for i in range(k): 95 plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize = 12) 96 plt.show() 97 98 def runkMeans(X, initial_centroids,max_iters, plot_progress): 99 """ 100 RUNKMEANS runs the K-Means algorithm on data matrix X, where each row of X 101 is a single example 102 [centroids, idx] = RUNKMEANS(X, initial_centroids, max_iters, ... 103 plot_progress) runs the K-Means algorithm on data matrix X, where each 104 row of X is a single example. It uses initial_centroids used as the 105 initial centroids. max_iters specifies the total number of interactions 106 of K-Means to execute. plot_progress is a true/false flag that 107 indicates if the function should also plot its progress as the 108 learning happens. This is set to false by default. runkMeans returns 109 centroids, a Kxn matrix of the computed centroids and idx, a m x 1 110 vector of centroid assignments (i.e. each entry in range [1..K]). 111 """ 112 (m,n) = shape(X) 113 K = shape(initial_centroids)[0] 114 centroids = initial_centroids 115 clusterAssment = zeros((m,2)) 116 117 #Run K-Means 118 for i in range(max_iters): 119 clusterAssment = findClosestCentroids(X, centroids) 120 centroids = computeCentroids(X, clusterAssment, K); 121 122 return centroids, clusterAssment 123 124 def main(): 125 K =5 126 max_iters = 10 127 dataSet = loadDataSet('E://PythonSpace//TextClustering//data//test2.txt') 128 X = array(dataSet) 129 X = (X - mean(X)) / std(X) 130 131 initial_centroids = kMeansInitCentroids(X, K) 132 myCentroids, clusterAssment = runkMeans(X, initial_centroids, max_iters,False); 133 print "-------------------------------------" 134 show(X, K, myCentroids, clusterAssment) 135 136 main()
参考了Andrew Ng的Machine Learning Assignment(https://github.com/rieder91/MachineLearning/blob/master/Exercise%207/ex7/runkMeans.m)
以及博文http://www.cnblogs.com/MrLJC/p/4127553.html
运行结果: