# K-means clustering algorithm
"""K-means clustering demo.

Loads tab-separated numeric points from a text file, clusters them with
Lloyd's k-means algorithm and plots the clusters together with their
centroids.
"""
import time

import numpy as np

# Colours cycled through when plotting clusters; the centroids use magenta.
_CLUSTER_COLORS = ['r', 'g', 'c', 'b']


def loadDataSet(fileName):
    """Read a tab-separated file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file; every non-blank line holds one
            sample whose fields are separated by tab characters.

    Returns:
        A list with one ``list[float]`` per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even on errors.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would make float('') fail.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (any array-like, including ``np.matrix`` rows).
        vecB: Second vector, broadcast-compatible with ``vecA``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(np.power(diff, 2)))


def randCent(dataSet, k):
    """Draw ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: Array-like of shape ``(m, n)`` holding ``m`` samples.
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` ``np.matrix``; column ``j`` is drawn uniformly between
        the minimum and maximum of column ``j`` of ``dataSet``.
    """
    data = np.asarray(dataSet, dtype=float)
    n = data.shape[1]
    mins = data.min(axis=0)
    ranges = data.max(axis=0) - mins
    # rand(n, k).T yields k consecutive draws per feature column, i.e. the
    # same random sequence as filling the centroids one column at a time.
    unit = np.random.rand(n, k).T
    return np.asmatrix(mins + ranges * unit)


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster ``dataSet`` into ``k`` groups with Lloyd's algorithm.

    Args:
        dataSet: Array-like of shape ``(m, n)`` holding ``m`` samples.
        k: Number of clusters.
        distMeas: Callable ``(centroid, sample) -> distance``.
        createCent: Callable ``(data, k) -> (k, n)`` initial centroids.

    Returns:
        A tuple ``(centroids, clusterAssment)`` of ``np.matrix`` objects:
        ``centroids`` has shape ``(k, n)``; ``clusterAssment`` has shape
        ``(m, 2)`` where column 0 is the index of the assigned cluster and
        column 1 the squared distance to that cluster's centroid.
    """
    data = np.asarray(dataSet, dtype=float)
    m = data.shape[0]

    assignments = np.zeros((m, 2))
    # Start from an impossible cluster index so that the very first pass is
    # always registered as a change; starting at 0 would end the loop early
    # whenever every sample happens to fall into cluster 0.
    assignments[:, 0] = -1
    centroids = np.array(createCent(data, k), dtype=float)

    clusterChanged = True
    while clusterChanged:
        clusterChanged = False

        # Assignment step: attach each sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], data[i, :])
                if distJI < minDist:
                    minDist, minIndex = distJI, j
            if assignments[i, 0] != minIndex:
                clusterChanged = True
            assignments[i, :] = minIndex, minDist ** 2

        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            members = data[assignments[:, 0] == cent]
            if members.shape[0] > 0:
                centroids[cent, :] = members.mean(axis=0)
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would otherwise inject NaN into the result.

    return np.asmatrix(centroids), np.asmatrix(assignments)


def showImage(dataSet, center, label):
    """Scatter-plot the first two features of each cluster and the centroids.

    Args:
        dataSet: Array-like of shape ``(m, n)`` with ``n >= 2``.
        center: ``(k, n)`` centroids (array or ``np.matrix``).
        label: Cluster index of every sample; a flat sequence or an
            ``(m, 1)`` column.
    """
    # Imported here so the clustering helpers stay usable on machines
    # without a plotting backend installed.
    import matplotlib.pyplot as plt

    points = np.asarray(dataSet, dtype=float)
    centers = np.asarray(center, dtype=float)
    labels = np.asarray(label).ravel()

    for idx in range(centers.shape[0]):
        mask = labels == idx
        color = _CLUSTER_COLORS[idx % len(_CLUSTER_COLORS)]
        plt.scatter(points[mask, 0], points[mask, 1], s=40, c=color)
    plt.scatter(centers[:, 0], centers[:, 1], c='m', marker='p', s=200)
    plt.show()


def main():
    """Cluster the samples in ``testSet.txt`` into four groups and plot them."""
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    startTime = time.perf_counter()
    dataSet = np.array(loadDataSet("testSet.txt"))
    print(dataSet)
    center, cluster = kMeans(dataSet, 4)
    print(center)
    endTime = time.perf_counter()
    print(endTime - startTime)
    showImage(dataSet, center, cluster[:, 0])


if __name__ == '__main__':
    main()