# Kmeans_python

from numpy import *
import matplotlib.pyplot as plt
def loadData(filename):
    """Read a tab-separated text file of numbers into a list of float rows.

    Args:
        filename: Path of the file to read. Each non-blank line holds one
            record whose fields are separated by tab characters.

    Returns:
        A list with one entry per non-blank line; every entry is a list of
        floats in field order.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    data = []
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as infile:
        for line in infile:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no record.
                continue
            # Build a real list: on Python 3 ``map`` returns a lazy iterator,
            # which cannot be indexed or turned into a numeric matrix.
            data.append([float(field) for field in stripped.split("\t")])
    return data
# Load the tab-separated sample file into ``data`` as a list of rows.
# NOTE(review): no later line visible in this file reads this global (the file
# is loaded again when ``datamat`` is built) -- confirm before relying on it.
data=loadData("testSet.txt")
def randcent(data, k):
    """Create ``k`` random centroids inside the bounding box of ``data``.

    Args:
        data: 2-D NumPy matrix (or array) with one sample per row and one
            feature per column.
        k: Number of centroids to generate.

    Returns:
        A ``k`` x ``n`` NumPy matrix (``n`` = number of columns of ``data``).
        Column ``i`` of the result is drawn uniformly between the minimum and
        maximum of column ``i`` of ``data``.
    """
    n = shape(data)[1]
    # ``asmatrix`` instead of ``mat``: the ``mat`` alias was removed in NumPy 2.0.
    centdata = asmatrix(zeros((k, n)))
    for i in range(n):
        column = data[:, i]
        # Use the array methods rather than bare ``min``/``max`` so the result
        # is a plain scalar regardless of which ``min``/``max`` the star-import
        # leaves in scope.
        mind = float(column.min())
        max2min = float(column.max()) - mind
        centdata[:, i] = mind + max2min * random.rand(k, 1)
    return centdata

def calcdist(A, B):
    """Return the Euclidean distance between the vectors ``A`` and ``B``.

    The element-wise difference is squared, summed over all elements and
    square-rooted.
    """
    delta = A - B
    squared = power(delta, 2)
    total = sum(squared)
    return sqrt(total)
def kmeans(data, k):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Starting from the centroids produced by ``randcent``, every sample is
    assigned to its nearest centroid (distance measured by ``calcdist``) and
    each centroid is then moved to the mean of its members. The two steps
    repeat until a full pass leaves every assignment unchanged.

    Args:
        data: 2-D NumPy matrix with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(cent, mark)``. ``cent`` is the ``k`` x ``n`` matrix of
        centroids. ``mark`` is an ``m`` x 2 matrix whose column 0 holds the
        cluster index of each sample and column 1 the distance from the
        sample to that centroid.
    """
    m = shape(data)[0]
    # ``asmatrix`` instead of ``mat``: the ``mat`` alias was removed in NumPy 2.0.
    mark = asmatrix(zeros((m, 2)))
    # Start every sample as "unassigned". With the previous initial value of 0
    # a first pass that put every sample in cluster 0 looked like "no change"
    # and the loop ended before the centroids were ever recomputed.
    mark[:, 0] = -1
    cent = randcent(data, k)
    centerchanged = True
    while centerchanged:
        centerchanged = False
        for i in range(m):
            index = -1
            # Infinity rather than a fixed 10000 so that samples farther than
            # that from every centroid are still assigned.
            mindata = float("inf")
            for j in range(k):
                temp = calcdist(cent[j, :], data[i, :])
                if temp < mindata:
                    mindata = temp
                    index = j
            if mark[i, 0] != index:
                centerchanged = True
            mark[i, :] = index, mindata
        for t in range(k):
            ar = data[nonzero(mark[:, 0].A == t)[0]]
            # Keep the previous centroid when a cluster has no members; the
            # mean of an empty selection is NaN and would never recover.
            if len(ar) > 0:
                cent[t, :] = mean(ar, 0)
    return cent, mark
# Cluster the sample data into four groups, then draw every cluster's points,
# the centroids, and for each cluster a circle centred on its centroid whose
# radius is the largest member-to-centroid distance.
# ``asmatrix`` instead of ``mat``: the ``mat`` alias was removed in NumPy 2.0.
datamat = asmatrix(loadData("testSet.txt"))
centdata, cluster = kmeans(datamat, 4)
centarr = centdata.A
clusterarr = cluster.A  # column 0: cluster index, column 1: distance to centroid
pointarr = datamat.A
theta = arange(0, 2 * pi, 0.01)

# Scatter styling per cluster index; the last cluster uses matplotlib defaults.
cluster_styles = [
    {"c": "blue", "s": 10, "marker": "s"},
    {"c": "green", "marker": "*"},
    {"c": "gray", "marker": "h"},
    {},
]

fig = plt.figure()
ax = fig.add_subplot(111)
for label, style in enumerate(cluster_styles):
    members = pointarr[clusterarr[:, 0] == label]
    ax.scatter(members[:, 0], members[:, 1], **style)
ax.scatter(centarr[:, 0], centarr[:, 1], c='red')
for label in range(len(cluster_styles)):
    distances = clusterarr[clusterarr[:, 0] == label, 1]
    if distances.size == 0:
        # A cluster without members has no radius; calling max() on it
        # raised ValueError before.
        continue
    radius = distances.max()
    ax.plot(centarr[label][0] + radius * cos(theta),
            centarr[label][1] + radius * sin(theta))
plt.show()

  

# posted @ 2017-06-13 22:53  semen  阅读(284)  评论(1)  编辑  收藏  举报