Python实现K-means

import pandas as pd
import numpy as np
# Load the raw dataset; only the first four columns are used as features.
data = pd.read_csv(r'data.csv')
# Take an explicit copy: the clustering code below adds a label column to
# ``train``, and doing that on a bare slice of ``data`` triggers pandas'
# SettingWithCopyWarning (and leaves it unclear which frame is modified).
train = data.iloc[:, 0:4].copy()

# Squared Euclidean distance between one sample and every centroid.
# If the features are on different scales, normalize the data beforehand.
def nearest(traini, center):
    """Return the position of the centroid in ``center`` closest to ``traini``.

    Args:
        traini: One sample, as a pandas Series whose index labels match the
            column labels of ``center``.
        center: DataFrame holding one centroid per row.

    Returns:
        int: Zero-based row position (not index label) of the centroid with
        the smallest squared Euclidean distance. Ties go to the first row.

    Raises:
        ValueError: If ``center`` has no rows (argmin of an empty array).
    """
    # ``DataFrame.ix`` no longer exists in current pandas, so the distances
    # are computed for all centroids at once instead of row by row.
    # ``axis='columns'`` aligns the sample's labels with the centroid columns.
    diff = center.sub(traini, axis='columns')
    # skipna=False lets a NaN caused by mismatched labels propagate into the
    # distance, exactly as a plain dot product would.
    squared = (diff ** 2).sum(axis=1, skipna=False)
    # Work on the raw array so the result is positional regardless of how
    # ``center`` happens to be indexed.
    return int(squared.to_numpy().argmin())

def zhidian(x):
    """Return the centroid of ``x``: its ``sum()`` divided by its length.

    For a DataFrame this is the per-column sum divided by the number of rows.
    """
    total = x.sum()
    count = len(x)
    return total / count

# Convergence measure
def shoulian(train, center):
    """Return the total squared distance of every sample to its own centroid.

    Args:
        train: DataFrame whose first four columns are the features and which
            has a ``'near'`` column holding each sample's cluster label.
        center: DataFrame of centroids indexed by cluster label; its first
            four columns are the centroid coordinates.

    Returns:
        The sum over all samples of the squared Euclidean distance between
        the sample and the centroid named by its ``'near'`` label (``0`` for
        an empty ``train``).

    Raises:
        KeyError: If a label in ``train['near']`` is not in ``center``'s index.
    """
    # ``DataFrame.ix`` no longer exists in current pandas: sample rows are
    # taken by position (``iloc``), centroids by label (``loc``).
    features = train.iloc[:, 0:4]
    julihe = 0
    for pos, made in enumerate(train['near']):
        # Only the first four centroid columns are coordinates; ``center``
        # may carry an extra ``'near'`` column after them.
        dist = features.iloc[pos] - center.loc[made].iloc[0:4]
        julihe = julihe + dist.dot(dist)
    return julihe

def kmeans(train, center, julihe):
    """Iterate k-means until the total squared distance stops decreasing.

    Each iteration assigns every sample to its nearest centroid, recomputes
    the centroids from that assignment and measures the total squared
    distance to the new centroids. If that total is strictly smaller than
    the previous one, the new centroids are adopted and the loop continues;
    otherwise the previous centroids and the current assignment are returned.

    Args:
        train: DataFrame of feature columns, one sample per row. It is not
            modified; the labelled result is a copy.
        center: DataFrame of initial centroids, one per row, with the same
            columns as ``train``.
        julihe: Total squared distance to beat in the first iteration.

    Returns:
        tuple: ``(labelled, center)`` where ``labelled`` is a copy of
        ``train`` with an added integer ``'near'`` column and ``center`` is
        the last accepted set of centroids. ``'near'`` holds the zero-based
        row position of each sample's nearest centroid in ``center``.
    """
    # A loop replaces the original self-recursion so that a long run cannot
    # hit Python's recursion limit.
    while True:
        print('return')
        # Nearest-centroid position for every sample, in row order.
        near = np.array(
            [nearest(row, center) for _, row in train.iterrows()], dtype=int
        )
        # Group by the label array itself rather than by a column, so the
        # new centroids contain feature columns only.
        new_center = train.groupby(near).apply(zhidian)
        # Attach the labels to a copy so the caller's frame is left untouched.
        labelled = train.copy()
        labelled['near'] = near
        newjulihe = shoulian(labelled, new_center)
        # ``not <`` (rather than ``>=``) also stops when the total is NaN.
        if not newjulihe < julihe:
            print(center)
            return labelled, center
        # Renumber to 0..k-1 so positions and labels stay identical even if
        # a centroid received no samples and dropped out of the grouping.
        center = new_center.reset_index(drop=True)
        julihe = newjulihe

def sdasd(train, julihe, k=4):
    """Seed k-means with the first ``k`` rows of ``train`` and run it.

    Args:
        train: DataFrame of feature columns, one sample per row.
        julihe: Initial total squared distance passed through to ``kmeans``.
        k: Number of leading rows to use as initial centroids. The default
            of 4 matches the original inclusive label slice ``0:3`` on a
            default integer index.

    Returns:
        tuple: ``(train, center)`` exactly as returned by ``kmeans``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    # ``DataFrame.ix`` no longer exists in current pandas; a positional slice
    # selects the seed rows whatever the index looks like. ``reset_index``
    # returns a new frame, so the seeds do not alias ``train`` and are
    # labelled 0..k-1.
    center = train.iloc[:k].reset_index(drop=True)
    train, center = kmeans(train, center, julihe)
    return train, center

# Start from an infinite "previous" total so the first iteration is always
# accepted. A fixed bound such as 100000 would make the clustering stop
# immediately on data whose initial total squared distance is larger.
julihe = float('inf')
train, center = sdasd(train, julihe)

 

posted @ 2017-07-31 11:11  草莓干123456  阅读(954)  评论(0)  编辑  收藏  举报