# K-Means clustering in practice (Python)
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# Load the iris measurements into a DataFrame and append an empty "class"
# column that will later hold each sample's cluster label.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_class = pd.DataFrame("", index=iris_df.index, columns=["class"])
iris_df = pd.concat([iris_df, iris_class], axis=1)

k = 3  # number of clusters to form
iter_n_max = 500  # maximum number of iterations

# Cluster labels are "class1" .. "class<k>"; they index the center table and
# name the columns of the distance table.
class_labels = ["class" + str(i + 1) for i in range(k)]

# Seed the centers deterministically with the first k samples (the four
# feature columns only). A random draw of k distinct row positions, e.g.
# np.random.RandomState(100).choice(len(iris_df), size=k, replace=False),
# could be used instead of the first k rows.
# Copy so the center table owns its data instead of being a slice of iris_df.
center_data = iris_df.iloc[:k, :4].copy()
center_data.index = class_labels
center_data_second = center_data  # centers used by the current iteration
center_data_last = center_data  # centers produced by the latest update

iter_n = 0  # number of iterations completed so far

# Distance from every sample (rows) to every center (columns). Initialised
# with float zeros so the float distances written later match the column
# dtype instead of forcing an int -> float upcast on assignment.
dist_df = pd.DataFrame(0.0, index=iris_df.index, columns=class_labels)
# K-Means main loop: alternate between assigning every sample to its nearest
# center and recomputing each center as the mean of its assigned samples,
# until the centers stop changing or iter_n_max iterations have run.
labels = list(dist_df.columns)  # "class1" .. "class<k>"
features = iris_df.iloc[:, :4]  # the four feature columns; never modified below

while True:
    print("第%d次迭代" % (iter_n))
    center_data_second = center_data_last

    # Distance step: Euclidean distance from all samples to one center at a
    # time. Centers are looked up by label rather than by position so the
    # pairing cannot drift if the center rows are ordered differently from
    # the distance columns.
    for label in labels:
        offsets = features - center_data_second.loc[label]
        dist_df[label] = (offsets ** 2).sum(axis=1) ** 0.5

    # Assignment step: each sample takes the label of its nearest center.
    iris_df["class"] = dist_df.idxmin(axis=1)

    # Update step: new center = mean of the samples assigned to it.
    # groupby() sorts labels as strings and omits clusters with no members,
    # so restore the label order and let an empty cluster keep its previous
    # center instead of disappearing from the table.
    center_data_last = (
        iris_df.groupby(by=["class"])
        .mean()
        .reindex(labels)
        .fillna(center_data_second)
    )

    iter_n = iter_n + 1
    print(dist_df)

    # Converged when no center coordinate changed in this iteration.
    if np.array_equal(center_data_last.to_numpy(), center_data_second.to_numpy()):
        print("center_data_last==center_data_second")
        break
    # Safety stop in case the centers never settle.
    if iter_n == iter_n_max:
        print("iter_n == iter_n_max")
        break
# Convert the "class<n>" labels to integer cluster ids. Strip the "class"
# prefix rather than reading only the last character, so multi-digit ids
# (k >= 10) are parsed correctly.
iris_df["class"] = iris_df["class"].apply(lambda x: int(x[len("class"):]))

# For each 50-row slice of the data, print the share of rows that fall into
# the slice's most common cluster.
# NOTE(review): the slices presumably correspond to the three iris species as
# ordered by load_iris -- confirm. .loc slicing is inclusive on both ends.
for first_row, last_row in ((0, 49), (50, 99), (100, 149)):
    cluster_counts = iris_df.loc[first_row:last_row, "class"].value_counts()
    print("%f" % (cluster_counts.max() / cluster_counts.sum()))