前言:聚类是非监督学习的主要任务之一,根据原理可分为:基于质心、基于密度、基于连通性、基于概率以及基于神经网络等多种类型。
本文汇总了常用聚类算法及其评价指标,方便快速查询使用。(本文使用波士顿房价数据集;该数据集原本用于回归任务,这里将房价 MEDV 按阈值 17.025、21.2、25 划分为 4 类,作为评价聚类结果的参考标签。)
以下为实验代码,运行后将打印各聚类算法的评价结果:
from time import time

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

# 1. Load the data.
# NOTE(review): load_boston was removed from recent scikit-learn releases;
# confirm the pinned scikit-learn version still ships it.
boston = sklearn.datasets.load_boston()
x, y = boston.data, boston.target
y = y.reshape(len(y), 1)
# The clustering input deliberately contains the target column as well.
data = np.hstack([x, y])

# 2. Standardise the features (y itself is left unscaled for binning below).
x = scale(x)
data = scale(data)

# 3. Inspect the data.
name_data = boston.feature_names

df_x = pd.DataFrame(x, columns=name_data)
# Keep MEDV as float here: truncating to int before binning moves values
# such as 21.6 across the 21.2 threshold into the wrong class.
df_y = pd.DataFrame(y, columns=['MEDV'])
df = pd.concat([df_x, df_y], axis=1)

# Original author's note: 506 rows, no missing values, float64 columns.
# To explore: print(df.head()); print(df.info()); print(df['MEDV'].describe())

# 4. Build the reference classes used to score the clusterings.
# Four classes: <=17.025, (17.025, 21.2], (21.2, 25], >25.
n_clusters = 4  # number of clusters requested from the estimators
PRICE_BIN_EDGES = [17.025, 21.2, 25]
# right=True gives edges[i-1] < value <= edges[i], matching the intervals
# above. Binning the raw float target in one step avoids both the former
# "< 20" first threshold and in-place relabelling of the column being tested.
labels = np.digitize(y.ravel(), PRICE_BIN_EDGES, right=True)
# As before, df_y ends up holding the class ids rather than the raw prices.
df_y['MEDV'] = labels


def bench_k_means(estimator, name, data, method, true_labels=None):
    """Fit a clustering estimator and print one row of evaluation metrics.

    Args:
        estimator: Unfitted clustering estimator exposing ``fit`` and, after
            fitting, ``labels_``.
        name: Text printed in the second column (clustering principle).
        data: Sample matrix passed to ``estimator.fit`` and to the
            silhouette computation.
        method: Text printed in the first column (algorithm name).
        true_labels: Reference class labels to score against. Defaults to
            the module-level ``labels``.

    The printed columns are: method, name, fit time in seconds, homogeneity,
    completeness, V-measure, adjusted Rand index, adjusted mutual
    information, and silhouette coefficient (``nan`` when it is undefined).
    """
    if true_labels is None:
        true_labels = labels

    t0 = time()
    estimator.fit(data)
    elapsed = time() - t0  # time the fit only, not the metric computation

    predicted = estimator.labels_

    try:
        silhouette = metrics.silhouette_score(data, predicted,
                                              metric='euclidean',
                                              sample_size=300)
    except ValueError:
        # silhouette_score rejects labelings with fewer than two clusters;
        # report nan for that row instead of aborting the whole table.
        silhouette = float('nan')

    print('%-9s\t%-9s\t%.2fs\t\t%.3f\t\t\t%.3f\t\t%.3f\t\t%.3f\t\t\t%.3f\t\t\t%.3f'
          % (method, name, elapsed,
             metrics.homogeneity_score(true_labels, predicted),
             metrics.completeness_score(true_labels, predicted),
             metrics.v_measure_score(true_labels, predicted),
             metrics.adjusted_rand_score(true_labels, predicted),
             metrics.adjusted_mutual_info_score(true_labels, predicted,
                                                average_method='arithmetic'),
             silhouette))


print(115 * '_')
# 5. Benchmark each clustering algorithm; one table row is printed per run.
print('聚类方式\t\t聚类原理\t\t执行时间\t\t同质性得分\t\t完整性评分\tv-测量得分\t调整后兰德指数\t调整的相互信息\t轮廓系数')

# 5.1 KMeans with k-means++ initialisation
bench_k_means(KMeans(init='k-means++', n_clusters=n_clusters, n_init=10),
              name="质心", data=data, method='KMeans')

# 5.2 KMeans with random initialisation
bench_k_means(KMeans(init='random', n_clusters=n_clusters, n_init=10),
              name="质心", data=data, method='KMeans')

# 5.3 KMeans with the PCA components as initial centres; an explicit init
# array is supplied, so a single initialisation (n_init=1) is used.
pca = PCA(n_components=n_clusters).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_clusters, n_init=1),
              name="质心",
              data=data, method='KMeans')

# 5.4 MeanShift
bench_k_means(MeanShift(),
              name="密度",
              data=data, method='MeanShift')

# 5.5 DBSCAN
bench_k_means(DBSCAN(eps=3, min_samples=2),
              name="密度",
              data=data, method='DBSCAN')

# 5.6 Hierarchical (agglomerative) clustering
bench_k_means(AgglomerativeClustering(n_clusters=n_clusters),
              name="连通性",
              data=data, method='HCA')
print(115 * '_')