INT104-lab11 [Clustering] [Iris dataset] [K-means Algorithm]
[K-means Algorithm] [3D subplots]
Because the K-means algorithm starts from randomly chosen initial points, a single run can give a poor clustering, and it can even crash with a runtime error (RE) when one of the random centroids attracts no samples and the mean of an empty cluster is taken!
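A minimal standalone sketch of why a bad initialisation triggers that RE (this is not part of the lab code; the helper names assign and safe_init are made up for illustration): a centroid drawn uniformly from the feature ranges can end up closer to no sample at all, and averaging the resulting empty cluster divides by zero, which is exactly what G() in the listing below would do. Drawing the initial centroids from the data points themselves makes an empty cluster far less likely; scikit-learn's k-means++ seeding follows the same idea.

import random

samples = [[5.1, 3.5], [4.9, 3.0], [6.3, 3.3], [5.8, 2.7]]

def assign(samples, centroids):
    # nearest-centroid assignment with squared Euclidean distance
    clusters = [[] for _ in centroids]
    for s in samples:
        d = [sum((a - b) ** 2 for a, b in zip(s, c)) for c in centroids]
        clusters[d.index(min(d))].append(s)
    return clusters

# an unlucky uniform draw: the second centroid is far from every sample
bad_centroids = [[5.5, 3.0], [100.0, 100.0]]
print([len(c) for c in assign(samples, bad_centroids)])  # [4, 0] -> averaging cluster 2 divides by zero

def safe_init(samples, k, seed=17):
    # hypothetical guard: draw centroids from the data, so each one starts next to a real sample
    random.seed(seed)
    return random.sample(samples, k)

print([len(c) for c in assign(samples, safe_init(samples, 2))])  # no empty cluster

The full lab listing follows.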
import numpy as np
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection on older matplotlib)
from sklearn.manifold import TSNE


def read(path: str) -> list:
    # read iris.data and keep only the complete rows (4 features + 1 label)
    with open(path, "r") as f:
        text = f.readlines()
    D = []
    for row in text:
        features = row.split(",")
        if len(features) == 5:
            D.append(features)
    return D


def init(D) -> tuple:
    # split the raw rows into numeric features X and string labels Y
    n, m = len(D), len(D[0]) - 1
    X, Y = [], []
    for i in range(n):
        X.append([float(D[i][j]) for j in range(m)])
        Y.append(D[i][m].split("\n")[0])  # drop the trailing newline from the label
    return X, Y, n, m


def randomDataset(seed: int, D) -> list:
    # shuffle the rows; uncomment random.seed(seed) to make the shuffle reproducible
    # random.seed(seed)
    random.shuffle(D)
    return D


def setPoints(K: int, X: list, n: int, m: int) -> tuple:
    # initial centroids: K points drawn uniformly inside the per-feature [min, max] box
    minValues, maxValues = [], []
    for j in range(m):
        maxValue, minValue = -10000.0, 10000.0
        for i in range(n):
            maxValue = max(maxValue, X[i][j])
            minValue = min(minValue, X[i][j])
        maxValues.append(maxValue)
        minValues.append(minValue)
    P = []
    for _ in range(K):
        point = [random.uniform(minValues[j], maxValues[j]) for j in range(m)]
        P.append(point)
    return minValues, maxValues, P


def getType(a, b, c):
    # index (1, 2 or 3) of the smallest of the three distances
    if a < b and a < c:  # fixed: the original compared a < b twice
        return 1
    if b < a and b < c:
        return 2
    return 3


eps = 1e-3


def compare(P, newP, K, m):
    # convergence test: every centroid coordinate moved by less than eps
    for i in range(K):
        for j in range(m):
            if abs(P[i][j] - newP[i][j]) > eps:
                return False
    return True


def G(A, m):
    # centroid (mean) of the points in A; raises ZeroDivisionError when the cluster
    # is empty, which is the "RE" mentioned above
    n = len(A)
    sumA = [0 for _ in range(m)]
    for i in range(n):
        for j in range(m):
            sumA[j] += A[i][j]
    return [(x / n) for x in sumA]


def getNewP(X, dis, n, m):
    # split the samples by their current cluster label and recompute the three centroids
    A, B, C = [], [], []
    for i in range(n):
        if dis[i][0] == 1:
            A.append(X[i])
        elif dis[i][0] == 2:
            B.append(X[i])
        else:
            C.append(X[i])
    return [G(A, m), G(B, m), G(C, m)]


def K_means_algorithm(K: int, X: list, Y: list, n: int, m: int):
    minValues, maxValues, P = setPoints(K, X, n, m)

    print(minValues)
    print(maxValues)
    print(np.array(P))

    while True:
        # re-assign every sample on each pass; resetting the list here fixes the
        # original bug where the stale first-pass assignments were reused forever
        K_distances = []
        for i in range(n):
            dis1 = euclideanDistance(P[0], X[i], m)
            dis2 = euclideanDistance(P[1], X[i], m)
            dis3 = euclideanDistance(P[2], X[i], m)
            Type = getType(dis1, dis2, dis3)
            K_distances.append([Type, dis1, dis2, dis3])

        newP = getNewP(X, K_distances, n, m)

        if compare(P, newP, K, m):
            break
        P = newP
        print(np.array(newP))
        print("Yes")  # progress marker: one centroid update finished
    return P, K_distances


def similarity(A, B, m) -> float:
    # cosine similarity (kept for reference, not used by the K-means loop)
    Sigma_AixBi = 0
    Sigma_Ai_Square = 0
    Sigma_Bi_Square = 0
    for i in range(m):
        Sigma_AixBi += A[i] * B[i]
        Sigma_Ai_Square += A[i] * A[i]
        Sigma_Bi_Square += B[i] * B[i]
    return Sigma_AixBi / (np.sqrt(Sigma_Ai_Square) * np.sqrt(Sigma_Bi_Square))


def euclideanDistance(A, B, m) -> float:
    Sigma_Xi_Yi_square = 0
    for i in range(m):
        Sigma_Xi_Yi_square += (A[i] - B[i]) * (A[i] - B[i])
    return np.sqrt(Sigma_Xi_Yi_square)


def answer(X, Y, dis, n):
    # project the 4-D samples to 3-D with t-SNE, then draw two 3-D subplots:
    # left = K-means clusters, right = true species labels
    x = np.array(X)
    tsne = TSNE(n_components=3)
    tsne.fit_transform(x)
    one_x, one_y, one_z = [], [], []
    two_x, two_y, two_z = [], [], []
    three_x, three_y, three_z = [], [], []
    _one_x, _one_y, _one_z = [], [], []
    _two_x, _two_y, _two_z = [], [], []
    _three_x, _three_y, _three_z = [], [], []
    for i in range(n):
        _x = tsne.embedding_[i][0]
        _y = tsne.embedding_[i][1]
        _z = tsne.embedding_[i][2]
        if dis[i][0] == 1:
            one_x.append(_x)
            one_y.append(_y)
            one_z.append(_z)
        elif dis[i][0] == 2:
            two_x.append(_x)
            two_y.append(_y)
            two_z.append(_z)
        else:
            three_x.append(_x)
            three_y.append(_y)
            three_z.append(_z)
        if Y[i] == "Iris-setosa":
            _one_x.append(_x)
            _one_y.append(_y)
            _one_z.append(_z)
        elif Y[i] == "Iris-versicolor":
            _two_x.append(_x)
            _two_y.append(_y)
            _two_z.append(_z)
        else:
            _three_x.append(_x)
            _three_y.append(_y)
            _three_z.append(_z)
    # left subplot: clustering result
    fig = plt.figure(figsize=(12, 6), facecolor='w')
    ax1 = fig.add_subplot(121, projection='3d')
    plt.title('answer')
    ax1.scatter(one_x, one_y, one_z)
    ax1.scatter(two_x, two_y, two_z)
    ax1.scatter(three_x, three_y, three_z)
    # right subplot: ground-truth species
    ax2 = fig.add_subplot(122, projection='3d')
    plt.title('data')
    ax2.scatter(_one_x, _one_y, _one_z)
    ax2.scatter(_two_x, _two_y, _two_z)
    ax2.scatter(_three_x, _three_y, _three_z)
    plt.show()
    print("Showing done")


if __name__ == '__main__':
    dataset = read("iris.data")
    dataset = randomDataset(17, dataset)
    X, Y, n, m = init(dataset)
    P, dis = K_means_algorithm(3, X, Y, n, m)

    print("Done!")
    print(np.array(P))

    answer(X, Y, dis, n)
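As a quick sanity check (not part of the submitted lab), the same X and Y can be fed to scikit-learn's KMeans, which runs n_init random initialisations and keeps the best one, so a single bad seed does not ruin the result. The sketch below assumes X and Y from init() are in scope (for example, run right after the __main__ block); the printed cluster numbers are arbitrary, so only the grouping is comparable with the hand-written result.

import numpy as np
from sklearn.cluster import KMeans

# cluster the same features with several random restarts
labels = KMeans(n_clusters=3, n_init=10, random_state=17).fit_predict(np.array(X))

# for each predicted cluster, count which true species fall into it
for k in range(3):
    species, counts = np.unique(np.array(Y)[labels == k], return_counts=True)
    print(k, dict(zip(species.tolist(), counts.tolist())))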
~~Jason_liu O(∩_∩)O