代码改变世界

python实现的k-means算法(原创)

2011-09-21 14:39  Haippy  阅读(8023)  评论(1)  编辑  收藏  举报
  1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
3 import os
4 import sys
5 import cmath
6 import os.path
7
class KMeans(object):
    """K-means clustering of samples read from a text file.

    Each non-blank input line holds an integer sequence number followed by
    the sample's numeric attributes, separated by whitespace.
    NOTE(review): the separator literal in the published listing was garbled
    into an empty string; whitespace splitting is assumed -- confirm against
    the real data file.

    Constructing an instance loads the file, min-max normalizes every
    attribute and seeds the cluster centers with the first ``knums`` samples.
    Call ``process()`` to cluster and ``flush()`` to write the groups out.

    @filename: Filename of input data.
    @knums: Clusters number.
    """

    def __init__(self, filename, knums):
        self._filename = filename
        self._knums = knums
        self._dimension = 0
        # self._samples := [(seqx, x1, x2, ..., xn),
        #                   (seqy, y1, y2, ..., yn),
        #                   ...]
        self._samples = []
        # self._clusters := [[(flag, c1, ..., cn), (seqx, x1, ..., xn), ...],
        #                    [...],
        #                    ...]
        # Slot 0 of every cluster is its center; flag is 1 when the center
        # was replaced during the current pass and 0 otherwise.
        self._clusters = []

        self._open(self._filename)
        self._normalize()
        self._select(self._knums)

    def _normalize(self):
        """Rescale every attribute of every sample into [0, 1] (min-max).

        ``self._samples`` is updated in place; sequence numbers (index 0)
        are left untouched.
        """
        rows = [list(sample) for sample in self._samples]

        for d in range(1, self._dimension + 1):
            column = [row[d] for row in rows]
            low = min(column)
            span = max(column) - low
            for row in rows:
                # A constant attribute has zero span; map it to 0.0 rather
                # than dividing by zero.
                row[d] = (row[d] - low) / span if span else 0.0

        self._samples[:] = [tuple(row) for row in rows]

    def _open(self, filename):
        """Read the data file and append each sample to ``self._samples``.

        @filename: Filename of input data.

        Raises ValueError if a field cannot be parsed as a number or if the
        file contains no samples.
        """
        with open(filename, "r") as data_file:
            for line in data_file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                sample = [int(fields[0])]
                sample.extend(float(field) for field in fields[1:])
                self._samples.append(tuple(sample))

        if not self._samples:
            raise ValueError("no samples found in %r" % (filename,))
        # The first column is the sequence number, not an attribute.
        self._dimension = len(self._samples[0]) - 1

    def _select(self, knums):
        """Seed the clusters with the first ``knums`` samples as centers.

        @knums: Clusters number.
        """
        for i in range(knums):
            center = list(self._samples[i])
            center[0] = 0  # flag slot: this center has not changed yet
            self._clusters.append([center])

    def _distance(self, va, vb):
        """Return the squared Euclidean distance between ``va`` and ``vb``.

        @va: attribute sequence (x1, x2, ..., xn) -- no leading seq/flag.
        @vb: attribute sequence (y1, y2, ..., yn) -- no leading seq/flag.
        """
        total = 0
        for i in range(self._dimension):
            diff = va[i] - vb[i]
            total += diff * diff
        return total

    def _means(self, va):
        """Return the mean point of the samples in ``va``.

        @va: list of samples [(seq, x1, ..., xn), ...].

        Returns ``va`` itself when it is empty; otherwise a tuple
        (1, m1, ..., mn) whose leading 1 marks the center as changed.
        """
        if not va:
            return va

        count = float(len(va))
        means = [1]  # flag slot: indicate that the means has changed
        for d in range(1, self._dimension + 1):
            means.append(sum(member[d] for member in va) / count)
        return tuple(means)

    def _equal(self, ta, tb):
        """Return True when ``ta`` and ``tb`` hold the same attributes.

        The leading flag/sequence slot is ignored; sequences of different
        length are never equal.

        @ta: (flagx, x1, x2, ..., xn)
        @tb: (flagy, y1, y2, ..., ym)
        """
        if len(ta) != len(tb):
            return False
        return all(a == b for a, b in zip(ta[1:], tb[1:]))

    def flush(self, filename):
        """Write the members of every cluster to ``filename``.

        @filename: Filename of output data.
        """
        with open(filename, "w") as foutput:
            for c in range(self._knums):
                foutput.write("Group %d" % c)
                # Slot 0 is the center; only the member samples are written.
                for member in self._clusters[c][1:]:
                    foutput.write("%s" % repr(member))
                foutput.write("\n\n\n")
        print("Done.")

    def _reconstruct(self, idx):
        """Empty cluster ``idx`` down to its center and clear the flag.

        @idx: Index into ``self._clusters``.
        """
        cluster = self._clusters[idx]
        center = [0] + list(cluster[0][1:])
        # Slice-assign so the existing list object is reused.
        cluster[:] = [center]

    def _nearest(self, sample):
        """Return the index of the cluster whose center is closest.

        A center closer than 1e-6 (squared distance) is accepted at once;
        otherwise the smallest distance wins and ties go to the lowest index.
        """
        point = sample[1:]
        best_k = 0
        best_distance = None
        for k in range(self._knums):
            distance = self._distance(point, self._clusters[k][0][1:])
            if distance < 0.000001:
                return k
            # Track the winner per sample, including when it is cluster 0,
            # so nothing carries over from the previously assigned sample.
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_k = k
        return best_k

    def process(self):
        """Run k-means until no cluster center changes.

        On return every cluster holds its final center in slot 0 followed
        by the samples assigned to it.
        """
        if self._knums < 1:
            return

        while True:
            # Assignment step: attach every sample to its nearest center.
            for sample in self._samples:
                self._clusters[self._nearest(sample)].append(sample)

            # Update step: recompute each center from its members.
            changed = False
            for k in range(self._knums):
                cluster = self._clusters[k]
                new_center = self._means(cluster[1:])
                if len(new_center) == 0:
                    # No members: keep the old center.
                    continue
                if not self._equal(cluster[0], new_center):
                    cluster[0] = new_center
                    changed = True

            if not changed:
                break

            # Centers moved: drop the memberships and start another pass.
            for idx in range(self._knums):
                self._reconstruct(idx)
246
if __name__ == "__main__":
    # Script entry point: cluster ./iris-1.dat into three groups and
    # write the resulting groups to ./k-means-out.dat.
    ikmeans = KMeans("./iris-1.dat", 3)
    ikmeans.process()
    ikmeans.flush("./k-means-out.dat")

K-means算法的python代码,写完 + 调试花了差不多一天的时间,希望对大家有用。关于K-means聚类算法和ISODATA算法解释见下一篇博文。