python实现的k-means算法(原创)

2011-09-21 14:39 Haippy 阅读(8027) 评论(1) 编辑收藏举报
  1 #! /usr/bin/env python
  2 # -*- coding: utf-8 -*-
  3 import os
  4 import sys
  5 import cmath
  6 import os.path
  7 
  8 class KMeans:
  9     '''
 10     @descriptions: K-means Algorithm implementation. 
 11     @filename:     Filename of input data.
 12     @knums:        Clusters number.
 13 '''
 14     def __init__(self, filename, knums):
 15         self._filename = filename;
 16         self._knums = knums
 17         self._dimension = 0
 18         """self._samples := [(seqx, x1, x2, ..., xn), 
 19                              (seqy, y1, y2, ..., yn), 
 20                              ..., 
 21                              (seqz, z1, z2, ..., zn)]"""
 22         self._samples= []
 23         """self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)], 
 24                              [], 
 25                              ..., 
 26                              []]"""
 27         self._clusters = []
 28         
 29         self._open(self._filename)
 30         self._normalize()
 31         #print self._samples
 32         self._select(self._knums)
 33     
 34     
 35     def _normalize(self):
 36         """
 37         @description: Normalize the attributes of input data.
 38 """
 39         new_samples = []
 40         for t in xrange(len(self._samples)):
 41             st = list(self._samples[t])
 42             new_samples.append(st)
 43         
 44         for t in xrange(len(self._samples)):
 45             self._samples.pop()
 46              
 47         for d in xrange(1, (self._dimension + 1)):
 48             container_att = []
 49             for idx in xrange(len(new_samples)):
 50                 att = new_samples[idx][d]
 51                 container_att.append(att)
 52             
 53             max_att = max(container_att)
 54             min_att = min(container_att)
 55             
 56             for idx in xrange(len(new_samples)):
 57                 new_att = (new_samples[idx][d] - min_att) / (max_att - min_att)
 58                 new_samples[idx][d] = new_att
 59                 
 60         for t in xrange(len(new_samples)):
 61             st = tuple(new_samples[t])
 62             self._samples.append(st)
 63             
 64     
 65     
 66     def _open(self, filename):
 67         """
 68         @descriptions: Open the data file and fill each item into memory.
 69         @filename    : Filename of input data.
 70 """
 71         data_file= open(self._filename, "r")
 72         data_lines= data_file.readlines();
 73         for line in data_lines:
 74             string_samples = line.split("")
 75             integer_samples= []
 76 
 77             integer_samples.append(int(string_samples[0]))
 78 
 79             for e in string_samples[1:]:
 80                 integer_samples.append(float(e))
 81             samples = tuple(integer_samples)
 82             self._samples.append(samples)
 83         #print self._samples
 84         self._dimension = len(self._samples[0]) - 1
 85         #print self._dimension
 86 
 87 
 88     def _select(self, knums):
 89         """
 90         @descriptions: Choose the first knums cluster center.
 91         @knums       : Clusters number.
 92 """
 93         for i in xrange(knums):
 94             selected = self._samples[i]
 95             temp = list(selected)
 96             temp[0] = 0
 97             self._clusters.append([])
 98             self._clusters[i].append(temp)
 99         #print self._clusters
100 
101 
102     def _distance(self, va, vb):
103         '''
104         @description: Return the (distance ** 2) of tuple va and tuple vb.
105         @va         : tuple va (x1, x2, ..., xn)
106         @vb         : tuple vb (y1, y2, ..., yn)
107 '''
108         distance = 0
109         for i in xrange(self._dimension):
110             distance += (va[i] - vb[i]) * (va[i] - vb[i]) 
111         #print distance
112         
113         return distance
114 
115 
116     def _means(self, va):
117         """
118         @description: Return the means of va.
119         @va         : A tuple of list va, with the form [(flagx, x1, x2, ..., xn), 
120                                                          (flagy, y1, y2, ..., yn), 
121                                                          (flagz, z1, z2, ..., zn), ...]
122 """
123         if (len(va) == 0):
124             return va
125         
126         means_cluster = []
127         means_cluster.append(1)#Indicate that the means has changed.
128         
129         #print va
130         for d in xrange(self._dimension):
131             tmp = 0
132             for i in xrange(len(va)):
133                 tmp += va[i][d+1]
134             means_cluster.append(tmp/len(va))
135         means = tuple(means_cluster)
136         
137         return means
138     
139     def _equal(self, ta, tb):
140         """
141         @description: Check if tuple ta equals to tuple tb.
142         @ta         : Tuple ta.(flagx, x1, x2, ..., xn)
143         @tb         : Tuple tb.(flagy, y1, y1, ..., ym)
144 """
145         if (len(ta) != len(tb)):
146             return False
147         
148         for i in xrange(1, len(ta)):
149             if (ta[i] != tb[i]):
150                 return False
151             
152         return True
153     
154     def flush(self, filename):
155         """
156         @description: Flush data the disk.
157         @filename   : Filename of output data.
158 """
159         foutput = open(filename, "w")
160         
161         for c in xrange(self._knums):
162             foutput.write("Group %d" % c)
163             for e in self._clusters[c][1:]:
164                 foutput.write("%s" %  repr(e))
165             foutput.write("\n\n\n")
166         print("Done.")
167         foutput.close()
168 
169     def _reconstruct(self, idx):
170         """
171         @description: Reconstruct the cluster points.
172         
173         @idx         : Index of clusters, where clusters has the form as follows:
174         self._clusters :=[[(0, c1, c2, ..., cn), (seqx, x1, x2, ..., xn), (seqy, y1, y2, ..., yn)], 
175                           [], 
176                           ..., 
177                           []]
178 """
179         new_cluster = []
180         new_cluster.append(0)
181         for old_value in self._clusters[idx][0][1:]:
182             new_cluster.append(old_value)
183         for i in xrange(len(self._clusters[idx])):
184             self._clusters[idx].pop()
185         self._clusters[idx].insert(0, new_cluster)
186 
187 
188     def process(self):
189         """
190         @description: Process data, calculating k-means and clustering.
191 """
192         while True:
193             K = 0
194             for e in self._samples:
195                 #print e  
196                 shortest = -1
197                 for k in xrange(self._knums):
198                     #for k in _clusters[]
199                     #print e
200                     #print self._clusters[k][0]
201                     distance = self._distance(e[1:], self._clusters[k][0][1:])
202                     #print distance
203                     if (distance < 0.000001):
204                         # add e to the k-th cluster.
205                         self._clusters[k].append(e)
206                         break
207                     else:
208                         if (shortest == -1):
209                             shortest = distance
210                         else:
211                             if (shortest > distance):
212                                 shortest = distance
213                                 K = k
214                     if (k != self._knums - 1):
215                         continue
216                     else:
217                         # add e to the k-th cluster
218                         self._clusters[K].append(e)
219             #print self._clusters
220 
221             for k in xrange(self._knums):
222                 new_ktuple = self._means(self._clusters[k][1:])
223                 if (len(new_ktuple) == 0):
224                     continue
225                 if (self._equal(self._clusters[k][0], new_ktuple) == False):
226                     self._clusters[k].pop(0)
227                     self._clusters[k].insert(0, new_ktuple)
228                          
229                 else:
230                     continue
231                 
232             flag = 0
233             for idx in xrange(self._knums):
234                 if (self._clusters[idx][0][0] == 1):
235                     flag = 1
236                     break
237                 else:
238                     continue
239 
240             if (flag == 1):
241                 for idx in xrange(self._knums):
242                     self._reconstruct(idx) 
243             else:
244                 break
245 
246 
if __name__ == "__main__":
    # Script entry point: cluster the samples in ./iris-1.dat into three
    # groups and write the resulting groups to ./k-means-out.dat.
    clusterer = KMeans("./iris-1.dat", 3)
    clusterer.process()
    clusterer.flush("./k-means-out.dat")
K-means 算法的 Python 代码，编写加调试花了差不多一天的时间，希望对大家有用。关于 K-means 聚类算法和 ISODATA 算法的解释，见下一篇博文。
刷新页面返回顶部
256code, 生活与技术

python实现的k-means算法(原创)

About