<code> Kmeans && Kmeans++ && Davies-Bouldin && Dunn index

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectInputStream.GetField;
import java.util.ArrayList;
import java.util.Random;

  9 public class Kmeans {
 10 
 11     private int k;
 12     private Vector[] cluster_centers;
 13     private int[] point_ids;
 14     
 15     private int num_clusters;//used to adapt to different initialization 
 16     
 17     Kmeans(int k){
 18         this.k=k;
 19         cluster_centers=new Vector[k];
 20     }
 21     
 22     private Vector get_cluster_center(int i){
 23         return cluster_centers[i];
 24     }
 25     
 26     private int get_point_id(int i){
 27         return point_ids[i];    
 28     }
 29 
 30     /**
 31      * returns the index of the cluster center the closest to the point X
 32      */
 33     private int get_closest_center_id(Vector X){    
 34         int id=0;
 35         double tmp,distance=distance(X,get_cluster_center(0));
 36         for(int i=1;i<k&&i<num_clusters;i++){
 37             tmp=distance(X,get_cluster_center(i));
 38             if(distance>tmp){
 39                 distance=tmp;
 40                 id=i;
 41             }
 42         }
 43         return id;    
 44     }
 45     
 46     
 47     /**
 48      * add the copy construction in class Vector and KahanSum
 49      * */
 50     private double distance(final Vector A, final Vector B){
 51         Vector diff=new Vector(A);
 52         diff.sub(B);
 53         return diff.norm();
 54     }
 55     
 56     /**
 57      * a simple random initialization, used in Kmeans
 58      */
 59     private void init_point_indexes(Random rnd){
 60         for(int i=0;i<point_ids.length;i++)
 61             point_ids[i] = rnd.nextInt(k);    
 62         num_clusters=k;
 63     }
 64     /**
 65      * Kmeans++ initialization
 66      */
 67     private void init_point_KMplusplus(Random rd,ArrayList<Vector> points){
 68         int cluster0=rd.nextInt(points.size());//choose the first center
 69         cluster_centers[0]=points.get(cluster0);
 70         //points.remove(cluster0);
 71         num_clusters=1;
 72         
 73         double[] D=new double[points.size()];
 74         
 75         while(num_clusters < k){
 76             int sum=0;
 77             for(int i=0;i<points.size();i++){    
 78                 double d=distance(points.get(i),cluster_centers[ get_closest_center_id(points.get(i)) ]);
 79                 sum+=d*d;
 80                 D[i]=sum;    
 81             }
 82             
 83              double r = rd.nextDouble()*sum;
 84              for (int i = 0 ; i < D.length; i++) {
 85                     if (D[i] >= r){
 86                         cluster_centers[num_clusters]=points.get(i);
 87                         //points.remove(i);
 88                         num_clusters++;
 89                         break;
 90                     }
 91              }
 92             
 93         }
 94         
 95         assignment(points);
 96         
 97     }
 98 /**********************************************************************
 99  * choose init_point_indexs(new Random) to do the Kmeans initialization
100  * or
101  * choose init_point_KMplusplus(new Random(),points) 
102  *     to do the Kmeans++ initialization
103  * 
104  * ******************************************************************/    
105     int clusterize(ArrayList<Vector> points){
106         int iterations=1;
107         point_ids = new int[points.size()];        
108 
109         //init_point_indexes(new Random());
110         init_point_KMplusplus(new Random(),points);
111         
112         update(points);    
113 
114         while(!assignment(points)){
115         update(points);
116         iterations++;
117         //if(iterations>100)return iterations;
118         }
119         return iterations;
120     }
121 
122     //update the center of different clusters
123     private void update(ArrayList<Vector> points){
124         ArrayList<Vector> cluster_members=new ArrayList<Vector>();
125         
126         //calculate the centers
127         for(int i=0;i<k;i++){
128             for(int j=0;j<point_ids.length;j++){
129                 if(get_point_id(j)==i)
130                     cluster_members.add(points.get(j));
131             }
132             
133             //System.out.println("!!!!!!"+cluster_members.size());
134             if(cluster_members.size()!=0)
135             cluster_centers[i]=Vector.vector_median(cluster_members);    
136             cluster_members.clear();
137         }
138     }
139     
140     private boolean assignment(ArrayList<Vector> points){
141         boolean convergence=true;
142         
143         for(int i=0;i<points.size();i++){
144             int closest_id=get_closest_center_id(points.get(i));
145             if(point_ids[i]!=closest_id){
146                 point_ids[i]=closest_id;
147                 convergence=false;
148             }
149         }
150         return convergence;    
151     }
152     
153     public void write_data_withID(String filename,ArrayList<Vector> list){
154         FileWriter fw;
155         int i=0,j=0;
156         Vector v;
157         try
158         {
159             fw = new FileWriter(filename);
160             while(j<list.size()){
161                 i=0;
162             v=list.get(j++);
163             while(i<v.get_length()){
164             fw.write(Double.toString(v.get(i++))+" ");
165             }
166             fw.write(Integer.toString(get_point_id(j-1)));
167             fw.write('\n');
168             
169             }
170             fw.flush();
171             fw.close();
172             System.out.println("Vector write with cluster_id finished");
173 
174         }catch(Exception e){
175             e.printStackTrace();
176         }    
177     }
178 
179     /************************
180      * cluster analysis:
181      * http://en.wikipedia.org/wiki/Cluster_analysis
182      * higher is better
183      * Dunn need the points to be clustered first 
184      * so load the List of points and do clusterize() function
185      * **********************/ 
186     public double Dunn(ArrayList<Vector> points){
187         
188         clusterize(points);
189         double max_intra_distance=0;
190         double min_cluster_distance=Double.MAX_VALUE;
191         double temp=0;
192         double temp_intra_distance=0;
193         for(int i=0;i<k;i++){
194             
195             temp_intra_distance = max_intra_distance(i,points);
196             if(temp_intra_distance > max_intra_distance)
197                 max_intra_distance = temp_intra_distance;
198             
199             for(int j=i+1;j<k;j++){
200                 temp=distance(cluster_centers[i], cluster_centers[j]);
201                 if(temp<min_cluster_distance)
202                     min_cluster_distance=temp;
203             }
204         }
205         if(min_cluster_distance==Double.MAX_VALUE||max_intra_distance==0)
206         {
207             System.out.println("Only have one cluster or Max intra cluster distance is 0" +
208                     "\nthe return value will be '0'.");
209             return 0;
210         }
211         
212         return min_cluster_distance/max_intra_distance;
213         
214     }
215     /**
216      *calculate the average distance of points of cluster i,  
217      *do clusterize to cluster the points first,  
218      *the lower the better
219      */
220     public double Davies_Bouldin(ArrayList<Vector> points){
221         clusterize(points);
222         double[] Average=AverageDistance(points);//average distance of points of cluster i
223         
224         double maxValue=0;
225         KahanSum sum=new KahanSum();
226         
227         for(int i=0;i<k;i++){
228             for(int j=i+1;j<k;j++){
229                 double temp=Average[i]+Average[j];
230                 temp/=distance(cluster_centers[i],cluster_centers[j]);
231                 if(temp>maxValue)
232                     maxValue=temp;
233             }
234             sum.add(maxValue);    
235         }
236         if(k==1)System.out.println("cluster number is 1, the value will be 0");
237         
238         return sum.getsum()/k;
239         
240     }
241     /**
242      * the function max_intra_distance get the max intra distance in cluster i
243      * */
244     private double max_intra_distance(int i,ArrayList<Vector> points){
245         double dis=0;
246         double max_inrta=0;
247         for(int j=0;j<points.size();j++){
248             if(point_ids[j]==i){
249                 dis=distance(cluster_centers[i],points.get(j));
250             if(dis>max_inrta)
251                 max_inrta=dis;
252             }
253         }
254 
255         return max_inrta;
256     }
257     /**
258      * return a double[], element i has the value of average distance
259      * of the points of cluster i
260      */
261     private double[] AverageDistance( ArrayList<Vector> points){
262         
263         double[] average =new double[k];
264         KahanSum distance=new KahanSum();
265         int count=0;
266         
267         for(int i=0;i<k;i++){
268             for(int j=0;j<point_ids.length;j++){
269                 if(get_point_id(j)==i)
270                     {
271                     count++;
272                     distance.add(distance(cluster_centers[i],points.get(j)));
273                     }
274             }
275             
276             average[i]=0;
277             if(count!=0)
278                 average[i]=distance.getsum()/count;    
279             distance.reset();
280             count=0;
281         }
282         return average;
283 
284     }
285     
286     public static void main(String[] args) {
287         // TODO Auto-generated method stub
288         ArrayList<Vector> points;
289         if(args.length==0)
290             points=Vector.read_data("dataset-4");//the dataset
291         else
292             points=Vector.read_data(args[0]);
293         //points.get(0).printvec();
294         
295         int dim=points.get(0).get_length();
296         //System.out.println(points.size()+" "+dim);
297         
298         Kmeans km=new Kmeans(2);
299         System.out.println("the iterations is: "+km.clusterize(points)+"\n" +
300                 "    by using the initialization of kmeans++.");
301         if(args.length==2)
302         km.write_data_withID(args[1], points);
303         else
304         km.write_data_withID("out-datasets", points);
305 
306         
307         for(int i=1;i<6;i++){
308             km=new Kmeans(i);
309             System.out.println("Dunn cluster_num ="+i+" "+km.Dunn(points));
310         }
311         
312         for(int i=1;i<6;i++){
313             km=new Kmeans(i);
314             System.out.println("Davies_Bouldin cluster_num ="+i+" "+km.Davies_Bouldin(points));
315         }
316         
317     
318     }
319 
320 }

 

posted @ 2013-04-05 01:19  SONGHY  阅读(1586)  评论(0编辑  收藏  举报