Alex He

...永远保持希望与激情...约会未来更强大的自己...

 

使用weka进行文本聚类的例子

先看上篇会容易看懂些,这篇的注释不多!

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import weka.clusterers.Clusterer;
import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

/**
 * Clusters a directory of plain-text messages with a Weka {@link Clusterer}.
 *
 * <p>Typical use: construct with a configured clusterer, call
 * {@link #loadInstances()} to read every file under {@link #path} and convert
 * the texts to word vectors, then call {@link #testCluster()} to build the
 * clusterer and print the cluster assigned to each message.
 */
public class MessageClustering {

	/** Directory whose regular files are loaded, one instance per file. */
	static String path="E:\\datasets\\alt.atheism\\";

	/** Word-vector data produced by {@link #loadInstances()}; null until then. */
	private Instances instances=null;
	private final StringToWordVector filter=new StringToWordVector();
	private final Clusterer clusterer;

	/**
	 * @param clusterer the clustering algorithm to build and query
	 */
	public MessageClustering(Clusterer clusterer)
	{
		this.clusterer=clusterer;
	}

	/**
	 * Reads every regular file in {@link #path} into a single string attribute
	 * named "message" and runs the result through {@link StringToWordVector}.
	 *
	 * @throws IllegalStateException if {@link #path} cannot be listed
	 * @throws Exception if the Weka filter fails
	 */
	public void loadInstances() throws Exception
	{
		// listFiles() returns null (not an empty array) when the path is not a
		// readable directory; fail with a message that names the path.
		File[] files=new File(path).listFiles();
		if(files==null)
		{
			throw new IllegalStateException("Cannot list message directory: "+path);
		}

		FastVector attributes=new FastVector(1);
		// A null value list makes this a string attribute.
		attributes.addElement(new Attribute("message",(FastVector)null));
		Instances raw=new Instances("text",attributes,100);
		Attribute messageAttribute=raw.attribute("message");

		for(File file : files)
		{
			// Sub-directories cannot be read as text; skip them instead of
			// adding an empty document.
			if(!file.isFile())
			{
				continue;
			}
			String message=getAllMessage(file);
			Instance instance=new Instance(1);
			instance.setValue(messageAttribute, messageAttribute.addStringValue(message));
			instance.setDataset(raw);
			raw.add(instance);
		}

		filter.setInputFormat(raw);
		instances=Filter.useFilter(raw,filter);
	}

	/**
	 * Builds the clusterer on the loaded data and prints, for each instance,
	 * its 1-based index and assigned cluster, followed by the cluster count.
	 *
	 * @throws IllegalStateException if {@link #loadInstances()} has not run
	 * @throws Exception if building or querying the clusterer fails
	 */
	public void testCluster() throws Exception
	{
		if(instances==null)
		{
			throw new IllegalStateException("loadInstances() must be called before testCluster()");
		}
		clusterer.buildClusterer(instances);
		for (int i = 0; i < instances.numInstances(); i++) {
			int cluster = clusterer.clusterInstance(instances.instance(i));
			System.out.println("\t"+(i+1)+":"+cluster);
		}
		System.out.println(clusterer.numberOfClusters());
	}

	/**
	 * Returns the text of {@code file} with each line trimmed and the
	 * non-empty lines joined by a single space.
	 *
	 * <p>Reading is best-effort: on an I/O error a warning is written to
	 * standard error and whatever was read so far is returned.
	 */
	private String getAllMessage(File file) {
		StringBuilder sb=new StringBuilder();
		// try-with-resources closes the reader even when readLine() throws.
		try (BufferedReader br=new BufferedReader(new FileReader(file)))
		{
			String line;
			while((line=br.readLine())!=null)
			{
				String trimmed=line.trim();
				if(trimmed.isEmpty())
				{
					continue;
				}
				// Separate lines so the last word of one line is not fused
				// with the first word of the next before tokenization.
				if(sb.length()>0)
				{
					sb.append(' ');
				}
				sb.append(trimmed);
			}
		} catch (IOException e) {
			System.err.println("Failed to read "+file+": "+e.getMessage());
		}
		return sb.toString();
	}

	public static void main(String[] args) throws Exception {
		SimpleKMeans cluster=new SimpleKMeans();// construct the clustering algorithm
		cluster.setNumClusters(5);

		MessageClustering sk=new MessageClustering(cluster);
		sk.loadInstances();
		sk.testCluster();// evaluate the clustering result
	}
}

posted on 2011-11-01 18:26  Alex木头  阅读(2951)  评论(0)  编辑  收藏  举报

导航