使用 Weka 进行文本聚类的例子
先看上一篇会更容易看懂,这篇的注释不多!
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import weka.clusterers.Clusterer; import weka.clusterers.SimpleKMeans; import weka.core.Attribute; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.filters.Filter; import weka.filters.unsupervised.attribute.StringToWordVector; public class MessageClustering { private Instances instances=null; private StringToWordVector filter=new StringToWordVector(); private Clusterer clusterer=null; public MessageClustering(Clusterer clusterer) { this.clusterer=clusterer; } static String path="E:\\datasets\\alt.atheism\\"; public void loadInstances() throws Exception { String name="text"; FastVector attributes=new FastVector(1); attributes.addElement(new Attribute("message",(FastVector)null)); instances=new Instances(name,attributes,100); for(File file : new File(path).listFiles()) { String message=getAllMessage(file); Instance instance=new Instance(1); Attribute attribute=instances.attribute("message"); instance.setValue(attribute, attribute.addStringValue(message)); instance.setDataset(instances); instances.add(instance); } filter.setInputFormat(instances); Instances filtedData=Filter.useFilter(instances,filter); instances=filtedData; } public void testCluster() throws Exception { clusterer.buildClusterer(instances); for (int i = 0; i < instances.numInstances(); i++) { int cluster = clusterer.clusterInstance(instances.instance(i)); System.out.println("\t"+(i+1)+":"+cluster); } System.out.println(clusterer.numberOfClusters()); // System.out.println(clusterer.toString()); } private String getAllMessage(File file) { StringBuilder sb=new StringBuilder(); try { BufferedReader br=new BufferedReader(new FileReader(file)); String line; while(true) { if((line=br.readLine())==null) break; sb.append(line.trim()); } br.close(); } catch (Exception e){} return sb.toString(); } public static void main(String[] args) throws Exception { SimpleKMeans cluster=new 
SimpleKMeans();//构造聚类算法 cluster.setNumClusters(5); MessageClustering sk=new MessageClustering(cluster); sk.loadInstances(); sk.testCluster();//测试聚类效果 } }