词频统计的 JUnit 测试
package search; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import java.util.TreeMap; public class UpdateWordSearch { /** * 输入文件 保存分隔后的单词集合 保存统计后的单词集合 */ String article;// 保存文章的内容 String[] rWords; String[] words; int[] wordFreqs;// 保存单词对应的词频 String filename;// 文件名 // 统计总数 int total = 0; // 构造函数:输入文章的内容 public UpdateWordSearch() throws IOException { Scanner sc = new Scanner(System.in); System.out.println("请输入文件名:"); filename = sc.nextLine(); File file = new File(filename); if (!file.exists()) { System.out.println("文件不存在!"); return; } BufferedReader bf = new BufferedReader(new FileReader(file)); StringBuffer article = new StringBuffer(); // 动态字符串数组 String temp = bf.readLine(); while (temp != null) { article.append(temp + " "); // 往动态字符串数组里添加数据 temp = bf.readLine(); if (temp == null) { break; } } this.article = article.toString(); } // 分词并统计相应词汇 public void sWord() { // 分词的时候,因为标点符号不参与,所以所有的符号全部替换为空格 final char SPACE = ' '; article = article.replace('\"', SPACE).replace(',', SPACE) .replace('.', SPACE).replace('\'', SPACE); article = article.replace('(', SPACE).replace(')', SPACE) .replace('-', SPACE); rWords = article.split("\\s+");// 凡是空格隔开的都算单词,上面替换了',所以I've被分成两个单词 } public List<String> sort() { // 将所有出现的字符串放入唯一的list中,不用map,是因为map寻找效率太低了 List<String> list = new ArrayList<String>(); for (String word : rWords) { list.add(word); } Collections.sort(list); return list; } // 词汇排序 public List countWordFreq() { // 统计词频信息 Map<String, Integer> wordsInfo = new TreeMap<String, Integer>(); String word = ""; // 词频名字 int count = 0; // 词频数量 // 统计单词总数 int total = 0; List<String> wordList = sort(); word = wordList.get(0); for (int i = 0; i <= wordList.size(); i++) { if (i == wordList.size()) { wordsInfo.put(word, count); total++; break; } 
if (wordList.get(i).equals(word)) { count++; } else { wordsInfo.put(word, count); total++; word = wordList.get(i); count = 1; } } // 词频信息排序 List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>( wordsInfo.entrySet()); Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { // TODO Auto-generated method stub return o2.getValue().compareTo(o1.getValue()); } }); this.total = total; return list; } public void run() { // 拆分文本 sWord(); // 统计词频 List<Map.Entry<String, Integer>> list = countWordFreq(); // 打印词频总数 System.out.println("词频总数:"); System.out.println("total:" + this.total); System.out.println("词频统计信息:"); // 打印统计词频 int m = 0; for (Map.Entry<String, Integer> mapping : list) { if (m < 10) { System.out.println(mapping.getKey() + " : " + mapping.getValue()); m++; } else break; } } // 测试类的功能 public static void main(String[] args) throws IOException { UpdateWordSearch w = new UpdateWordSearch(); w.run(); } }
下图是针对词频统计程序所做的 JUnit 测试: