词频统计设计的改进
package zuoye1;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Map.Entry;

/**
 * Reads a text file and prints word-frequency statistics: the total number
 * of words, followed by one {@code word:count} line per distinct word in
 * descending order of count.
 */
public class FileWord {

    /** File that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "c:\\english.txt";

    /**
     * Regular expression matching the punctuation that must not be counted
     * as part of a word: {@code ? . ! : , " ;} anywhere, and an apostrophe
     * only when it is not between two letters (so quotation marks are
     * dropped while contractions such as {@code town's} stay intact).
     *
     * The previous value, {@code " ?.!:,\"\"'';\n"}, was interpreted by
     * {@link String#replaceAll} as a regex for one exact character sequence
     * and therefore never removed any punctuation.
     */
    private static final String PUNCTUATION_REGEX =
            "[?.!:,\";]|(?<![A-Za-z])'|'(?![A-Za-z])";

    /**
     * Counts the words of a text file and prints the statistics to standard
     * output.
     *
     * @param args optional; {@code args[0]} is the path of the file to read.
     *             When absent, {@code c:\english.txt} is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        // Maps each distinct word to the number of times it occurs.
        HashMap<String, Integer> map = new HashMap<String, Integer>();
        try {
            int wordCount = countWords(path, map);
            System.out.println("总共单词数:" + wordCount);
            sort(map);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints every entry of {@code map} as {@code word:count}, one per line,
     * ordered by descending count.
     *
     * @param map word-to-occurrence-count table; it is not modified
     */
    public static void sort(HashMap<String, Integer> map) {
        // Copy the entries into a list so they can be ordered by value.
        List<Map.Entry<String, Integer>> folder =
                new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
        Collections.sort(folder, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> obj1,
                               Map.Entry<String, Integer> obj2) {
                // compareTo instead of subtraction: subtraction can overflow.
                return obj2.getValue().compareTo(obj1.getValue());
            }
        });
        for (Entry<String, Integer> en : folder) {
            System.out.println(en.getKey() + ":" + en.getValue());
        }
    }

    /**
     * Reads the file at {@code path} line by line and tallies its words.
     *
     * @param path   file to read
     * @param counts table that receives the per-word tallies
     * @return total number of words read
     * @throws IOException if the file cannot be opened or read
     */
    private static int countWords(String path, Map<String, Integer> counts)
            throws IOException {
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that matches the encoding of the input file.
        BufferedReader br = new BufferedReader(new FileReader(path));
        try {
            int wordCount = 0;
            String sentence;
            while ((sentence = br.readLine()) != null) {
                wordCount += countLine(sentence, counts);
            }
            return wordCount;
        } finally {
            // Always release the file handle, even when reading fails.
            br.close();
        }
    }

    /**
     * Tallies the words of a single line of text.
     *
     * @param sentence one line of input, without its line terminator
     * @param counts   table that receives the per-word tallies
     * @return number of words found in {@code sentence}
     */
    private static int countLine(String sentence, Map<String, Integer> counts) {
        // Punctuation becomes a space (not ""), so that words joined only by
        // punctuation, e.g. "suppose.It", are split instead of merged.
        String cleaned = sentence.replaceAll(PUNCTUATION_REGEX, " ");
        StringTokenizer token = new StringTokenizer(cleaned);
        int words = 0;
        while (token.hasMoreTokens()) {
            String word = token.nextToken();
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
            words++;
        }
        return words;
    }
}
实现结果
总共单词数:181 as:7 the:7 not:6 it:6 to:5 are:4 a:4 your:4 in:4 they:3 live:3 and:3 of:2 do:2 may:2 by:2 be:2 clothes:2 that:2 often:2 have:2 from:2 above:2 is:2 you:2 door:1 its:1 suppose.It:1 palace.The:1 contentedly:1 snow:1 friends,Turn:1 yourself:1 means.which:1 or:1 windows:1 life,poor:1 bad:1 quiet:1 like:1 without:1 thoughts.:1 simply:1 abode;the:1 change.Sell:1 will:1 some:1 fault-finder:1 herb,like:1 before:1 most:1 I:1 old,return:1 trouble:1 life:1 change;we:1 supported:1 is.You:1 spring.:1 me:1 mind:1 town;but:1 there,and:1 paradise.Love:1 hardnames.It:1 is,meet:1 should:1 seem:1 independent:1 new:1 alms-house:1 poor-house.The:1 pleasant,thrilling,glorious:1 ;do:1 garden:1 happens:1 keep:1 but:1 However:1 reflected:1 being:1 brightly:1 enough:1 Cultivate:1 any.May:1 looks:1 more:1 sage.Do:1 town's:1 when:1 faults:1 richest.The:1 disreputable.:1 think:1 get:1 so:1 much:1 lives:1 perhaps:1 early:1 things,whether:1 call:1 dishonest:1 sun:1 shun:1 melts:1 setting:1 them.Things:1 poverty:1 poorest:1 mean:1 receive:1 find:1 hourss,even:1 thoughts,as:1 rich:1 poor:1 man's:1 cheering:1 great:1 see:1 supporting:1 themselves:1 misgiving.Most:1
ssh://git@git.coding.net:linliaimeli/FileWord.git
https://git.coding.net/linliaimeli/FileWord.git