词频统计设计的改进

 1 package zuoye1;
 2 
 3 import java.io.BufferedReader;
 4 import java.io.FileNotFoundException;
 5 import java.io.FileReader;
 6 import java.io.IOException;
 7 import java.util.ArrayList;
 8 import java.util.Collections;
 9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.StringTokenizer;
14 import java.util.Map.Entry;
15 
16 public class FileWord {
17 
18     /**
19      * 读入文件,实现词频统计
20      */
21     public static void main(String[] args) {
22         HashMap<String,Integer> map=new HashMap<String,Integer>();//用于统计各个单词的个数,排序
23         //过滤字符串中的所有标点符号
24         String regex=" ?.!:,\"\"'';\n";
25         BufferedReader br;
26         try {
27             //FileReader类创建了一个可以读取文件内容的Reader类、调用构造方法FileReader()
28             br = new BufferedReader(new FileReader("c:\\english.txt"));//文件完整路径
29             String sentence;
30             int wordCount = 0;
31             try {
32                 while((sentence = br.readLine()) !=null){     //用readLine读取文件,判断读取文件是否为空
33                     sentence = sentence.replaceAll(regex, "");
34                     StringTokenizer token=new StringTokenizer(sentence);
35                     while(token.hasMoreTokens()){     //循环遍历
36                         wordCount++;    
37                         String word = token.nextToken();
38                         if(map.containsKey(word)){     //HashMap不允许重复的key,所以利用这个特性,去统计单词的个数
39                         int count=map.get(word);
40                         map.put(word, count+1);     //如果HashMap已有这个单词,则设置它的数量加1
41                     }
42                     else{
43                         map.put(word, 1);          //如果没有这个单词,则新填入,数量为1
44                 }
45             }
46         }
47                 System.out.println("总共单词数:"+wordCount);
48                 sort(map); 
49             } catch (IOException e) {
50                 e.printStackTrace();
51             }
52         }catch(FileNotFoundException e) {
53             e.printStackTrace();
54         }
55     }
56         //排序
57         public static void sort(HashMap<String,Integer> map){
58             //声明集合folder,存放单词和单词个数
59             List<Map.Entry<String, Integer>> folder = new ArrayList<Map.Entry<String, Integer>>(map.entrySet()); 
60             Collections.sort(folder, new Comparator<Map.Entry<String, Integer>>() {   
61                 public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {   
62                     return (obj2.getValue() - obj1.getValue());   
63                 }   
64         }); 
65         //输出
66         for (int i = 0; i < folder.size(); i++) {   
67             Entry<String, Integer> en = folder.get(i);
68             System.out.println(en.getKey()+":"+en.getValue());
69             }
70         }
71     }

实现结果

总共单词数:181
as:7
the:7
not:6
it:6
to:5
are:4
a:4
your:4
in:4
they:3
live:3
and:3
of:2
do:2
may:2
by:2
be:2
clothes:2
that:2
often:2
have:2
from:2
above:2
is:2
you:2
door:1
its:1
suppose.It:1
palace.The:1
contentedly:1
snow:1
friends,Turn:1
yourself:1
means.which:1
or:1
windows:1
life,poor:1
bad:1
quiet:1
like:1
without:1
thoughts.:1
simply:1
abode;the:1
change.Sell:1
will:1
some:1
fault-finder:1
herb,like:1
before:1
most:1
I:1
old,return:1
trouble:1
life:1
change;we:1
supported:1
is.You:1
spring.:1
me:1
mind:1
town;but:1
there,and:1
paradise.Love:1
hardnames.It:1
is,meet:1
should:1
seem:1
independent:1
new:1
alms-house:1
poor-house.The:1
pleasant,thrilling,glorious:1
;do:1
garden:1
happens:1
keep:1
but:1
However:1
reflected:1
being:1
brightly:1
enough:1
Cultivate:1
any.May:1
looks:1
more:1
sage.Do:1
town's:1
when:1
faults:1
richest.The:1
disreputable.:1
think:1
get:1
so:1
much:1
lives:1
perhaps:1
early:1
things,whether:1
call:1
dishonest:1
sun:1
shun:1
melts:1
setting:1
them.Things:1
poverty:1
poorest:1
mean:1
receive:1
find:1
hourss,even:1
thoughts,as:1
rich:1
poor:1
man's:1
cheering:1
great:1
see:1
supporting:1
themselves:1
misgiving.Most:1

 ssh://git@git.coding.net:linliaimeli/FileWord.git

 https://git.coding.net/linliaimeli/FileWord.git

posted @ 2016-09-05 12:35  林莉  阅读(261)  评论(3编辑  收藏  举报