单词统计续

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

package wordcount;
//20173600 王重阳
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
 
/**
 * Console tool that reports letter and word frequencies of a text file.
 *
 * <p>{@link #meau()} shows a three-entry menu and dispatches to {@link #zimu(File)}
 * (letter counts), {@link #danci(File)} (word counts) and {@link #danci2(File)}
 * (the N most frequent words, filtered through a stop-word list and written to a file).
 *
 * <p>Words are maximal runs of ASCII letters, compared case-insensitively.
 */
public class wordcount {
    /** Text file analysed by every menu entry. */
    private static final String INPUT_PATH = "D:/学习/大二选修java/fly.txt";
    /** Stop-word list consulted by menu entry 3. */
    private static final String STOP_WORD_PATH = "D:/学习/大二选修java/nouseword.txt";
    /** File that receives the words reported by menu entry 3. */
    private static final String OUTPUT_PATH = "D:/学习/大二选修java/output.txt";
    /** Charset name used when writing the output file. */
    private static final String OUTPUT_CHARSET = "gbk";

    /** Orders entries by descending count; equal counts are ordered alphabetically by word. */
    private static final Comparator<Map.Entry<String, Integer>> MOST_FREQUENT_FIRST =
            new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                    int byCount = right.getValue().compareTo(left.getValue());
                    return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
                }
            };

    /**
     * Single scanner shared by every prompt. Creating one Scanner per prompt can lose
     * input that an earlier Scanner already buffered, and closing any of them closes
     * System.in for all the others.
     */
    private static Scanner console;

    /** Program entry point: runs the interactive menu. */
    public static void main(String[] args) throws IOException {
        wordcount.meau();
    }

    /**
     * Shows the menu repeatedly and runs the chosen operation on the default input file.
     *
     * <p>The method returns when the input is exhausted, is not an integer, or is a
     * number other than 1, 2 or 3. It loops instead of having every operation call
     * back into the menu, so the call stack no longer grows with each selection.
     *
     * @throws IOException if a word-count operation cannot read or write its files
     */
    public static void meau() throws IOException {
        Scanner in = console();
        while (true) {
            System.out.print("1.查找字母    2.查找单词   3.按单词数查找");
            if (!in.hasNextInt()) {
                return;
            }
            switch (in.nextInt()) {
                case 1:
                    zimu(new File(INPUT_PATH));
                    break;
                case 2:
                    danci(new File(INPUT_PATH));
                    break;
                case 3:
                    danci2(new File(INPUT_PATH));
                    break;
                default:
                    return;
            }
        }
    }

    /**
     * Prints how often each ASCII letter occurs in {@code file}, one {@code letter(count)}
     * line per letter, in descending character order (lower case before upper case).
     * Upper- and lower-case letters are counted separately.
     *
     * <p>I/O failures are reported on standard output instead of being thrown; when
     * reading fails no counts are printed.
     *
     * @param file text file to analyse
     */
    public static void zimu(File file) {
        TreeMap<Character, Integer> counts =
                new TreeMap<Character, Integer>(Collections.reverseOrder());
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            // Count line by line; letters never span a line break, so the whole file
            // does not have to be concatenated first.
            while ((line = reader.readLine()) != null) {
                for (int i = 0; i < line.length(); i++) {
                    char c = line.charAt(i);
                    if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
                        Integer seen = counts.get(c);
                        counts.put(c, seen == null ? 1 : seen + 1);
                    }
                }
            }
            for (Map.Entry<Character, Integer> entry : counts.entrySet()) {
                System.out.println(entry.getKey() + "(" + entry.getValue() + ")  ");
            }
        } catch (IOException e) {
            System.out.println("文件读取错误");
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                System.out.println("文件关闭错误");
            }
        }
    }

    /**
     * Prints every distinct word of {@code file} as {@code word:count}, most frequent
     * first (ties in alphabetical order), followed by the total number of printed
     * occurrences. The words "the" and "and" are neither printed nor counted.
     *
     * @param file text file to analyse
     * @throws IOException if the file cannot be read
     */
    public static void danci(File file) throws IOException {
        int total = 0;
        for (Map.Entry<String, Integer> entry : rankedWords(countWords(readAll(file)))) {
            String word = entry.getKey();
            if (word.equals("the") || word.equals("and")) {
                continue;
            }
            total += entry.getValue();
            System.out.println(word + ":" + entry.getValue());
        }
        System.out.println("单词总数为:" + total);
    }

    /**
     * Asks on the console how many of the most frequent words to examine, then runs
     * {@link #danci2(File, File, File, int)} with the default stop-word and output files.
     *
     * @param file text file to analyse
     * @throws IOException if a file cannot be read or the output file cannot be written
     */
    public static void danci2(File file) throws IOException {
        System.out.print("请输入单词数");
        int num = console().nextInt();
        danci2(file, new File(STOP_WORD_PATH), new File(OUTPUT_PATH), num);
    }

    /**
     * Examines the {@code num} most frequent words of {@code file}. A word listed in
     * {@code stopWordFile} is reported as a stop word and skipped; every other word is
     * printed as {@code word:count} and written to {@code outputFile}, one word per
     * line. Finally the total number of occurrences of the printed words is printed.
     *
     * <p>Stop words still occupy one of the {@code num} examined positions. When
     * {@code num} exceeds the number of distinct words, all words are examined; a
     * non-positive {@code num} examines none. The output file is replaced on every
     * call, so it always reflects the latest run.
     *
     * @param file         text file to analyse
     * @param stopWordFile text file whose words are to be skipped
     * @param outputFile   file that receives the reported words
     * @param num          how many of the most frequent words to examine
     * @throws IOException if a file cannot be read or the output file cannot be written
     */
    public static void danci2(File file, File stopWordFile, File outputFile, int num)
            throws IOException {
        List<Map.Entry<String, Integer>> ranked = rankedWords(countWords(readAll(file)));
        Set<String> stopWords = countWords(readAll(stopWordFile)).keySet();
        // Clamp so that asking for more words than exist does not run off the list.
        int limit = Math.min(num, ranked.size());
        byte[] lineBreak = System.getProperty("line.separator").getBytes(OUTPUT_CHARSET);
        int total = 0;

        // Open the file once for the whole run; reopening it per word would truncate
        // it each time and leave only the last word behind.
        FileOutputStream out = new FileOutputStream(outputFile);
        try {
            for (int i = 0; i < limit; i++) {
                Map.Entry<String, Integer> entry = ranked.get(i);
                String word = entry.getKey();
                if (stopWords.contains(word)) {
                    System.out.println(word + "单词为无用词汇");
                    continue;
                }
                total += entry.getValue();
                System.out.println(word + ":" + entry.getValue());
                out.write(word.getBytes(OUTPUT_CHARSET));
                out.write(lineBreak);
            }
            out.flush();
        } finally {
            out.close();
        }
        System.out.println("单词总数为:" + total);
    }

    /** Returns the shared console scanner, creating it on first use. */
    private static Scanner console() {
        if (console == null) {
            console = new Scanner(System.in);
        }
        return console;
    }

    /** Reads the whole file, keeping a line break between lines so adjacent words stay apart. */
    private static String readAll(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append('\n');
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /** Counts the occurrences of each lower-cased word (run of ASCII letters) in {@code text}. */
    private static Map<String, Integer> countWords(String text) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : text.toLowerCase(Locale.ENGLISH).split("[^a-z]+")) {
            if (word.length() == 0) {
                continue; // split yields an empty first token when the text starts with a separator
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /** Returns the entries of {@code counts} sorted most frequent first, ties alphabetically. */
    private static List<Map.Entry<String, Integer>> rankedWords(Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> ranked =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        Collections.sort(ranked, MOST_FREQUENT_FIRST);
        return ranked;
    }
}

实验题目：

第1步：输出单个文件中的前 N 个最常出现的英语单词。

功能1：输出文件中所有不重复的单词，按照出现次数由多到少排列，出现次数同样多的，以字典序排列。

功能2：指定文件目录，对目录下每一个文件执行统计的操作。

功能3：指定文件目录，递归遍历该目录下所有子目录中的文件，并对每个文件进行单词统计。

功能4：输出出现次数最多的前 n 个单词，

例如，程序提示“统计前多少名”时输入 10，就输出最常出现的前 10 个单词。当没有指明数量的时候，默认列出所有单词的频率。

第2步：支持 stop words

在一本小说里，频率出现最高的单词一般都是 "a", "it", "the", "and", "this", 这些词，可以做一个 stop word 文件（停词表），在统计词汇的时候，跳过这些词。我们把这个文件叫 "stopwords.txt" file.

第三步: 想看看常用的短语是什么，怎么办呢？

先定义短语："两个或多个英语单词，它们之间只有空格分隔". 请看下面的例子：

　　hello world //这是一个短语

　　hello, world //这不是一个短语

同一频率的词组，按照字典序来排列。

第四步：把动词形态都统一之后再计数。

想找到常用的单词和短语，但是发现英语动词经常有时态和语态的变化，导致同一个词，同一个短语却被认为是不同的。怎么解决这个问题呢？

假设我们有这样一个文本文件，这个文件的每一行都是这样构成：

动词原型 动词变形1 动词变形2 ...，词之间用空格分开。

e.g. 动词 TAKE 有下面的各种变形：take takes took taken taking

我们希望在实现上面的各种功能的时候，有一个选项，就是把动词的各种变形都归为它的原型来统计。

功能支持动词形态的归一化

实验截图：

posted @ 2019-05-12 11:05 互联.王阅读(135) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· DeepSeek 开源周回顾「GitHub 热点速览」
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了

公告

温馨提示cute-cnblogs 样式已开源查看一期样式

个人信息

+加关注

昵称：互联.王
园龄： 7年3个月
粉丝： 5
关注： 14

+加关注

麋鹿鲁哟

靡不有初鲜克有终

日历

2025年3月

日

一

二

三

四

五

六

迷人的字符

编程得玩起来，那样才美妙

单词统计续

公告

个人信息

日历

搜索

常用链接

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论

迷人的字符

编程得玩起来，那样才美妙

单词统计 续

公告

个人信息

日历

搜索

常用链接

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论

单词统计续