Java: count the frequency of all words in 7 files, removing stop words and stripping punctuation from the beginning and end of each word, then write the results to WordCount.txt sorted by frequency in descending order
Version 1 (exploratory)
package experiment6.exp4;

/*
 Count the frequency of every word in the 7 files
 "Lincoln, Abraham - The Writings of Abraham Lincoln Volume 1.txt" ~ "... Volume 7.txt".
 Requirements: remove stop words, strip punctuation from the beginning and end of each word,
 and write the results to WordCount.txt sorted by frequency in descending order.
 A HashMap can be used to record the word counts. Writing the file can be done with:
   import java.io.PrintWriter;
   PrintWriter pw = new PrintWriter("data/wordcount.txt");
   pw.write(...);
*/

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

public class WordFrequencyStatistics {
    public static void main(String[] args) {
        List<String> list = new ArrayList<>();
        Map<String, Integer> map = new TreeMap<>();
        // The set elements are Strings, so there is no need to override equals() and hashCode().
        Set<String> setStopWords = new HashSet<>();

        /* Read the data of the input files into list. */
        /* Analyse the file-name structure so the files can be read in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        int no = 1;
        String filenamesPos = ".txt";
        Scanner scanner = null;

        /* Read the stop words: */
        File fileStopWord = new File(filesDirectory + "\\stopwords" + filenamesPos);
        try {
            scanner = new Scanner(fileStopWord);
            while (scanner.hasNextLine()) {
                setStopWords.add(scanner.nextLine());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        /* Read and process each of the 7 files. */
        for (; no < 8; no++) {
            File file = new File(filenamesPre + no + filenamesPos);
            try {
                scanner = new Scanner(file);
                while (scanner.hasNext()) {
                    String s = scanner.next();
                    char tmpCharPos = s.charAt(s.length() - 1);
                    char tmpCharPre = s.charAt(0);
                    /* The text being analysed is fairly messy: without regular expressions some
                       special cases cannot be parsed well, so correct word parsing is the key
                       thing to improve in this program (to be optimised...). */
                    /* Drop an edge character that is neither a letter nor a digit
                       (only one character is stripped from each end here). */
                    if (!Character.isAlphabetic(tmpCharPos) && !Character.isDigit(tmpCharPos)
                            && s.length() > 1) {
                        s = s.substring(0, s.length() - 1);
                    }
                    if (!Character.isAlphabetic(tmpCharPre) && !Character.isDigit(tmpCharPre)
                            && s.length() > 1) {
                        s = s.substring(1);
                    }
                    /* Ignore case when checking stop words: the provided stop-word list is all
                       lower case, so lower-casing the candidate word is enough. (If necessary,
                       the stop words themselves could also be lower-cased when loaded.) */
                    if (!setStopWords.contains(s.toLowerCase())) {
                        list.add(s);
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }

        /* Count word frequencies with the map: */
        for (String x : list) {
            if (!map.containsKey(x)) {
                map.put(x, 1);
            } else {
                map.put(x, map.get(x) + 1);
            }
        }

        /* Observe the result: */
        System.out.println("observation");
        /* One of the two usual ways to traverse a map:
           for (Map.Entry<String, Integer> x : map.entrySet()) { System.out.println(x); } */

        /* Convert the map into a sortable list of (word, count) tuples and sort by count, descending: */
        List<Tuple> listTuples = new ArrayList<>();
        for (String x : map.keySet()) {
            listTuples.add(new Tuple(x, map.get(x)));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return o2.getValue() - o1.getValue();
            }
        });
        for (Tuple x : listTuples) {
            System.out.println(x);
        }
    }
}
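The comments in version 1 admit that stripping only one punctuation character per end, and avoiding regular expressions, leaves some tokens badly parsed. As a point of comparison only, here is a minimal regex-based sketch of that edge trimming; the class and method names are made up for illustration and are not part of the assignment.

// A hypothetical helper, not part of the original program: trim every leading and
// trailing non-alphanumeric character from a token with one regular expression.
public class EdgeTrimDemo {
    static String trimEdges(String token) {
        return token.replaceAll("^[^A-Za-z0-9]+|[^A-Za-z0-9]+$", "");
    }

    public static void main(String[] args) {
        System.out.println(trimEdges("\"(liberty),"));       // liberty
        System.out.println(trimEdges("1861."));              // 1861
        System.out.println(trimEdges("self-government;"));   // self-government
    }
}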
package experiment6.exp4;

public class Tuple {
    String string;
    int num;

    public Tuple(String string, int num) {
        this.string = string;
        this.num = num;
    }

    public String getKey() {
        return string;
    }

    public int getValue() {
        return num;
    }

    @Override
    public String toString() {
        return getKey() + "\t" + getValue();
    }
}
Version 2
package experiment6.exp4;

/*
 Same task as version 1: count the frequency of every word in the 7 Lincoln volumes,
 remove stop words, strip punctuation from both ends of each word, and write the results
 to WordCount.txt sorted by frequency in descending order.
 A HashMap can be used to record the word counts; the file can be written with:
   import java.io.PrintWriter;
   PrintWriter pw = new PrintWriter("data/wordcount.txt");
   pw.write(...);
*/

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

public class WordFrequencyStatistics {
    public static void main(String[] args) {
        List<String> list = new ArrayList<>();
        Map<String, Integer> map = new TreeMap<>();
        // The set elements are Strings, so there is no need to override equals() and hashCode().
        Set<String> setStopWords = new HashSet<>();

        /* Read the data of the input files into list. */
        /* Analyse the file-name structure so the files can be read in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        int no = 1;
        String filenamesPos = ".txt";
        Scanner scanner = null;

        /* Read the stop words: */
        File fileStopWord = new File(filesDirectory + "\\stopwords" + filenamesPos);
        try {
            scanner = new Scanner(fileStopWord);
            while (scanner.hasNextLine()) {
                setStopWords.add(scanner.nextLine());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        /* Read and process each of the 7 files. */
        for (; no < 8; no++) {
            File file = new File(filenamesPre + no + filenamesPos);
            try {
                scanner = new Scanner(file);
                while (scanner.hasNextLine()) {
                    String s = scanner.nextLine();
                    // The second argument lists the delimiters used to split the line into words.
                    StringTokenizer tokenizer = new StringTokenizer(s, " #*-,.!:;\"$()[]\\&?");
                    String str;
                    while (tokenizer.hasMoreElements()) {
                        str = (String) tokenizer.nextElement();
                        /* Ignore case when checking stop words: the provided stop-word list is all
                           lower case, so lower-casing the candidate word is enough. (If necessary,
                           the stop words themselves could also be lower-cased when loaded.) */
                        if (!setStopWords.contains(str.toLowerCase())) {
                            list.add(str);
                        }
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }

        /* Count word frequencies with the map: */
        for (String x : list) {
            if (!map.containsKey(x)) {
                map.put(x, 1);
            } else {
                map.put(x, map.get(x) + 1);
            }
        }

        /* Observe the result: */
        System.out.println("observation");

        /* Convert the map into a sortable list of (word, count) tuples and sort by count, descending: */
        List<Tuple> listTuples = new ArrayList<>();
        for (String x : map.keySet()) {
            listTuples.add(new Tuple(x, map.get(x)));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return o2.getValue() - o1.getValue();
            }
        });

        /* Write the result to the output file: */
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(filesDirectory + "/data/wordcount.txt");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        for (Tuple x : listTuples) {
            pw.write(x.toString());
        }
        pw.close(); // flush and close, otherwise the buffered output may never reach the file
    }
}
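Version 2 writes through a PrintWriter that has to be closed so the buffered output actually reaches wordcount.txt. Below is a small sketch, assuming the same Tuple class and a sorted listTuples as in the code above, of the same write step using try-with-resources; the output path parameter is only an example.

import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.List;

public class WriteCountsDemo {
    // Writes one "word<TAB>count" entry per tuple; the path argument is just an example.
    static void writeCounts(List<Tuple> listTuples, String path) {
        try (PrintWriter pw = new PrintWriter(path)) {   // closed and flushed automatically
            for (Tuple x : listTuples) {
                pw.write(x.toString());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}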
package experiment6.exp4;

public class Tuple {
    String string;
    int num;

    public Tuple(String string, int num) {
        this.string = string;
        this.num = num;
    }

    public String getKey() {
        return string;
    }

    public int getValue() {
        return num;
    }

    @Override
    public String toString() {
        return getKey() + "\t" + getValue() + "\n";
    }
}
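The Tuple class is essentially a (key, value) pair, so the standard library's Map.Entry could play the same role and the extra class could be dropped. The following sketch is my own variation, not the assignment's required approach; the sample words and counts are invented.

import java.util.*;

public class EntryAsTupleDemo {
    public static void main(String[] args) {
        Map<String, Integer> map = new HashMap<>();
        map.put("union", 42);    // invented sample data
        map.put("people", 17);

        // Copy the entries into a list and sort by count, descending.
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(map.entrySet());
        entries.sort((a, b) -> b.getValue() - a.getValue());

        for (Map.Entry<String, Integer> e : entries) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}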
Version 3 (mainly showing an alternative way of filtering out stop words)
package experiment6.exp4;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

public class WordFrequencyStatistics {
    public static void main(String[] args) {
        List<String> list = new ArrayList<>();
        Map<String, Integer> map = new TreeMap<>();
        // The set elements are Strings, so there is no need to override equals() and hashCode().
        Set<String> stopWordsSet = new HashSet<>();

        /* Read the data of the input files into list. */
        /* Analyse the file-name structure so the files can be read in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        int no = 1;
        String filenamesPos = ".txt";
        Scanner scanner;

        /* Read the stop words: */
        File fileStopWord = new File(filesDirectory + "\\stopwords" + filenamesPos);
        try {
            scanner = new Scanner(fileStopWord);
            while (scanner.hasNextLine()) {
                stopWordsSet.add(scanner.nextLine());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        /* Read and process each of the 7 files. */
        for (; no < 8; no++) {
            File file = new File(filenamesPre + no + filenamesPos);
            try {
                scanner = new Scanner(file);
                while (scanner.hasNextLine()) {
                    String s = scanner.nextLine();
                    // The second argument lists the delimiters used to split the line into words.
                    StringTokenizer tokenizer = new StringTokenizer(s, " #*-,.!:;\"$()[]\\&?");
                    String str;
                    /* Filter out stop words and collect the remaining words into list. */
                    while (tokenizer.hasMoreElements()) {        // all words of this line are held by tokenizer
                        str = (String) tokenizer.nextElement();  // fetch the words one by one

                        /* Stop-word filtering, method 1: the provided stop words are all lower case,
                           so lower-casing the candidate word makes the check case-insensitive.
                           (If necessary, the stop words could also be lower-cased when loaded.) */
                        if (!stopWordsSet.contains(str.toLowerCase())) {
                            list.add(str);
                        }

                        /* Method 2 (not recommended: a linear scan is far slower than the HashSet lookup):
                        boolean isEliminate = false;
                        for (String x : stopWordsSet) {
                            if (str.compareToIgnoreCase(x) == 0) {
                                isEliminate = true;
                                break;
                            }
                        }
                        if (!isEliminate)
                            list.add(str);
                        */
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }

        /* Count word frequencies with the map: */
        for (String x : list) {
            if (!map.containsKey(x)) {
                map.put(x, 1);
            } else {
                map.put(x, map.get(x) + 1);
            }
        }

        /* Observe the result: */
        System.out.println("observation");

        /* Convert the map into a sortable list of (word, count) tuples and sort by count, descending: */
        List<Tuple> listTuples = new ArrayList<>();
        for (String x : map.keySet()) {
            listTuples.add(new Tuple(x, map.get(x)));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return o2.getValue() - o1.getValue();
            }
        });

        /* Print the result: */
        for (Tuple x : listTuples) {
            System.out.println(x);
        }

        /* Write the result to a file (disabled in this version):
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(filesDirectory + "/data/wordcount.txt");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        for (Tuple x : listTuples) {
            pw.write(x.toString());
        }
        pw.close();
        */
    }
}
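The comments in version 3 note that the stop-word file is assumed to be all lower case, and that the stop words themselves could also be lower-cased to make the check fully case-insensitive. A small sketch of that normalisation while loading the file is given below; the class and method names are hypothetical.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

public class StopWordLoaderDemo {
    // Loads the stop-word file, trimming and lower-casing each line so the later
    // contains() check is case-insensitive no matter how the file is written.
    static Set<String> loadStopWords(File file) throws FileNotFoundException {
        Set<String> stopWords = new HashSet<>();
        Scanner scanner = new Scanner(file);
        while (scanner.hasNextLine()) {
            stopWords.add(scanner.nextLine().trim().toLowerCase());
        }
        scanner.close();
        return stopWords;
    }
}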
Version 4 (using a StringBuffer to build the string to be written)
package experiment6.exp4;

/*
 Same task as before: count the frequency of every word in the 7 Lincoln volumes,
 remove stop words, strip punctuation from both ends of each word, and write the results
 to WordCount.txt sorted by frequency in descending order.
 A HashMap can be used to record the word counts; the file can be written with:
   import java.io.PrintWriter;
   PrintWriter pw = new PrintWriter("data/wordcount.txt");
   pw.write(...);
*/

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.*;

public class WordFrequencyStatistics {
    public static void main(String[] args) {
        List<String> list = new ArrayList<>();
        Map<String, Integer> map = new TreeMap<>();
        // The set elements are Strings, so there is no need to override equals() and hashCode().
        Set<String> stopWordsSet = new HashSet<>();

        /* Read the data of the input files into list. */
        /* Analyse the file-name structure so the files can be read in a loop. */
        String filesDirectory = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment6";
        String filenamesPre = filesDirectory + "\\Lincoln, Abraham - The Writings of Abraham Lincoln Volume ";
        int no = 1;
        String filenamesPos = ".txt";
        Scanner scanner;

        /* Read the stop words: */
        File fileStopWord = new File(filesDirectory + "\\stopwords" + filenamesPos);
        try {
            scanner = new Scanner(fileStopWord);
            while (scanner.hasNextLine()) {
                stopWordsSet.add(scanner.nextLine());
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        /* Read and process each of the 7 files. */
        for (; no < 8; no++) {
            /* The File object is declared inside the loop: although the variable is always named
               file, each iteration creates a distinct File instance via new. */
            File file = new File(filenamesPre + no + filenamesPos);
            try {
                scanner = new Scanner(file);
                while (scanner.hasNextLine()) {
                    String s = scanner.nextLine();
                    // The second argument lists the delimiters used to split the line into words.
                    StringTokenizer tokenizer = new StringTokenizer(s, " #*-,.!:;\"$()[]\\&?");
                    String str;
                    /* Filter out stop words and collect the remaining words into list. */
                    while (tokenizer.hasMoreElements()) {
                        str = (String) tokenizer.nextElement();
                        /* Stop-word filtering (method 1): the provided stop words are all lower case,
                           so lower-casing the candidate word makes the check case-insensitive. */
                        if (!stopWordsSet.contains(str.toLowerCase())) {
                            list.add(str);
                        }
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }

        /* Count word frequencies with the map: */
        for (String x : list) {
            if (!map.containsKey(x)) {
                map.put(x, 1);
            } else {
                map.put(x, map.get(x) + 1);
            }
        }

        /* Convert the map into a sortable list of (word, count) tuples and sort by count, descending: */
        List<Tuple> listTuples = new ArrayList<>();
        for (String x : map.keySet()) {
            listTuples.add(new Tuple(x, map.get(x)));
        }
        Collections.sort(listTuples, new Comparator<Tuple>() {
            @Override
            public int compare(Tuple o1, Tuple o2) {
                return o2.getValue() - o1.getValue();
            }
        });

        /* Write the result to a file. Create the writer: */
        PrintWriter pw = null;
        try {
            // pw = new PrintWriter(filesDirectory + "/data/wordcount.txt");
            pw = new PrintWriter("C://users//xuchaoxin//desktop//wordcount.txt");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        /* Writing, variant 1:
        for (Tuple x : listTuples) {
            pw.write(x.toString());
        }
        */

        /* Writing, variant 2: build the whole output with a StringBuffer first. */
        StringBuffer sb = new StringBuffer();
        for (Tuple x : listTuples) {
            sb.append(x.toString());
        }
        // System.out.println(sb);  // would print sb.toString()
        pw.write(sb.toString());
        pw.close(); // flush and close, otherwise the buffered output may never reach the file
    }
}
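Version 4 builds the whole output string with a StringBuffer before writing it. In single-threaded code like this, StringBuilder offers the same append API without synchronisation and is the usual choice; the sketch below is only an illustration and assumes the same Tuple type as above.

import java.util.List;

public class OutputBuilderDemo {
    // Concatenates every "word<TAB>count\n" entry into one string before it is written out.
    static String buildOutput(List<Tuple> listTuples) {
        StringBuilder sb = new StringBuilder();  // same append API as StringBuffer, no locking
        for (Tuple x : listTuples) {
            sb.append(x.toString());
        }
        return sb.toString();
    }
}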