统计单词,字母出现的次数和频率
一、统计所给出文件中英文字母出现的频率(区分大小写),并且按着出现频率倒序输出
思路:将文件用BufferedReader读取
对每行进行读取在进行分割成单词
对单词进行循环判断是否在A-Z,a-z之间,若在存储到数组里计数
最终进行排序
package com.wenjian; import java.util.Scanner; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.io.*; public class hali { public static <type> void main (String[] args) throws IOException { File file=new File("E:\\Harry Potter and the Sorcerer's Stone.txt"); //��ȡ�ļ� if(!file.exists()){ System.out.println("文件打不开"); return; } Scanner scanner=new Scanner(file); BufferedReader buf = new BufferedReader(new FileReader(file)); int []num=new int[100];//计数数组 char []zimu=new char[100];//字母表数组 char a='A';char b='a'; for(int i=1;i<=52;i++) { if(i<=26) zimu[i]=a++; else zimu[i]=b++; } //计数 String s=buf.readLine(); while(s!=null) { String[] lineWords=s.split(" "); for(int i=0;i<lineWords.length;i++) { for(int j=0;j<lineWords[i].length();j++) { if(lineWords[i].charAt(j)>='A'&&lineWords[i].charAt(j)<='Z') num[lineWords[i].charAt(j)-'A'+1]++; else if(lineWords[i].charAt(j)>='a'&&lineWords[i].charAt(j)<='z') num[lineWords[i].charAt(j)-'a'+1+24]++; } } s=buf.readLine(); } //求总次数 int sum=0; for(int i=1;i<=52;i++) { sum+=num[i]; } //排序 for(int i=1;i<=52;i++) { for(int j=i+1;j<=52;j++) { if(num[i]<num[j]) { int t=num[i]; num[i]=num[j]; num[j]=t; char p=zimu[i]; zimu[i]=zimu[j]; zimu[j]=p; } } } System.out.println(sum); for(int i=1;i<=52;i++) { double ans=num[i]*1.0/sum*100; System.out.println(zimu[i]+":"+String.format("%.2f", ans)+"%"); } } }
二、输出单个文件的前N个最常出现的英文单词
思路:用输入流读取文件
利用HashMap<String,Integer>来存储单词和计数
利用split进行空格分割
然后再输出前n个单词频率高的
package com.wenjian; import java.io.File; import java.util.Scanner; import java.io.FileNotFoundException; import java.util.HashMap; import java.util.Iterator; import java.util.Set; public class nword { public static <type> void main (String[] args) throws FileNotFoundException { File file=new File("E:\\Harry Potter and the Sorcerer's Stone.txt"); //��ȡ�ļ� if(!file.exists()){ System.out.println("未找到文件"); return; } Scanner in=new Scanner(System.in); System.out.println("请输入前n个常用单词"); int n=in.nextInt(); Scanner scanner=new Scanner(file); HashMap<String,Integer> hashMap=new HashMap<String,Integer>(); while(scanner.hasNextLine()) { String line=scanner.nextLine(); String[] lineWords=line.split(" "); Set<String> wordSet=hashMap.keySet(); for(int i=0;i<lineWords.length;i++) { if(wordSet.contains(lineWords[i])) {//判断set集合里是否有该单词 Integer number=hashMap.get(lineWords[i]);//若有次数+1 number++; hashMap.put(lineWords[i], number); } else {//没有就将其放入set里,次数为1 hashMap.put(lineWords[i], 1); } } } //计算总体单词数 int sum=0; Iterator<String> it=hashMap.keySet().iterator(); while(it.hasNext()){ sum+=hashMap.get(it.next()); } //输出前n个单词 while(n>0) { Iterator<String> iterator=hashMap.keySet().iterator(); int max=0; String maxword=null; while(iterator.hasNext()){ String word=iterator.next(); if(hashMap.get(word)>max) { max=hashMap.get(word); maxword=word; } } hashMap.remove(maxword); double ans=max*1.0/sum*100; if(!maxword.equals("")) { System.out.println(maxword+":"+max); n--; } } } }
三:指定文件目录,但是会递归目录下的所有子目录,每个文件执行统计单词的数量以及百分比
思路:首先获取目录路径
对目录下的目录进行判断,如果还是目录继续递归,否则就输出该文档里的单词
package com.wenjian; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.util.Scanner; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.Reader; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashMap; import java.util.Iterator; import java.util.Set; public class neng2 { //输出文档单词 static public void print(String f) throws IOException { File file=new File(f); if(!file.exists()){ System.out.println("无法打开文件"); return; } BufferedReader buf = new BufferedReader(new FileReader(file)); HashMap<String,Integer> hashMap=new HashMap<String,Integer>(); String s; while((s=buf.readLine())!=null) { String[] lineWords=s.split(" "); Set<String> wordSet=hashMap.keySet(); for(int i=0;i<lineWords.length;i++) { if(wordSet.contains(lineWords[i])) { Integer number=hashMap.get(lineWords[i]); number++; hashMap.put(lineWords[i], number); } else { hashMap.put(lineWords[i], 1); } } } while(hashMap.size()>0) { Iterator<String> iterator=hashMap.keySet().iterator(); int max=0; String maxword=null; while(iterator.hasNext()){ String word=iterator.next(); if(hashMap.get(word)>max) { max=hashMap.get(word); maxword=word; } else if(hashMap.get(word)==max) { if(word.compareTo(maxword)<0) { maxword=word; } } } hashMap.remove(maxword); if(!maxword.equals("")) System.out.println(maxword+":"+max+" "); } } //递归文件 static public void getDirectory(File file) throws IOException { File flist[] = file.listFiles(); if (flist == null || flist.length == 0) { return; } for (File f : flist) { if (f.isDirectory()) { getDirectory(f); } else { System.out.println("file==>" + f.getAbsolutePath()); print( f.getAbsolutePath()); System.out.println(); } } } static Scanner sc=new Scanner(System.in); public static void main(String[] args)throws IOException { String path="D:\\test"; File fm=new File(path); getDirectory(fm); } }