统计文章中字母、单词出现的频率
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/*
* 统计一篇文章中各字母出现的频率
*/
/**
 * Mutable pair of a text key and an occurrence count.
 *
 * <p>Both fields are package-visible and can also be reached through the
 * accessor methods below.
 */
class entity {

    /** The text key (the counted letter, stored as a string). */
    String zimu;

    /** The number of occurrences recorded for {@link #zimu}. */
    int cishu;

    /**
     * Creates a pair from the given key and count.
     *
     * @param zimu  the text key
     * @param cishu the occurrence count
     */
    public entity(String zimu, int cishu) {
        this.zimu = zimu;
        this.cishu = cishu;
    }

    /** @return the text key */
    public String getZimu() {
        return this.zimu;
    }

    /** @param zimu the new text key */
    public void setZimu(String zimu) {
        this.zimu = zimu;
    }

    /** @return the occurrence count */
    public int getCishu() {
        return this.cishu;
    }

    /** @param cishu the new occurrence count */
    public void setCishu(int cishu) {
        this.cishu = cishu;
    }
}
/**
 * Counts how often each character occurs in a text file and prints the
 * characters in descending order of their count, together with each
 * character's share of the total.
 */
public class ZimuCollect {

    /** File analysed by {@link #collect()}, resolved against the working directory. */
    private static final String DEFAULT_FILE = "file.txt";

    /**
     * Analyses {@code file.txt}.
     *
     * <p>If the file cannot be opened, the stack trace of the
     * {@link FileNotFoundException} is printed and the method returns
     * without producing a report.
     *
     * @throws IOException if reading the file fails after it has been opened
     */
    public static void collect() throws IOException {
        try {
            collect(DEFAULT_FILE);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Analyses the file at {@code path} and prints one line per distinct
     * character to standard output.
     *
     * <p>Characters are lower-cased before counting. The printed frequency is
     * the character's count divided by the total number of characters read
     * (line terminators are not included because the file is read line by line).
     *
     * <p>NOTE(review): every character of a line is counted, including spaces,
     * digits and punctuation, although the report labels each entry as a
     * letter - confirm whether non-letters should be skipped.
     *
     * @param path path of the file to analyse
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading the file fails
     */
    public static void collect(String path) throws IOException {
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        int total = 0;

        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                total += line.length();
                for (int i = 0; i < line.length(); i++) {
                    String key = String.valueOf(Character.toLowerCase(line.charAt(i)));
                    counts.merge(key, 1, Integer::sum);
                }
            }
        }

        // Stable sort by descending count; ties keep the map's iteration order.
        List<String> ordered = new ArrayList<>(counts.keySet());
        ordered.sort((a, b) -> Integer.compare(counts.get(b), counts.get(a)));

        for (String key : ordered) {
            int occurrences = counts.get(key);
            System.out.println("字母" + key + "在文章中出现" + occurrences + "次,其频率为"
                    + String.format("%.2f", occurrences * 1.0 / total));
        }
    }

    /**
     * Runs the analysis on {@code file.txt}.
     *
     * @param args ignored
     * @throws IOException if reading the file fails
     */
    public static void main(String[] args) throws IOException {
        collect();
    }
}
统计文章中字母出现频率的思路:先用 FileReader fr = new FileReader("file.txt"); BufferedReader br = new BufferedReader(fr); 按行读取文件,对每次读取到的一行都进行统计。统计时使用 HashMap<String, Integer> map = new HashMap<String, Integer>();,其中 String 代表出现的字母,Integer 代表该字母出现的次数;每读到一个字母,先用 map.get() 判断它是否已经出现过,如果已经出现过,就把对应的次数加 1,否则记为 1;最后遍历 map 输出结果即可。
package 统计英文字母出现频率;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;

/** A word together with the number of times it was counted. */
class entity11 {

    /** The word. */
    String danci;

    /** Number of occurrences counted for the word. */
    int cishu;

    /**
     * @param zimu  the word
     * @param cishu the number of occurrences
     */
    public entity11(String zimu, int cishu) {
        this.danci = zimu;
        this.cishu = cishu;
    }

    /** @return the word */
    public String getDanci() {
        return danci;
    }

    /** @return the number of occurrences */
    public int getCishu() {
        return cishu;
    }
}

/**
 * Counts the words of {@code piao.txt}, leaving out the words listed in
 * {@code judge.txt}, and prints every remaining word with its count and its
 * share of all counted words, most frequent first.
 */
public class DanciCollect {

    /** Text whose words are counted. */
    private static final String TEXT_FILE = "piao.txt";

    /** Word list, one word per line, of words that are left out of the count. */
    private static final String STOP_WORD_FILE = "judge.txt";

    /** Punctuation that is replaced by a blank before the text is split into words. */
    private static final String[] PUNCTUATION =
            {".", ",", "?", "!", ":", "‘", "’", "“", "”", "—", ";", "-"};

    /**
     * Checks a word against {@code judge.txt}.
     *
     * @param str the word to look up; compared with each line exactly
     *            (case-sensitive, no trimming)
     * @return {@code false} if some line of {@code judge.txt} equals
     *         {@code str}, {@code true} otherwise
     * @throws IOException if {@code judge.txt} cannot be opened or read
     */
    public static boolean judgeNouse(String str) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(STOP_WORD_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (str.equals(line)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Lower-cases a string one {@code char} at a time with
     * {@link Character#toLowerCase(char)}.
     *
     * @param str the string to convert
     * @return the converted string
     */
    public static String toLowerCase(String str) {
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Reads {@code piao.txt}, counts its words and prints the report.
     *
     * <p>If either input file cannot be opened, the stack trace of the
     * {@link FileNotFoundException} is printed and no report is produced.
     *
     * @throws IOException if reading one of the files fails after it was opened
     */
    public static void collect1() throws IOException {
        try {
            String text = readText(new File(TEXT_FILE));
            Set<String> stopWords = loadStopWords();
            Map<String, Integer> counts = countWords(text, stopWords);

            int total = 0;
            List<entity11> result = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                result.add(new entity11(entry.getKey(), entry.getValue()));
                total += entry.getValue();
            }
            // Stable sort: words with equal counts stay in alphabetical (TreeMap) order.
            result.sort((entity11 e1, entity11 e2) -> Integer.compare(e2.getCishu(), e1.getCishu()));

            System.out.println("文章共计" + total + "个单词");
            for (entity11 item : result) {
                System.out.println(item.getDanci() + "在文章中出现" + item.getCishu() + "次,其频率为"
                        + String.format("%.2f", item.getCishu() * 1.0 / total));
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Reads the whole file into one string, joining the lines with a single blank. */
    private static String readText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // The blank keeps the last word of a line apart from the first word of the next.
                text.append(line).append(' ');
            }
        }
        return text.toString();
    }

    /** Loads every line of {@code judge.txt} into a set, so the file is read only once. */
    private static Set<String> loadStopWords() throws IOException {
        Set<String> words = new HashSet<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(STOP_WORD_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        }
        return words;
    }

    /**
     * Splits {@code text} into lower-cased words and counts those that are
     * not contained in {@code stopWords}; the result is sorted by word.
     */
    private static Map<String, Integer> countWords(String text, Set<String> stopWords) {
        String cleaned = text;
        for (String mark : PUNCTUATION) {
            cleaned = cleaned.replace(mark, " ");
        }
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String token : cleaned.split("\\s+")) {
            String word = toLowerCase(token);
            // Skip the empty tokens produced by leading blanks, and skip listed words.
            // The previous code tested !judgeNouse(word), which kept ONLY the listed words.
            if (word.isEmpty() || stopWords.contains(word)) {
                continue;
            }
            counts.merge(word, 1, Integer::sum);
        }
        return counts;
    }

    /**
     * Runs the word count.
     *
     * @param args ignored
     * @throws IOException if reading one of the input files fails
     */
    public static void main(String args[]) throws IOException {
        collect1();
    }
}
统计文章中单词出现的频率思路:File file1 = new File("piao.txt");//定义一个file对象,用来初始化FileReader
FileReader reader1 = new FileReader(file1);//定义一个fileReader对象,用来初始化BufferedReader
BufferedReader bReader1 = new BufferedReader(reader1);//new一个BufferedReader对象,将文件内容读取到缓存。然后按行读取文件中的内容,用 sb1.append(s1); 将每次读取到的内容追加到缓存,这样文件中的所有内容就全部存放进了缓存,再用 String text = sb1.toString(); 将缓存内容转换成字符串。接着把 String[] array = {".",",","?","!",":","‘","’","“","”","—",";","-"}; 中列出的特殊符号全部用空格代替,这样文本就变成了一个个以空格间隔的单词;用 text.split(" "); 将其分割成一个个单词并存进字符串数组,最后同样用 map 从头遍历数组,统计并存放每个单词出现的次数。
package 统计英文字母出现频率;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;

/** A word together with the number of times it was counted. */
class entity111 {

    /** The word. */
    String danci;

    /** Number of occurrences counted for the word. */
    int cishu;

    /**
     * @param zimu  the word
     * @param cishu the number of occurrences
     */
    public entity111(String zimu, int cishu) {
        this.danci = zimu;
        this.cishu = cishu;
    }

    /** @return the word */
    public String getDanci() {
        return danci;
    }

    /** @return the number of occurrences */
    public int getCishu() {
        return cishu;
    }
}

/**
 * Counts the words of {@code piao.txt}, leaving out the words listed in
 * {@code judge.txt}, then asks for a number N on standard input and prints
 * the N most frequent remaining words with their counts and shares.
 */
public class NDanciCollect {

    /** Text whose words are counted. */
    private static final String TEXT_FILE = "piao.txt";

    /** Word list, one word per line, of words that are left out of the count. */
    private static final String STOP_WORD_FILE = "judge.txt";

    /** Punctuation that is replaced by a blank before the text is split into words. */
    private static final String[] PUNCTUATION =
            {".", ",", "?", "!", ":", "‘", "’", "“", "”", "—", ";", "-"};

    /**
     * Checks a word against {@code judge.txt}.
     *
     * @param str the word to look up; compared with each line exactly
     *            (case-sensitive, no trimming)
     * @return {@code false} if some line of {@code judge.txt} equals
     *         {@code str}, {@code true} otherwise
     * @throws IOException if {@code judge.txt} cannot be opened or read
     */
    public static boolean judgeNouse(String str) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(STOP_WORD_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (str.equals(line)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Lower-cases a string one {@code char} at a time with
     * {@link Character#toLowerCase(char)}.
     *
     * @param str the string to convert
     * @return the converted string
     */
    public static String toLowerCase(String str) {
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Reads {@code piao.txt}, counts its words, prompts for N and prints the
     * N most frequent words.
     *
     * <p>N is clamped to the number of distinct words, so asking for more
     * words than exist prints all of them instead of failing. If either input
     * file cannot be opened, the stack trace of the
     * {@link FileNotFoundException} is printed and no report is produced.
     *
     * @throws IOException if reading one of the files fails after it was opened
     */
    public static void collect11() throws IOException {
        try {
            String text = readText(new File(TEXT_FILE));
            Set<String> stopWords = loadStopWords();
            Map<String, Integer> counts = countWords(text, stopWords);

            int total = 0;
            List<entity111> result = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                result.add(new entity111(entry.getKey(), entry.getValue()));
                total += entry.getValue();
            }
            // Stable sort: words with equal counts stay in alphabetical (TreeMap) order.
            result.sort((entity111 e1, entity111 e2) -> Integer.compare(e2.getCishu(), e1.getCishu()));

            System.out.println("文章共计" + total + "个单词");
            System.out.println("请输入要输出出现频率最高的前N个单词------请输入N的值:");
            // The scanner is deliberately left open: closing it would close System.in.
            Scanner scan = new Scanner(System.in);
            int top = scan.nextInt();
            int limit = Math.max(0, Math.min(top, result.size()));
            for (int ii = 0; ii < limit; ii++) {
                entity111 item = result.get(ii);
                System.out.println(item.getDanci() + "在文章中出现" + item.getCishu() + "次,其频率为"
                        + String.format("%.2f", item.getCishu() * 1.0 / total));
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Reads the whole file into one string, joining the lines with a single blank. */
    private static String readText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // The blank keeps the last word of a line apart from the first word of the next.
                text.append(line).append(' ');
            }
        }
        return text.toString();
    }

    /** Loads every line of {@code judge.txt} into a set, so the file is read only once. */
    private static Set<String> loadStopWords() throws IOException {
        Set<String> words = new HashSet<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(STOP_WORD_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        }
        return words;
    }

    /**
     * Splits {@code text} into lower-cased words and counts those that are
     * not contained in {@code stopWords}; the result is sorted by word.
     */
    private static Map<String, Integer> countWords(String text, Set<String> stopWords) {
        String cleaned = text;
        for (String mark : PUNCTUATION) {
            cleaned = cleaned.replace(mark, " ");
        }
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String token : cleaned.split("\\s+")) {
            String word = toLowerCase(token);
            // Skip the empty tokens produced by leading blanks, and skip listed words.
            // The previous code tested !judgeNouse(word), which kept ONLY the listed words.
            if (word.isEmpty() || stopWords.contains(word)) {
                continue;
            }
            counts.merge(word, 1, Integer::sum);
        }
        return counts;
    }

    /**
     * Runs the top-N word count.
     *
     * @param args ignored
     * @throws IOException if reading one of the input files fails
     */
    public static void main(String args[]) throws IOException {
        collect11();
    }
}
输出频率最高的前几个单词,并且去掉文中一些常见单词。这里只是追加了一个方法:在存进 map 之前先用 judgeNouse() 函数判断,不是常见单词才存进 map 里;同样用 map 来统计单词及其出现的次数,最后将 map 转换成 List,用 sort 函数按照次数这个属性进行排序,这样便可按照用户的意愿输出频率最高的前 N 个单词。