词汇统计
需求概要:
读取文本文件中的一段英文,以常见标点、空格以及换行符作为分隔符,将段落分割为单词;
统计每个英文单词在段落中出现的次数,并按出现次数排序后输出结果。
分析:
1.对文本文件进行读取。
2.将文本文件中的英文段落分割为单词,通过 Map 统计每个单词出现的次数,再将 HashMap 的键值对放入数组中按出现次数排序,最后输出结果。
部分代码如下:
/**
 * Counts how often each English word occurs in {@code work.txt} and reports the
 * result, most frequent word first, on standard output and in {@code count.txt}.
 *
 * <p>A "word" is a maximal run of ASCII letters; every other character
 * (digits, punctuation, whitespace, line breaks) acts as a separator. Words are
 * compared case-insensitively and reported in lower case.
 *
 * <p>Classes that the original snippet did not use are referenced by their
 * fully qualified names so that this class does not depend on additional
 * import statements.
 */
public class WordSearchs {

    /** Text file that is analysed. */
    private static final String INPUT_FILE = "work.txt";

    /** File that receives one {@code word=count} line per distinct word. */
    private static final String OUTPUT_FILE = "count.txt";

    /**
     * Reads {@code work.txt}, counts the words and emits the sorted statistics.
     *
     * <p>I/O problems are reported on standard error instead of being allowed
     * to surface later as a {@code NullPointerException}. If the input cannot
     * be read nothing is printed or written; if the output file cannot be
     * written the console output is still produced.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String text;
        try {
            text = readText(INPUT_FILE);
        } catch (java.io.IOException e) {
            System.err.println("Cannot read " + INPUT_FILE + ": " + e);
            return;
        }

        java.util.List<Map.Entry<String, Integer>> ranking =
                sortByCountDescending(countWords(text));

        for (Map.Entry<String, Integer> entry : ranking) {
            System.out.println(entry.getKey() + "=" + entry.getValue());
        }

        // try-with-resources flushes and closes the writer even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            for (Map.Entry<String, Integer> entry : ranking) {
                writer.write(entry.getKey() + "=" + entry.getValue());
                writer.newLine();
            }
        } catch (java.io.IOException e) {
            System.err.println("Cannot write " + OUTPUT_FILE + ": " + e);
        }
    }

    /**
     * Reads the whole file into one string, replacing each line break by a space.
     *
     * @param path file to read
     * @return the file content; empty if the file is empty
     * @throws java.io.IOException if the file cannot be opened or read
     */
    private static String readText(String path) throws java.io.IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator; add a separator so the
                // last word of this line does not fuse with the first of the next.
                text.append(line).append(' ');
            }
        }
        return text.toString();
    }

    /**
     * Splits the text into lower-case words and counts each distinct word.
     *
     * @param text raw text, may be empty
     * @return map from word to its number of occurrences
     */
    private static Map<String, Integer> countWords(String text) {
        // Locale.ROOT keeps the mapping locale-independent (e.g. "I" -> "i" even
        // when the default locale is Turkish).
        String lettersOnly = text.toLowerCase(java.util.Locale.ROOT)
                .replaceAll("[^a-z]", " ");

        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : lettersOnly.split("\\s+")) {
            if (word.isEmpty()) {
                // split() yields an empty first token when the text starts with
                // a separator (or is empty); that is not a word.
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /**
     * Orders the entries by descending count; words with the same count are
     * ordered alphabetically so that the output is deterministic.
     *
     * @param counts word statistics
     * @return the entries of {@code counts} in output order
     */
    private static java.util.List<Map.Entry<String, Integer>> sortByCountDescending(
            Map<String, Integer> counts) {
        java.util.List<Map.Entry<String, Integer>> entries =
                new java.util.ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        java.util.Collections.sort(entries,
                new java.util.Comparator<Map.Entry<String, Integer>>() {
                    @Override
                    public int compare(Map.Entry<String, Integer> left,
                                       Map.Entry<String, Integer> right) {
                        int byCount = Integer.compare(right.getValue(), left.getValue());
                        return byCount != 0
                                ? byCount
                                : left.getKey().compareTo(right.getKey());
                    }
                });
        return entries;
    }
}
输出结果:
i=60
the=57
and=38
to=31
you=25
a=23
in=19
said=15
jack=15
me=15
my=15
of=14
s=14
he=13
his=12
it=12
that=12
what=11
we=11
as=10
all=10
mom=10
they=10
through=9
day=9
one=9
rain=9
can=9
was=8
t=8
not=8
but=8
no=7
for=7
get=7
had=7
time=7
she=6
do=6
take=6
run=6
ll=6
were=6
him=6
when=6
this=6
your=6
their=5
at=5
on=5
just=5
left=5
let=5
away=5
way=5
be=5
friends=5
if=5
next=5
after=5
about=5
so=4
stood=4
over=4
up=4
been=4
right=4
every=4
few=4
them=4
there=4
with=4
wrong=4
another=4
young=4
her=4
moment=3
child=3
needed=3
come=3
anything=3
into=3
is=3
off=3
got=3
desk=3
thought=3
other=3
out=3
have=3
prayed=3
by=3
washing=3
door=3
like=3
asked=3
wet=3
make=3
say=3
will=3
see=3
word=3
cancer=3
hope=3
some=3
spring=3
faith=2
ran=2
repeated=2
replied=2
children=2
room=2
first=2
job=2
closed=2
knew=2
know=2
leaving=2
shopping=2
before=2
soaked=2
from=2
corrected=2
god=2
long=2
couldn=2
talk=2
tears=2
man=2
mart=2
great=2
memories=2
then=2
beautiful=2
hand=2
things=2
think=2
morning=2
much=2
must=2
deserve=2
cars=2
treat=2
tried=2
tulips=2
turned=2
turning=2
two=2
under=2
did=2
us=2
ve=2
waited=2
wal=2
want=2
wanted=2
chair=2
don=2
now=2
honey=2
weeks=2
welcome=2
cheek=2
office=2
how=2
down=2
whispered=2
who=2
or=2
episodes=2
without=2
others=2
words=2
would=2
are=2
years=2
inside=2
pouring=2
eyes=2
calmly=1
hospital=1
came=1
hurried=1
hurry=1
hypnotic=1
affirmation=1
appeared=1
ignore=1
image=1
carefree=1
innocence=1
innocent=1
abruptly=1
insult=1
case=1
irritated=1
caught=1
arm=1
jabbed=1
change=1
changed=1
again=1
kind=1
kindness=1
aside=1
knit=1
ask=1
laugh=1
laughed=1
laughing=1
lay=1
leaned=1
circumstances=1
cleared=1
absolutely=1
letters=1
life=1
clouded=1
line=1
clung=1
colors=1
lose=1
lost=1
love=1
loved=1
工程源码地址:https://git.coding.net/handsomeman/wordsearch.git
ssh://git@git.coding.net:handsomeman/wordsearch.git