读取英文单词,按顺序排列
1. Implement a function that counts word frequency. It reads an English article from a user-specified text file (article.txt) and counts how many times each word occurs. Words in the exclusion list should not be counted. Output the words and their counts to two files: one (3_1_1out.txt) in lexicographic order, and the other (3_1_2out.txt) in descending order of frequency.
#pragma warning(disable:4786)
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace std;
typedef map<string, int>::value_type sival_type;
// Loads the text file named by file_name, one vector element per line
// (lines are split on '\n'; the delimiter itself is not stored).
//
// Returns a heap-allocated vector that the caller owns and must delete.
// If the file cannot be opened, a message is printed to standard output
// and the program terminates with exit status 1.
vector<string> *retrieve_text(string file_name)
{
    ifstream input( file_name.c_str(), ios::in );
    if ( input.fail() )
    {
        cout << "Conn't open " << file_name.c_str() << " !" << endl;
        exit (1);
    }

    vector<string> *result = new vector<string>;
    for ( string current; getline(input, current, '\n'); )
    {
        result->push_back(current);
    }
    return result;
}
// Converts, in place, every capital letter A-Z in each line of *text_file
// to its lower-case form. All other characters are left unchanged.
void strip_caps( vector<string> *text_file )
{
    const string upper_case( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );
    for ( vector<string>::size_type row = 0; row < text_file->size(); ++row )
    {
        string &line = (*text_file)[row];
        for ( string::size_type col = 0; col < line.size(); ++col )
        {
            // Only characters listed in upper_case are converted.
            if ( upper_case.find( line[col] ) != string::npos )
            {
                line[col] = tolower( line[col] );
            }
        }
    }
}
// Splits every line of *text_file into words and returns them, in order of
// appearance, in a heap-allocated vector that the caller owns and must delete.
//
// A word is a maximal run of lower-case ASCII letters (a-z). Every other
// character (blanks, digits, punctuation, upper-case letters) acts as a
// separator, so the text should be lower-cased before calling this function.
// A null text_file yields an empty result.
//
// Fixes over the previous implementation:
//  - 'z' was missing from the letter set, so words were cut at every 'z';
//  - a one-letter word at the end of a line was emitted glued to the word
//    before it (e.g. "this is a" produced "is a" instead of "a");
//  - the line index was a short, which overflows on inputs with many lines.
vector<string> *separate_words( const vector<string> *text_file )
{
    const string letters("abcdefghijklmnopqrstuvwxyz");
    // Collection of the individual words found.
    vector<string> *words = new vector<string>;
    if ( text_file == 0 )
    {
        return words;
    }

    vector<string>::const_iterator line = text_file->begin();
    for ( ; line != text_file->end(); ++line )
    {
        // Start of the next word, or npos once the line has no more letters.
        string::size_type word_begin = line->find_first_of(letters);
        while ( word_begin != string::npos )
        {
            // Position just past the word, or npos if it runs to end of line.
            const string::size_type word_end =
                line->find_first_not_of(letters, word_begin);
            if ( word_end == string::npos )
            {
                words->push_back( line->substr(word_begin) );
                break;
            }
            words->push_back( line->substr(word_begin, word_end - word_begin) );
            word_begin = line->find_first_of(letters, word_end);
        }
    }
    return words;
}
// Counts how often each word in *words occurs and returns the counts in a
// heap-allocated map (word -> number of occurrences) that the caller owns
// and must delete.
//
// Words shorter than 3 characters, and words listed in the exclusion file
// "pkg95.txt" (one word per line, read from the current directory), are not
// counted. If the exclusion file cannot be opened, a message is printed to
// standard output and the program terminates with exit status 1.
//
// Exclusion entries are stripped of surrounding blanks and of a trailing
// '\r' before use; previously an exclusion file with CRLF line endings or
// stray spaces produced entries that could never match any word.
map< string, int > *appear_total( const vector<string> *words )
{
    // Build the set of excluded words.
    set<string> exclusion_set;
    ifstream exclusion_file( "pkg95.txt", ios::in );
    if (!exclusion_file) {
        cout << "Conn't open pkg95.txt !" << endl;
        exit (1);
    }
    const string blanks(" \t\r\n");
    string textline;
    while ( getline(exclusion_file, textline, '\n') )
    {
        const string::size_type first = textline.find_first_not_of(blanks);
        if ( first == string::npos )
        {
            continue; // blank line: nothing to exclude
        }
        const string::size_type last = textline.find_last_not_of(blanks);
        exclusion_set.insert( textline.substr(first, last - first + 1) );
    }

    map<string, int> *word_map = new map<string, int>;
    // Record every word that survives the filters.
    vector<string>::const_iterator iter = words->begin();
    for ( ; iter != words->end(); ++iter )
    {
        // Skip words of fewer than 3 characters and excluded words.
        if ( iter->size() < 3 || exclusion_set.count( *iter ) )
        {
            continue;
        }
        // operator[] value-initializes a missing entry to 0, so a single
        // lookup handles both the first and every later occurrence.
        ++(*word_map)[ *iter ];
    }
    return word_map;
}
// Re-indexes *text_map (word -> count) by count, in descending order, and
// writes the result to "3_1_2out.txt" in the current directory, one word per
// line: the word padded with spaces to 15 columns, then its count.
//
// Returns the heap-allocated multimap (count -> word); the caller owns it
// and must delete it. If the output file cannot be opened, an error is
// printed to standard error, nothing is written, and the multimap is still
// returned.
//
// Fixes over the previous implementation:
//  - the padding width was computed as 15 - size() in unsigned arithmetic,
//    so a word longer than 15 characters wrapped to a huge value and emitted
//    a practically endless run of spaces;
//  - the success message was printed even when the file could not be opened.
multimap< int, string, greater<int> > * multimap_total( map<string, int> *text_map )
{
    typedef multimap< int, string, greater<int> > freq_map;
    const string::size_type column_width = 15;

    freq_map *word_map = new freq_map;
    map< string, int >::const_iterator src = text_map->begin();
    for ( ; src != text_map->end(); ++src )
    {
        word_map->insert( make_pair( src->second, src->first ) );
    }

    const string ofile("3_1_2out.txt");
    ofstream outfile( ofile.c_str() );
    if (!outfile)
    {
        cerr << "error: unable to open output file: "
             << ofile << endl;
        return word_map;
    }

    freq_map::const_iterator entry = word_map->begin();
    for ( ; entry != word_map->end(); ++entry )
    {
        const string &word = entry->second;
        outfile << word;
        // Pad only when the word is narrower than the column; longer words
        // simply run straight into the count text.
        if ( word.size() < column_width )
        {
            outfile << string( column_width - word.size(), ' ' );
        }
        outfile << "出现 " << entry->first << "\t次" << endl;
    }
    cout << "程序已将处理结果写入3_1_2out.txt,该文件保存在当前目录"
         << endl;
    return word_map;
}
// Writes *text_map (word -> count) to "3_1_1out.txt" in the current
// directory, in the map's own (lexicographic) order, one word per line:
// the word padded with spaces to 15 columns, then its count.
//
// If the output file cannot be opened, an error is printed to standard
// error and nothing is written.
//
// Fixes over the previous implementation:
//  - the padding width was computed as 15 - size() in unsigned arithmetic,
//    so a word longer than 15 characters wrapped to a huge value and emitted
//    a practically endless run of spaces;
//  - the success message was printed even when the file could not be opened.
void map_output( map<string, int> *text_map )
{
    const string::size_type column_width = 15;

    const string ofile("3_1_1out.txt");
    ofstream outfile( ofile.c_str() );
    if (!outfile)
    {
        cerr << "error: unable to open output file: "
             << ofile << endl;
        return;
    }

    map< string, int >::const_iterator entry = text_map->begin();
    for ( ; entry != text_map->end(); ++entry )
    {
        const string &word = entry->first;
        outfile << word;
        // Pad only when the word is narrower than the column; longer words
        // simply run straight into the count text.
        if ( word.size() < column_width )
        {
            outfile << string( column_width - word.size(), ' ' );
        }
        outfile << "出现 " << entry->second << "\t次" << endl;
    }
    cout << "程序已将处理结果写入3_1_1out.txt,该文件保存在当前目录"
         << endl;
}
// Program entry point: reads "article.txt", lower-cases it, splits it into
// words, counts them, and writes the counts to the two output files (one in
// lexicographic order, one in descending order of frequency).
//
// Each helper returns a heap-allocated container owned by the caller; every
// one of them is released here as soon as it is no longer needed (they were
// previously all leaked, including the multimap returned by multimap_total).
int main()
{
    vector<string> *text_file = retrieve_text("article.txt");
    strip_caps(text_file);

    vector<string> *words = separate_words(text_file);
    delete text_file;

    map< string, int > *text_map = appear_total(words);
    delete words;

    map_output( text_map );

    multimap< int, string, greater<int> > *by_frequency = multimap_total(text_map);
    delete by_frequency;
    delete text_map;
    return 0;
}
程序执行结果
3_1_1out.txt
article 出现 2 次
counted 出现 1 次
counting 出现 1 次
counts 出现 2 次
descending 出现 1 次
english 出现 1 次
exclusive 出现 1 次
file 出现 1 次
files 出现 1 次
frequency 出现 2 次
function 出现 1 次
implement 出现 1 次
lexicographic 出现 1 次
output 出现 1 次
reads 出现 1 次
specified 出现 1 次
txt 出现 4 次
user 出现 1 次
word 出现 1 次
words 出现 2 次
3_1_2out.txt
txt 出现 4 次
article 出现 2 次
counts 出现 2 次
frequency 出现 2 次
words 出现 2 次
counted 出现 1 次
counting 出现 1 次
descending 出现 1 次
english 出现 1 次
exclusive 出现 1 次
file 出现 1 次
files 出现 1 次
function 出现 1 次
implement 出现 1 次
lexicographic 出现 1 次
output 出现 1 次
reads 出现 1 次
specified 出现 1 次
user 出现 1 次
word 出现 1 次
附PKG95.TXT 文件内容:
different
necessary
need
needed
needing
newest
next
no
nobody
non
none
not
nothing
now
nowhere
of
off
often
new
old
older
oldest
on
once
one
only
open
again
among
already
about
above
against
alone
after
also
although
along
always
an
across
and
another
ask
asking
asks
backed
away
should
show
came
all
almost
before
began
back
backing
be
became
because
becomes
been
at
behind
being
best
better
between
big
showed
ended
ending
both
but
by
asked
backs
can
cannot
number
numbers
case
few
find
finds
cases
clearly
her
herself
come
could
did
here
beings
fact
far
felt
become
first
for
four
from
full
fully
furthers
gave
general
generally
get
gets
gives
facts
go
going
good
goods
certain
certainly
clear
great
greater
greatest
group
grouped
grouping
groups
got
has
have
having
he
further
furthered
had
furthering
itself
faces
highest
him
himself
his
how
however
if
important
interests
into
is
it
its
anyone
anything
anywhere
are
area
areas
around
as
seconds
see
seem
seemed
seeming
seems
sees
right
several
shall
she
enough
even
evenly
over
part
parted
parting
parts
per
down
place
places
point
pointed
pointing
points
possible
present
presented
presenting
ends
high
mrs
much
must
my
myself
presents
down
problem
problems
put
puts
quite
will
with
within
rather
really
room
rooms
said
same
right
showing
shows
side
sides
since
small
smaller
smallest
so
some
somebody
someone
something
somewhere
state
states
such
sure
take
taken
than
that
the
their
then
there
therefore
these
thought
thoughts
three
through
thus
to
today
together
too
took
toward
turn
turned
turning
turns
two
still
under
until
up
others
upon
us
use
used
uses
very
want
wanted
wanting
wants
was
way
we
well
wells
went
were
what
when
where
whether
which
while
who
whole
year
years
yet
you
everyone
everything
everywhere
young
younger
youngest
your
yours
ever
works
every
everybody
face
other
our
out
just
interesting
high
might
keep
keeps
give
given
higher
kind
knew
know
known
knows
large
largely
last
later
latest
least
less
needs
never
newer
let
lets
like
likely
long
high
longer
longest
made
make
making
man
many
may
me
member
members
men
more
in
interest
interested
most
mostly
mr
opened
opening
new
opens
or
perhaps
order
ordered
ordering
orders
differ
differently
do
does
done
downed
downing
downs
they
thing
things
think
thinks
this
those
ways
why
without
work
worked
working
would
during
each
early
either
end
though
still
whose
saw
say
says
them
second
any
anybody