语料库词频统计程序
老大让我写个统计程序,我就看看书写了两个.
第一个用的是 C++ 标准库的 std::map 容器。std::map 内部通常用红黑树实现,查找和插入都是 O(log n),效率应该比较高。
#include <cstdio>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
using namespace std;
// Word -> occurrence count, keyed by the word's text.
typedef std::map<std::string, int> type_map;
typedef type_map::iterator type_iter;
// The frequency table shared by lookup(), insert(), display() and main().
type_map m;
// Shared cursor: lookup() stores its find() result here and main() then
// increments the count through it; display() reuses it as its loop iterator.
type_iter it;
// Reports whether `s` is already a key of the global table `m`.
// Side effect: the global iterator `it` is left on the matching entry, or at
// m.end() when there is none; main() uses it to bump the count of a word
// that was found.
bool lookup(string s)
{
    it = m.find(s);
    return it != m.end();
}
// Adds `s` to the global table `m` with an initial count of 1.
// map::insert leaves an existing entry untouched, so calling this for a word
// that is already present does not reset its count.
void insert(string s)
{
    m.insert(std::make_pair(s, 1));
}
/*
 * Classifies one input character for the word scanner.
 *
 * Returns false when `c` is a delimiter: CR, LF, a lowercase ASCII letter
 * ('a'..'z' inclusive), '[', ']', '{', space, '/', or a char that compares
 * equal to EOF.  Returns true for every other character, i.e. one that
 * belongs to a word.
 *
 * When `c` is '{', the stream `p` is additionally consumed up to and
 * including the next '}' (or up to end of file), so the whole "{...}" span
 * is skipped.  `p` is not touched for any other value of `c`.
 */
bool gbr(char c,FILE *p)
{
    bool is_delimiter = c == '\r' || c == '\n'
        || (c >= 'a' && c <= 'z')   /* full lowercase range, 'z' included */
        || c == '[' || c == ']' || c == '{'
        || c == ' ' || c == '/'
        || c == EOF;

    if (!is_delimiter)
        return true;

    if (c == '{') {
        /* Keep fgetc's int result so EOF is distinguishable from data;
         * without the EOF test an unterminated '{' would spin forever. */
        int next;
        do {
            next = fgetc(p);
        } while (next != '}' && next != EOF);
    }
    return false;
}
// Prints a header line followed by one "<count> <word>" line for every entry
// of the global table `m`, in the map's iteration order.
// Walks with the shared global iterator `it`, which ends up at m.end().
void display()
{
    cout << "count vocabulary:" << endl;
    it = m.begin();
    while (it != m.end())
    {
        cout << it->second << " " << it->first << endl;
        ++it;
    }
}
// Counts how often each word occurs in c:/tt.txt and prints the table.
//
// The file is read one character at a time.  Characters accepted by gbr()
// are collected into a word; the first character gbr() rejects ends the word
// and is discarded.  Each non-empty word is then added to the global map `m`
// (first occurrence) or has its count incremented (later occurrences).
//
// Returns 0 on success, 1 when the input file cannot be opened.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    FILE *fp = fopen("c:/tt.txt", "r");
    if (fp == NULL)
    {
        cout << "file can not open";
        return 1;
    }

    // fgetc returns an int so that EOF can be told apart from every byte
    // value; storing it in a char either truncates the loop at byte 0xFF
    // (signed char) or never sees EOF at all (unsigned char).
    int ch;
    while ((ch = fgetc(fp)) != EOF)
    {
        // A std::string grows as needed, so an arbitrarily long run of word
        // characters cannot overrun a fixed-size buffer.
        string text;
        while (ch != EOF && gbr((char)ch, fp))
        {
            text.push_back((char)ch);
            ch = fgetc(fp);
        }

        if (!text.empty())  // recognized a word
        {
            // lookup() leaves the global iterator `it` on the entry it found.
            if (lookup(text))
                it->second = it->second + 1;
            else
                insert(text);
        }
    }

    display();
    fclose(fp);
    return 0;
}
第二个是我读《The C Programming Language》时想到的哈希表(hash table)统计法。
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Number of buckets in the hash table; hash() reduces its result modulo this. */
#define HASHSIZE 101
/* One table entry: a word and the number of times it has been installed. */
struct nlist{
struct nlist *next; /* next entry in the same bucket's chain, or NULL */
char name[30]; /* the word as a NUL-terminated string (room for 29 chars) */
int count; /* occurrences recorded so far */
};
/* Bucket array: each slot is the head of a chain of entries, NULL when empty. */
static struct nlist *hashtab[HASHSIZE];
/*
 * Hashes the NUL-terminated string `s` to a bucket index in [0, HASHSIZE).
 * Starting from 0, each character is folded in as  h = c + 31 * h  using
 * unsigned arithmetic; the final value is reduced modulo HASHSIZE.
 */
unsigned hash(char*s)
{
    unsigned h = 0;

    while (*s != '\0') {
        h = 31 * h + *s;
        s++;
    }
    return h % HASHSIZE;
}
/*
 * Searches the bucket that `s` hashes to for an entry whose name equals `s`.
 * Returns the matching entry, or NULL when the word is not in the table.
 */
struct nlist* lookup(char *s)
{
    struct nlist *entry = hashtab[hash(s)];

    /* Advance until the chain ends or the names match. */
    while (entry != NULL && strcmp(s, entry->name) != 0)
        entry = entry->next;
    return entry;
}
void install(char*name)
{
struct nlist *np,*nq;
unsigned hashval;
hashval=hash(name);
if(lookup(name)==NULL){//not found
np=(struct nlist*) malloc(sizeof(struct nlist));
strcpy(np->name,name);
np->count=1;
np->next=NULL;
nq=hashtab[hashval];
if(nq==NULL)
hashtab[hashval]=np;
else{
while(nq->next!=NULL)
nq=nq->next;
nq->next=np;
}//
}//end of if
else{
np=lookup(name);
np->count++;
}// found the key
}
void print_count()
{
struct nlist *np;
printf("count vocabulary:\n");
for(int i=0;i<HASHSIZE;i++)
{
np=hashtab[i];
while(np!=NULL)
{
printf("%d %s\n",np->count, np->name);
np=np->next;
}
}
}
/*
 * Classifies one input character for the word scanner.
 *
 * Returns false when `c` is a delimiter: CR, LF, a lowercase ASCII letter
 * ('a'..'z' inclusive), '[', ']', '{', space, '/', or a char that compares
 * equal to EOF.  Returns true for every other character, i.e. one that
 * belongs to a word.
 *
 * When `c` is '{', the stream `p` is additionally consumed up to and
 * including the next '}' (or up to end of file), so the whole "{...}" span
 * is skipped.  `p` is not touched for any other value of `c`.
 */
bool gbr(char c,FILE *p)
{
    bool is_delimiter = c == '\r' || c == '\n'
        || (c >= 'a' && c <= 'z')   /* full lowercase range, 'z' included */
        || c == '[' || c == ']' || c == '{'
        || c == ' ' || c == '/'
        || c == EOF;

    if (!is_delimiter)
        return true;

    if (c == '{') {
        /* Keep fgetc's int result so EOF is distinguishable from data;
         * without the EOF test an unterminated '{' would spin forever. */
        int next;
        do {
            next = fgetc(p);
        } while (next != '}' && next != EOF);
    }
    return false;
}
/*
 * Counts how often each word occurs in c:/paper.txt and prints the table.
 *
 * The file is read one character at a time.  Characters accepted by gbr()
 * are collected into a word; the first character gbr() rejects ends the word
 * and is discarded.  Each non-empty word is passed to install().
 *
 * A word that does not fit in the local buffer is truncated: the excess
 * characters are still consumed from the file but not stored.
 *
 * Returns 0 on success, 1 when the input file cannot be opened.
 */
int main()
{
    char temp[30];
    FILE *fp = fopen("c:/paper.txt", "r");

    if (fp == NULL) {
        printf("file can not open");
        return 1;  /* nothing to read; continuing would pass NULL to fgetc */
    }

    /* fgetc returns an int so that EOF can be told apart from every byte
     * value; a char cannot represent that distinction. */
    int ch;
    while ((ch = fgetc(fp)) != EOF) {
        size_t len = 0;

        while (ch != EOF && gbr((char)ch, fp)) {
            /* Always leave room for the terminating NUL. */
            if (len < sizeof temp - 1)
                temp[len++] = (char)ch;
            ch = fgetc(fp);
        }
        temp[len] = '\0';

        if (temp[0] != '\0')  /* recognized a word */
            install(temp);
    }

    fclose(fp);
    print_count();
    return 0;
}