进阶实验5-3.3 基于词频的文件相似度 (30分)-哈希
解题思路:
1、存储:用一张哈希表存储单词以及对应所在的文件,再用一张文件表,存储每个文件的词汇量以及单词在哈希表中的位置
2、查询:先在文件表中找到待比较的两个文件,取其中词汇量较少的一个 -> 取出该文件中一个单词在哈希表中的位置 -> 根据该位置到哈希表中查找此单词的所在文件列表 -> 判断另一个文件是否在该列表中,从而确定该单词是否为两文件的公共词汇
重复步骤2,直至文件中的单词全部查询完毕
/*
 * Word-frequency based file similarity (hash table solution).
 *
 * Storage:
 *   - An open-addressing hash table keeps every distinct word together with
 *     an inverted index: the list of files that contain the word.
 *   - A per-file table keeps each file's distinct-word count and a list of
 *     the hash-table slots of those words.
 * Query:
 *   - Walk the word list of the file with fewer distinct words and, for each
 *     word, look for the other file in that word's inverted index.
 */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXSIZE 500009 /* number of cells in the hash table */
#define MAXS 10        /* letters kept per word; the rest of a longer word is dropped */
#define MINS 3         /* words with fewer letters are ignored */
#define MAXB 5         /* bits the running hash is shifted per letter */

typedef char Element[MAXS + 1];

/* Node of an inverted index: one file that contains the word. */
typedef struct FileEntry *FList;
struct FileEntry {
    int FileNo;
    FList Next;
};

/*
 * Per-file word list. In the header element File[i], `words` is the number
 * of distinct words of file i; in a list node, `words` is the hash-table
 * slot of one word.
 */
typedef struct WordEntry *WList;
struct WordEntry {
    int words;
    WList Next;
};

/* One hash-table cell. */
struct HashEntry {
    int FileNo;     /* 0: empty cell; otherwise the file that last inserted Word */
    Element Word;
    FList InvIndex; /* files containing Word, most recently added first */
};

typedef struct HashTbl *HashTable;
struct HashTbl {
    int TableSize;
    struct HashEntry *TheCells;
};

HashTable InitialHashTable(int size);                      /* create an empty hash table */
WList CreateWordList(int size);                            /* create the per-file table */
int Hash(Element Key, int P);                              /* hash function */
int Find(HashTable H, Element Key);                        /* locate the slot of a word */
int FindAndInsert(HashTable H, Element Key, int FileNo);   /* record word for a file (head insertion) */
void FileInsert(WList File, int Pos, int FileNo);          /* add a slot to a file's word list (head insertion) */
int GetWord(Element Word);                                 /* read the next word from stdin */
double CalSim(HashTable H, WList File, int F1, int F2);    /* common words as a percentage of all words of both files */

/* malloc wrapper: terminates the program instead of returning NULL. */
static void *xmalloc(size_t bytes)
{
    void *p = malloc(bytes);

    if (p == NULL) {
        fputs("out of memory\n", stderr);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Reads N files (each terminated by '#'), then M queries "F1 F2", and prints
 * one similarity percentage per query.
 */
int main(void)
{
    int N, M, F1, F2;
    Element Word;

    /* N + 1 entries are allocated below, so N must leave room for the +1. */
    if (scanf("%d", &N) != 1 || N < 0 || N == INT_MAX) {
        fputs("invalid file count\n", stderr);
        return EXIT_FAILURE;
    }

    HashTable H = InitialHashTable(MAXSIZE);
    WList File = CreateWordList(N + 1); /* files are numbered from 1; entry 0 is unused */

    for (int i = 1; i <= N; i++) {
        while (GetWord(Word)) {
            FileInsert(File, FindAndInsert(H, Word, i), i);
        }
    }

    if (scanf("%d", &M) != 1) {
        M = 0;
    }
    for (int i = 0; i < M; i++) {
        if (scanf("%d%d", &F1, &F2) != 2) {
            break;
        }
        /* CalSim indexes File[] directly, so reject file numbers outside 1..N. */
        if (F1 < 1 || F1 > N || F2 < 1 || F2 > N) {
            fprintf(stderr, "query %d %d is out of range 1..%d\n", F1, F2, N);
            continue;
        }
        printf("%.1lf%%\n", CalSim(H, File, F1, F2));
    }
    return 0;
}

/*
 * Allocates a hash table with `size` cells (size must be positive), all
 * marked empty. The Word field of an empty cell is left unset; Find never
 * reads it while FileNo is 0.
 */
HashTable InitialHashTable(int size)
{
    HashTable H = xmalloc(sizeof *H);

    H->TheCells = xmalloc(sizeof *H->TheCells * (size_t)size);
    H->TableSize = size;
    for (int i = 0; i < size; i++) {
        H->TheCells[i].FileNo = 0;
        H->TheCells[i].InvIndex = NULL;
    }
    return H;
}

/*
 * Allocates an array of `size` list headers (size must be positive), each
 * with a word count of 0 and an empty list.
 */
WList CreateWordList(int size)
{
    WList F = xmalloc(sizeof *F * (size_t)size);

    for (int i = 0; i < size; i++) {
        F[i].words = 0;
        F[i].Next = NULL;
    }
    return F;
}

/*
 * Shift-and-add hash of Key reduced modulo P. Each character contributes
 * its offset from 'a'; the arithmetic is unsigned, so it wraps on overflow.
 */
int Hash(Element Key, int P)
{
    unsigned int h = 0;

    for (const char *p = Key; *p != '\0'; p++) {
        h = (h << MAXB) + (unsigned int)(*p - 'a');
    }
    return (int)(h % (unsigned int)P);
}

/*
 * Returns the slot that holds Key, or the empty slot where it would be
 * stored, using linear probing with wrap-around.
 * The probe only stops on a match or an empty cell, so the table must never
 * become completely full.
 */
int Find(HashTable H, Element Key)
{
    int Pos = Hash(Key, H->TableSize);

    while (H->TheCells[Pos].FileNo != 0 &&
           strcmp(H->TheCells[Pos].Word, Key) != 0) {
        if (++Pos == H->TableSize) {
            Pos = 0;
        }
    }
    return Pos;
}

/*
 * Records that file FileNo (>= 1) contains Key.
 * Returns the word's slot when the file is newly added to the word's
 * inverted index, or -1 when the cell's FileNo already equals FileNo.
 * A cell remembers only the last file that inserted its word, so this
 * duplicate check relies on each file being inserted in one contiguous run.
 */
int FindAndInsert(HashTable H, Element Key, int FileNo)
{
    int Pos = Find(H, Key);
    struct HashEntry *cell = &H->TheCells[Pos];

    if (cell->FileNo == FileNo) {
        return -1;
    }
    if (cell->FileNo == 0) {
        /* First occurrence of the word in any file; Key must fit an Element. */
        strcpy(cell->Word, Key);
    }
    cell->FileNo = FileNo;

    FList node = xmalloc(sizeof *node);
    node->FileNo = FileNo;
    node->Next = cell->InvIndex;
    cell->InvIndex = node;
    return Pos;
}

/*
 * Prepends hash slot Pos to the word list of file FileNo and increments that
 * file's distinct-word count. A negative Pos (the "already recorded" result
 * of FindAndInsert) is ignored.
 */
void FileInsert(WList File, int Pos, int FileNo)
{
    if (Pos < 0) {
        return;
    }

    WList W = xmalloc(sizeof *W);
    W->words = Pos;
    W->Next = File[FileNo].Next;
    File[FileNo].Next = W;
    File[FileNo].words++;
}

/*
 * Reads the next word from stdin into Word, lower-cased and NUL-terminated.
 * A word is a maximal run of letters; only its first MAXS letters are kept
 * and runs shorter than MINS letters are skipped.
 * Returns 1 when a word was stored, 0 when a '#' is met while skipping
 * separators or when the input ends.
 * The character that ends a run of letters is consumed, so a '#' directly
 * attached to a word acts as an ordinary separator.
 */
int GetWord(Element Word)
{
    for (;;) {
        /* getchar() yields an unsigned char value or EOF, both valid for <ctype.h>. */
        int c = getchar();

        while (c != EOF && c != '#' && !isalpha(c)) {
            c = getchar();
        }
        if (c == EOF || c == '#') {
            return 0;
        }

        int len = 0;
        while (isalpha(c) && len < MAXS) {
            Word[len++] = (char)tolower(c);
            c = getchar();
        }
        /* Discard the tail of a word longer than MAXS letters. */
        while (isalpha(c)) {
            c = getchar();
        }

        if (len >= MINS) {
            Word[len] = '\0';
            return 1;
        }
        /* Too short: look for the next word. */
    }
}

/*
 * Returns the number of words common to files F1 and F2 as a percentage of
 * the number of distinct words in the two files together.
 * F1 and F2 must be valid indices into File. When neither file has any
 * word, 0.0 is returned instead of dividing zero by zero.
 */
double CalSim(HashTable H, WList File, int F1, int F2)
{
    /* Walk the shorter word list. */
    if (File[F1].words > File[F2].words) {
        int tmp = F1;
        F1 = F2;
        F2 = tmp;
    }

    int common = 0;
    for (WList w = File[F1].Next; w != NULL; w = w->Next) {
        for (FList f = H->TheCells[w->words].InvIndex; f != NULL; f = f->Next) {
            if (f->FileNo == F2) {
                common++;
                break;
            }
        }
    }

    int total = File[F1].words + File[F2].words - common;
    if (total == 0) {
        return 0.0;
    }
    return (double)common * 100.0 / (double)total;
}
勤能补拙,熟能生巧