进阶实验5-3.3 基于词频的文件相似度 (30分)-哈希

 

 

 

 解题思路:

1、存储:用一张哈希表存储单词以及对应所在的文件,再用一张文件表,存储每个文件的词汇量以及单词在哈希表中的位置

2、查询:先在文件表中找到待比较的两个文件,取其中词汇量较少的一个 -> 遍历该文件的单词链表,取得每个单词在哈希表中的位置 -> 根据该位置到哈希表中查找该单词所在的文件列表 -> 判断另一个文件是否在该列表中,从而确定该单词是否是两文件的公共词汇

重复步骤2,直至文件中的单词全部查询完毕

#include <ctype.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of cells in the word hash table (the size main passes to InitialHashTable). */
#define MAXSIZE 500009
/* Maximum number of letters stored per word; GetWord drops any letters beyond this. */
#define MAXS 10
/* Minimum word length; GetWord ignores runs of letters shorter than this. */
#define MINS 3
/* Number of bits the running hash is shifted left for each character in Hash. */
#define MAXB 5

/* Fixed-size buffer for one word: up to MAXS characters plus the terminating NUL. */
typedef char Element[MAXS+1];
/*
 * Node of a singly linked list of file numbers. Each hash cell keeps such a
 * list (HashEntry.InvIndex) naming the files that contain the cell's word.
 */
typedef struct FileEntry *FList;
struct FileEntry{
    short FileNo;   /* number of a file containing the word */
    FList Next;     /* next file in the list, or NULL at the end */
};
/*
 * Used in two roles:
 *  - as an element of the per-file table built by CreateWordList, where
 *    `words` counts the entries added for that file by FileInsert and `Next`
 *    is the head of the file's word list;
 *  - as a node of that word list, where `words` holds the hash-table
 *    position of one word (see FileInsert).
 */
typedef struct WordEntry *WList;
struct WordEntry{
    int words;      /* word count (table element) or hash position (list node) */
    WList Next;     /* first/next node of the word list, or NULL */
};

/* One cell of the open-addressing hash table. */
struct HashEntry{
    short FileNo;    /* 0 while the cell is empty; otherwise the number of the
                        file that most recently inserted this word */
    Element Word;    /* the word stored in this cell */
    FList InvIndex;  /* list of files that contain the word */
};
/* Hash table header: the cell array together with its length. */
typedef struct HashTbl *HashTable;
struct HashTbl{
    int TableSize;              /* number of cells in TheCells */
    struct HashEntry *TheCells; /* array of TableSize cells */
};

HashTable InitialHashTable(int size);// Create and initialise the hash table
WList CreateWordList(int size);// Create and initialise the per-file word table
int Hash(Element Key,int P);// Hash function
int Find(HashTable H,Element Key);// Locate the storage position for a key
int FindAndInsert(HashTable H,Element Key,int FileNo);// Insert into the hash table (new node is prepended)
void FileInsert(WList File,int Pos,int FileNo);// Insert into a file's word list (new node is prepended)
int GetWord(Element Word);// Read the next word from the input
double CalSim(HashTable H,WList File,int F1,int F2);// Common words as a percentage of the two files' combined vocabulary

/*
 * Program entry point.
 *
 * Reads the number of files N, then the text of files 1..N (GetWord returns 0
 * at the end of each file), indexing every word in the hash table and in the
 * per-file word table. Then reads M queries, each a pair of file numbers, and
 * prints the similarity of each pair as a percentage with one decimal place.
 *
 * Returns 0 on success and 1 when the input is malformed or the tables cannot
 * be allocated.
 */
int main()
{
    int i,N,M,F1,F2;
    HashTable H;
    WList File;
    Element Word;

    /* A failed conversion would leave N unset, so check the scanf result. */
    if(scanf("%d",&N)!=1||N<0)
    {
        fprintf(stderr,"invalid file count\n");
        return 1;
    }

    H=InitialHashTable(MAXSIZE);
    File=CreateWordList(N+1);   /* one extra element so files are indexed 1..N */
    if(H==NULL||File==NULL)
    {
        fprintf(stderr,"out of memory\n");
        return 1;
    }

    for(i=1;i<=N;i++)
    {
        /* FindAndInsert returns -1 for a word already seen in file i;
           FileInsert ignores that value. */
        while(GetWord(Word))
        {
            FileInsert(File,FindAndInsert(H,Word,i),i);
        }
    }

    if(scanf("%d",&M)!=1)
    {
        fprintf(stderr,"invalid query count\n");
        return 1;
    }
    for(i=0;i<M;i++)
    {
        if(scanf("%d%d",&F1,&F2)!=2)
        {
            fprintf(stderr,"invalid query\n");
            return 1;
        }
        /* File[] has N+1 elements; anything else would index out of bounds. */
        if(F1<0||F1>N||F2<0||F2>N)
        {
            fprintf(stderr,"file number out of range\n");
            continue;
        }
        printf("%.1lf%%\n",CalSim(H,File,F1,F2));
    }
    return 0;
}

HashTable InitialHashTable(int size)
{
    HashTable H=malloc(sizeof(struct HashTbl));
    H->TheCells=malloc(sizeof(struct HashEntry)*size);
    H->TableSize=size;
    while(size)
    {
        H->TheCells[--size].InvIndex=NULL;
        H->TheCells[size].FileNo=0;
    }
    return H;
}

WList CreateWordList(int size)
{
    WList F=malloc(sizeof(struct WordEntry)*size);
    while(size)
    {
        F[--size].words=0;
        F[size].Next=NULL;
    }
    return F;
}

/*
 * Hash a NUL-terminated word into the range of a table with P cells.
 *
 * Each character contributes its offset from 'a'; the running value is
 * shifted left by MAXB bits before the next character is added. The
 * arithmetic is done in unsigned int, and the result is reduced modulo P.
 */
int Hash(Element Key,int P)
{
    unsigned int acc=0;
    const char *cur;

    for(cur=Key;*cur!='\0';++cur)
    {
        acc=(acc<<MAXB)+(*cur-'a');
    }
    return acc%P;
}

/*
 * Locate the cell for Key using linear probing.
 *
 * Starting at Hash(Key), step forward one cell at a time (wrapping to 0 at
 * the end of the table) until reaching either an empty cell (FileNo == 0) or
 * a cell whose Word equals Key. Returns the index of that cell.
 */
int Find(HashTable H,Element Key)
{
    int slot=Hash(Key,H->TableSize);

    for(;;)
    {
        const struct HashEntry *cell=&H->TheCells[slot];

        if(!cell->FileNo)
        {
            break;      /* empty cell: Key is not in the table yet */
        }
        if(strcmp(cell->Word,Key)==0)
        {
            break;      /* found the cell already holding Key */
        }
        if(++slot==H->TableSize)
        {
            slot=0;     /* wrap around to the start of the table */
        }
    }
    return slot;
}
/*
 * Record that file FileNo contains the word Key.
 *
 * The cell's FileNo field remembers the file that last inserted the word, so
 * a repeated word within the file currently being read is detected and
 * skipped. Otherwise the word is stored in the cell if the cell was empty,
 * and a new node for FileNo is prepended to the cell's file list.
 *
 * Returns the cell index when a new file node was added, or -1 when nothing
 * was added (the word was already recorded for this file, or memory ran out,
 * in which case a message is written to stderr and the table is unchanged).
 */
int FindAndInsert(HashTable H,Element Key,int FileNo)
{
    int Pos=Find(H,Key);
    struct HashEntry *cell=&H->TheCells[Pos];

    if(cell->FileNo==FileNo)
    {
        return -1;
    }

    /* Allocate before touching the cell so a failure leaves it consistent. */
    FList node=malloc(sizeof *node);
    if(node==NULL)
    {
        fprintf(stderr,"FindAndInsert: out of memory\n");
        return -1;
    }

    if(!cell->FileNo)
    {
        strcpy(cell->Word,Key);     /* first occurrence of the word anywhere */
    }
    cell->FileNo=FileNo;

    node->FileNo=FileNo;
    node->Next=cell->InvIndex;
    cell->InvIndex=node;
    return Pos;
}
/*
 * Add hash-table position Pos to the word list of file FileNo and bump that
 * file's word count. The new node is prepended to the list.
 *
 * A negative Pos (the "nothing inserted" result of FindAndInsert) is ignored.
 * If memory runs out, a message is written to stderr and the list and count
 * are left unchanged.
 */
void FileInsert(WList File,int Pos,int FileNo)
{
    if(Pos<0)
    {
        return;
    }

    WList node=malloc(sizeof *node);
    if(node==NULL)
    {
        fprintf(stderr,"FileInsert: out of memory\n");
        return;
    }

    node->words=Pos;
    node->Next=File[FileNo].Next;
    File[FileNo].Next=node;
    File[FileNo].words++;
}

/*
 * Read the next word of the current file from standard input.
 *
 * A word is a maximal run of letters. Runs shorter than MINS letters are
 * skipped; of longer runs only the first MAXS letters are kept. The word is
 * stored in Word in lower case and NUL-terminated.
 *
 * Returns 1 when a word was stored, or 0 when the end-of-file marker '#' or
 * the end of input is reached first.
 */
int GetWord(Element Word)
{
    for(;;)
    {
        /* getchar yields an int, so EOF is detectable and the value is
           always safe to pass to the <ctype.h> functions. */
        int c=getchar();

        /* Skip everything that is neither a letter nor the '#' marker. */
        while(c!=EOF&&c!='#'&&!isalpha(c))
        {
            c=getchar();
        }
        if(c==EOF||c=='#')
        {
            return 0;
        }

        int p=0;
        while(c!=EOF&&isalpha(c))
        {
            if(p<MAXS)
            {
                Word[p++]=(char)tolower(c);
            }
            c=getchar();    /* letters beyond MAXS are read and dropped */
        }

        /* Put the delimiter back: if it is '#', the next call must see it,
           otherwise the end of the file would be missed. */
        if(c!=EOF)
        {
            ungetc(c,stdin);
        }

        if(p>=MINS)
        {
            Word[p]='\0';
            return 1;
        }
        /* Too short to count as a word: keep scanning. */
    }
}

/*
 * Compute the similarity of files F1 and F2 as a percentage:
 *     100 * (common words) / (words in F1 + words in F2 - common words)
 *
 * The word list of the file with fewer words is walked; for each word the
 * hash cell's file list is searched for the other file's number.
 *
 * Returns the percentage, or 0.0 when both files have no words (the ratio
 * would otherwise be 0/0).
 */
double CalSim(HashTable H,WList File,int F1,int F2)
{
    /* Walk the shorter word list: make F1 the file with fewer words. */
    if(File[F1].words>File[F2].words)
    {
        int tmp=F1;
        F1=F2;
        F2=tmp;
    }

    int common=0;
    for(WList w=File[F1].Next;w!=NULL;w=w->Next)
    {
        /* w->words is the hash-table position of this word. */
        for(FList f=H->TheCells[w->words].InvIndex;f!=NULL;f=f->Next)
        {
            if(f->FileNo==F2)
            {
                common++;
                break;
            }
        }
    }

    int total=File[F1].words+File[F2].words-common;
    if(total<=0)
    {
        return 0.0;
    }
    return (double)(common*100)/(double)total;
}

 

posted @ 2020-04-08 20:21  跃鱼  阅读(1365)  评论(1编辑  收藏  举报