哈希表之词频统计

#include <stdio.h>


typedef struct node_t{
    struct node_t *next;
    char *word;
    int count;
}*node;

#define NHASH 9973 // 最好定位质数
#define MULT 31     // 乘法器
node bin[NHASH];    // 哈希表索引

unsigned int hash(char *p)
{
    unsigned int h = 0;
    for(; *p; p++)
        h = MULT * h + *p;
    return h % NHASH;
}

void incword(char *s)
{
    unsigned int h = hash(s);
    node p;
    for(p=bin[h]; p; p=p->next)
        if(strcmp(s, p->word) == 0){
            (p->count)++;
            return;
        }
    p = malloc(sizeof(*p));
    p->count = 1;
    p->word = malloc(strlen(s)+1);
    strcpy(p->word, s);
    p->next = bin[h]; // 栈式插入，从表头处插入，而非从表的尾部插入
    bin[h] = p;
}


void main(int argc, char **argv)
{
    int i;
    char buf[32];
    for(i=0; i<NHASH; i++)
        bin[i] = NULL;
    int ii = 0, cc;
    while(1){
        scanf("%s",buf);
        printf("--- %d\n", ii++);
        if(*buf == 'q')
            break;
        incword(buf);
    }

    node p;
    for(i=0; i<NHASH; i++)
        for(p = bin[i]; p; p = p->next)
            printf("%s:%d\n", p->word, p->count);
}

以哈希表为数据结构统计词频大致思路：

构造一个结构体数组 struct node_t bin[质数]; 称为哈希表

构造一个哈希函数，给定一个字符串返回一个整数，这个整数哈希表的键值；

每获取一个字符串，把字符串与它的所对应键值的node 节点为表头的链表上的所有单词比较，若存在相同的，count++，否则增加到链表节点，把单词挂到该节点上，并置count=1；

输出的时候，从哈希表的0键值处开始，遍历所有链表，并输出各非空节点；

posted @ 2012-09-13 17:47 庄庄庄阅读(3475) 评论(0) 收藏举报

刷新页面返回顶部