词频统计更新

 

实现功能:从控制台输入文件路径,统计单词总数及不重复的单词数,输出所有单词的词频,并按词频从高到低排序后输出前10个词。

 

头文件

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

 

定义宏

/* Size of the char arrays that hold a single word (wordNode.word and the
 * temporary read buffer in main). */
#define WORD_LENGTH 250

 

定义结构体及全局变量

/* One entry of the singly linked word list. */
typedef struct Node
{
    /* The word text, stored as a NUL-terminated string. */
    char word[WORD_LENGTH];
    /* Occurrence counter; created as 0 by wordSearch, incremented by wordCount. */
    int time;
    /* Next entry in the list, or NULL for the last entry. */
    struct Node *next;
}wordNode;

/* List header: overall counters plus a pointer to the first word node. */
typedef struct TopNode
{
    int sum;        //total number of words in the text
    int num;        //number of distinct words in the text
    /* First node of the word list, or NULL when the list is empty. */
    wordNode * next;
}TopNode;

/* Storage for the list header; main makes L point at it. */
TopNode t;

/* Global list header used by the functions in this file; NULL until main
 * sets it to &t. */
TopNode * L = NULL;

 

声明文件中使用的函数

/* Returns the list node whose word equals `word`, adding a new node when no
 * existing node matches. */
wordNode *wordSearch(char *word);
/* Cleans a word in place: lower-cases A-Z and removes other non a-z characters. */
void wordJob(char word[]);
/* Increments the occurrence counter of the node that holds `word`. */
void wordCount(char *word);

/* Prints the word totals followed by every word and its count. */
void printCountList();
/* Prints at most the first ten entries of the list. */
void PrintFirstTenTimes();

/* Merge-sorts the list at *head so that larger counts come first. */
void mergeSort(wordNode **head);
/* Splits the list `head` into a front half (*pre) and a back half (*next). */
void FrontBackSplit(wordNode *head,wordNode **pre,wordNode **next);
/* Merges two lists ordered by descending count and returns the merged head. */
wordNode *SortedMerge(wordNode *pre,wordNode *next);

/* Frees every node of the word list. */
void release();

 

主函数

/*
 * Entry point: reads a file path from standard input, counts the words in
 * that file, prints every word with its frequency, then sorts the list by
 * descending frequency and prints the ten most frequent words.
 *
 * argc and argv are not used.
 * Returns 0 on success; exits with status 1 when no path can be read or the
 * file cannot be opened.
 */
int main(int argc,char *argv[])
{
    char temp[WORD_LENGTH];     /* one whitespace-delimited token from the file */
    char scan_format[32];       /* "%<WORD_LENGTH-1>s", built below */
    char file_path[100];
    FILE *file;

    (void)argc;
    (void)argv;

    printf("请输入文件路径:");
    /* fgets limits the read to the buffer size; gets() cannot do that and
     * was removed from the language in C11. */
    if(fgets(file_path, sizeof file_path, stdin) == NULL)
    {
        printf("文件读取失败!");
        exit(1);
    }
    /* fgets keeps the line terminator; cut the path at the first CR or LF. */
    file_path[strcspn(file_path, "\r\n")] = '\0';

    if((file = fopen(file_path, "r")) == NULL)
    {
        printf("文件读取失败!");
        exit(1);
    }

    /* A field width one less than the buffer size keeps fscanf from writing
     * past the end of temp on very long tokens. */
    snprintf(scan_format, sizeof scan_format, "%%%ds", WORD_LENGTH - 1);

    L = &t;
    L->num = 0;
    L->sum = 0;
    L->next = NULL;
    while(fscanf(file, scan_format, temp) == 1)
    {
        wordJob(temp);
        if(temp[0] == '\0')
        {
            /* Nothing was left after cleaning (e.g. a number or a lone
             * punctuation mark), so there is no word to count. */
            continue;
        }
        L->sum++;
        wordCount(temp);
    }
    fclose(file);

    printCountList();
    printf("\n\n输出词频最高的10个词\n");
    /* Sort through L->next itself so the header points at the new first
     * node; sorting a copy of the pointer would leave L->next at the old
     * first node, which may now sit in the middle of the sorted list. */
    mergeSort(&L->next);
    PrintFirstTenTimes();
    release();
    return 0;
}

 

查找单词所在节点并返回(若单词不存在,则新建节点并插入链表头部)

/*
 * Returns the node of the global list L that holds `word`, creating the
 * node when the word is not in the list yet.
 *
 * A new node starts with time == 0, is linked in at the front of the list
 * and increments L->num. Text longer than the node buffer is truncated to
 * fit; lookups compare at most that many characters so a truncated word
 * still finds its own node.
 *
 * Never returns NULL: if memory cannot be allocated, an error is written to
 * stderr and the program exits with EXIT_FAILURE.
 */
wordNode *wordSearch(char *word)
{
    wordNode *node;

    /* Look for an existing entry first. */
    for(node = L->next; node != NULL; node = node->next)
    {
        if(strncmp(node->word, word, sizeof node->word - 1) == 0)
        {
            return node;
        }
    }

    /* Not found: create the entry. Linking at the head also covers the
     * empty-list case, because L->next is NULL then. */
    node = malloc(sizeof *node);
    if(node == NULL)
    {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    /* snprintf never writes past the buffer and always NUL-terminates. */
    snprintf(node->word, sizeof node->word, "%s", word);
    node->time = 0;
    node->next = L->next;
    L->next = node;
    L->num++;
    return node;
}

 

词频统计

void wordCount(char *word)
{
    wordNode *tmpNode;
    tmpNode = wordSearch(word);      //word所在的节点
    tmpNode->time++;
}

 

输出所有词频

/*
 * Prints the total and distinct word counts from the global list L, then
 * every word with its count, four entries per output line. Prints a
 * "file is empty" message instead when the list has no nodes.
 */
void printCountList()
{
    wordNode *cur;
    int printed = 0;

    if(L->next == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    printf("\n这篇文章总计%d词\n\n不重复单词共%d个\n",L->sum,L->num);
    printf("\n输出所有单词的频数\n");
    for(cur = L->next; cur != NULL; cur = cur->next)
    {
        printf(" %s:%d次\t",cur->word,cur->time);
        /* Break the line after every fourth entry. */
        if(++printed % 4 == 0)
        {
            printf("\n");
        }
    }
}

 

输出词频最高的10个词

/*
 * Prints up to the first ten entries of the global list L, one per line.
 * Prints a "file is empty" message instead when the list has no nodes.
 */
void PrintFirstTenTimes()
{
    wordNode *cur = L->next;
    int shown;

    if(cur == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    for(shown = 0; cur != NULL && shown < 10; shown++)
    {
        printf("\t%s:%d次\n",cur->word,cur->time);
        cur = cur->next;
    }
}

 

对词频统计结果进行归并排序

/*
 * Sorts the list at *headnode with a recursive merge sort and stores the
 * new first node back into *headnode. Lists with fewer than two nodes are
 * left untouched. Ordering is whatever SortedMerge produces.
 */
void mergeSort(wordNode **headnode)
{
    wordNode *front;
    wordNode *back;
    wordNode *list = *headnode;

    if(list != NULL && list->next != NULL)
    {
        FrontBackSplit(list, &front, &back);
        mergeSort(&front);
        mergeSort(&back);
        /* Merge the two sorted halves back into a single list. */
        *headnode = SortedMerge(front, back);
    }
}

 

将链表拆分为前后两半(快慢指针)

/*
 * Splits the list `source` into two halves.
 *
 * *pre receives the front half (it always starts at `source`) and *next the
 * back half; the link between them is cut. When the length is odd the front
 * half gets the extra node. A list with fewer than two nodes yields
 * *pre == source and *next == NULL.
 */
void FrontBackSplit(wordNode *source,wordNode **pre,wordNode **next)
{
    wordNode *mid;
    wordNode *probe;

    if(source == NULL || source->next == NULL)
    {
        *pre = source;
        *next = NULL;
        return;
    }

    /* probe moves two nodes for every one that mid moves, so mid stops on
     * the last node of the front half. */
    mid = source;
    for(probe = source->next; probe != NULL && probe->next != NULL; probe = probe->next->next)
    {
        mid = mid->next;
    }

    *pre = source;
    *next = mid->next;
    mid->next = NULL;
}

 

按频数从高到低合并两个有序链表

/*
 * Merges two lists that are each ordered by descending `time` into a single
 * list in the same order and returns its first node.
 *
 * Nodes are relinked, not copied. On equal counts the node from `pre` is
 * placed first. Either argument may be NULL, in which case the other list
 * is returned unchanged.
 *
 * The merge is iterative: a recursive merge needs one stack frame per node,
 * which can exhaust the stack on a long list.
 */
wordNode *SortedMerge(wordNode *pre,wordNode *next)
{
    wordNode *head = NULL;
    wordNode **tail = &head;    /* the link where the next node is attached */

    while(pre != NULL && next != NULL)
    {
        if(pre->time >= next->time)
        {
            *tail = pre;
            pre = pre->next;
        }
        else
        {
            *tail = next;
            next = next->next;
        }
        tail = &(*tail)->next;
    }

    /* At most one list still has nodes; append it as is. */
    *tail = (pre != NULL) ? pre : next;
    return head;
}

 

处理单词(大写转小写,去除非字母字符)

/*
 * Normalizes a word in place: letters are kept and converted to lower case,
 * every other character is removed. The result can be the empty string when
 * the input contains no letters.
 *
 * Characters are classified with <ctype.h>, so the current C locale applies;
 * in the default "C" locale only A-Z and a-z count as letters.
 *
 * A single pass with separate read and write positions checks every
 * character exactly once, including the one that follows a removed
 * character.
 */
void wordJob(char word[])
{
    size_t in;
    size_t out = 0;
    unsigned char ch;

    for(in = 0; word[in] != '\0'; in++)
    {
        /* The <ctype.h> functions require a value representable as
         * unsigned char; a plain char may be negative. */
        ch = (unsigned char)word[in];
        if(isalpha(ch))
        {
            word[out++] = (char)tolower(ch);
        }
    }
    word[out] = '\0';
}

 

释放所有结点内存

/*
 * Frees every node of the global list L, leaving L->next == NULL.
 * The sum and num counters in the header are not changed.
 */
void release()
{
    wordNode *victim;

    /* Unlink the first node before freeing it, until none are left. */
    while((victim = L->next) != NULL)
    {
        L->next = victim->next;
        free(victim);
    }
}

 

ssh://git@git.coding.net:amberpass/cptjgx.git

 

https://git.coding.net/amberpass/cptjgx.git