词频统计更新
实现功能:从控制台读取文件路径,统计文件中的单词总数及不重复的单词数,输出所有单词的词频,并按词频从高到低排序后输出出现次数最多的 10 个单词。
头文件
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
定义宏
/* Size, in bytes, of every word buffer in this program (the scratch buffer
 * in main and the word field of each list node). */
#define WORD_LENGTH 250
定义结构体及全局变量
/*
 * One entry of the singly linked word-frequency list.
 */
typedef struct Node {
    char word[WORD_LENGTH];  /* the word text, NUL-terminated */
    int time;                /* number of occurrences counted so far */
    struct Node *next;       /* next entry, or NULL at the end of the list */
} wordNode;

/*
 * List header: running totals plus the pointer to the first entry.
 */
typedef struct TopNode {
    int sum;         /* total number of words in the text */
    int num;         /* number of distinct words in the text */
    wordNode *next;  /* first entry of the list, or NULL when empty */
} TopNode;

/* Storage for the single list header used by the program. */
TopNode t;

/* Pointer through which every function reaches the header; main points it
 * at t before any other function is called. */
TopNode *L = NULL;
声明文件中使用的函数
/*
 * Forward declarations of the functions defined in this file.
 * Parameterless functions are declared with (void) so the compiler checks
 * that they are called without arguments.
 */

/* Returns the list node holding word, creating it if it is not present. */
wordNode *wordSearch(char *word);

/* Normalises word in place before it is counted. */
void wordJob(char word[]);

/* Increments the occurrence counter of word. */
void wordCount(char *word);

/* Prints the totals and every word with its count. */
void printCountList(void);

/* Prints at most the first ten entries of the list. */
void PrintFirstTenTimes(void);

/* Merge-sorts the list whose head pointer is *head. */
void mergeSort(wordNode **head);

/* Splits the list starting at head into two halves, *pre and *next. */
void FrontBackSplit(wordNode *head, wordNode **pre, wordNode **next);

/* Merges two sorted lists and returns the head of the result. */
wordNode *SortedMerge(wordNode *pre, wordNode *next);

/* Frees every node of the list. */
void release(void);
主函数
int main(int argc,char *argv[]) { char temp[WORD_LENGTH];//定义用以临时存放单词的数组 char file_path[100]; wordNode * h; FILE *file; printf("请输入文件路径:"); gets(file_path); if((file = fopen(file_path, "r")) == NULL) { printf("文件读取失败!"); exit(1); } L = &t; L->num = 0; L->sum = 0; L->next = NULL; while((fscanf(file,"%s",temp))!= EOF) { L->sum++; wordJob(temp); wordCount(temp); } fclose(file); printCountList(); printf("\n\n输出词频最高的10个词\n"); h = L->next; mergeSort(&h); //排序 PrintFirstTenTimes(); release(); return 0; }
查找单词所在节点并返回;若链表中不存在该单词,则新建节点并插入表头
/*
 * Finds the list node whose word equals the given string.
 *
 * If no such node exists, a new node with a zero count is allocated,
 * inserted at the front of the list, and the distinct-word counter L->num
 * is incremented.
 *
 * word: NUL-terminated string to look up; it is copied into the new node
 *       (truncated to the node's buffer size if longer).
 *
 * Returns the matching or newly created node; never NULL. If the allocation
 * fails the program reports the error and terminates.
 */
wordNode *wordSearch(char *word)
{
    wordNode *node;

    for (node = L->next; node != NULL; node = node->next) {
        if (strcmp(node->word, word) == 0) {
            return node;
        }
    }

    /* Not found (or the list is empty): prepend a fresh node. */
    node = malloc(sizeof *node);
    if (node == NULL) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    snprintf(node->word, sizeof node->word, "%s", word);
    node->time = 0;          /* the caller increments the count */
    node->next = L->next;    /* NULL when the list was empty */
    L->next = node;
    L->num++;
    return node;
}
词频统计
void wordCount(char *word) { wordNode *tmpNode; tmpNode = wordSearch(word); //word所在的节点 tmpNode->time++; }
输出所有词频
/*
 * Prints the total word count, the distinct word count and then every word
 * with its frequency, four entries per output line.
 * When the list is empty only a "file has no content" message is printed.
 */
void printCountList()
{
    int printed = 0;   /* entries written so far, drives the line breaks */
    wordNode *cur;

    if (L->next == NULL) {
        printf("该文件无内容!");
        return;
    }

    printf("\n这篇文章总计%d词\n\n不重复单词共%d个\n", L->sum, L->num);
    printf("\n输出所有单词的频数\n");

    for (cur = L->next; cur != NULL; cur = cur->next) {
        printf(" %s:%d次\t", cur->word, cur->time);
        /* Break the line after every fourth entry. */
        if (++printed % 4 == 0) {
            printf("\n");
        }
    }
}
输出词频最高的10个词
/*
 * Prints up to the first ten entries of the list, one per line, in list
 * order. When the list is empty a "file has no content" message is printed
 * instead.
 */
void PrintFirstTenTimes()
{
    wordNode *cur = L->next;
    int shown;

    if (cur == NULL) {
        printf("该文件无内容!");
        return;
    }

    for (shown = 0; cur != NULL && shown < 10; shown++, cur = cur->next) {
        printf("\t%s:%d次\n", cur->word, cur->time);
    }
}
对词频统计结果进行归并排序
/*
 * Recursive merge sort of a singly linked list.
 *
 * headnode: address of the pointer to the first node; on return it points
 *           to the first node of the sorted list.
 *
 * The list is split in two with FrontBackSplit, each half is sorted
 * recursively, and the halves are joined with SortedMerge.
 */
void mergeSort(wordNode **headnode)
{
    wordNode *front;
    wordNode *back;
    wordNode *list = *headnode;

    /* A list of zero or one node is already sorted. */
    if (list != NULL && list->next != NULL) {
        FrontBackSplit(list, &front, &back);
        mergeSort(&front);
        mergeSort(&back);
        *headnode = SortedMerge(front, back);   /* merge the sorted halves */
    }
}
将链表从中间拆分为前后两个子链表
/*
 * Splits a list into a front half and a back half.
 *
 * source: first node of the list to split (may be NULL).
 * pre:    receives the head of the front half (always source).
 * next:   receives the head of the back half, or NULL when the list has
 *         fewer than two nodes.
 *
 * For an odd number of nodes the extra node stays in the front half.
 * The list is cut in place by clearing the next pointer of the last node
 * of the front half.
 */
void FrontBackSplit(wordNode *source, wordNode **pre, wordNode **next)
{
    wordNode *mid;      /* advances one node per step */
    wordNode *runner;   /* advances two nodes per step */

    *pre = source;
    if (source == NULL || source->next == NULL) {
        *next = NULL;
        return;
    }

    mid = source;
    runner = source->next;
    /* When runner can no longer take two steps, mid is the last node of
     * the front half. */
    while (runner != NULL && runner->next != NULL) {
        mid = mid->next;
        runner = runner->next->next;
    }

    *next = mid->next;
    mid->next = NULL;
}
合并两个已按频数降序排列的链表(频数较大的节点排在前面)
/*
 * Merges two lists that are each sorted by descending count into a single
 * list sorted the same way.
 *
 * pre, next: heads of the two input lists; either may be NULL.
 *
 * Returns the head of the merged list. Nodes are relinked in place, nothing
 * is allocated. On equal counts the node from pre is placed first.
 *
 * The merge is iterative so that stack usage does not grow with the length
 * of the lists.
 */
wordNode *SortedMerge(wordNode *pre, wordNode *next)
{
    wordNode *head = NULL;
    wordNode **link = &head;   /* where the next chosen node is attached */

    while (pre != NULL && next != NULL) {
        if (pre->time >= next->time) {
            *link = pre;
            pre = pre->next;
        } else {
            *link = next;
            next = next->next;
        }
        link = &(*link)->next;
    }

    /* At most one list still has nodes; append it as a whole. */
    *link = (pre != NULL) ? pre : next;
    return head;
}
处理单词:将大写字母转换为小写,并去除所有非字母字符
/*
 * Normalises a word in place: ASCII upper-case letters are converted to
 * lower case and every character that is not an ASCII letter is removed.
 *
 * word: NUL-terminated, writable string. The result is never longer than
 *       the input and may be empty.
 *
 * A single pass with separate read and write positions examines every
 * character exactly once, so a character that follows a removed one is
 * still lower-cased or removed as required.
 */
void wordJob(char word[])
{
    size_t src;
    size_t dst = 0;

    for (src = 0; word[src] != '\0'; src++) {
        char c = word[src];

        if (c >= 'A' && c <= 'Z') {
            word[dst++] = (char)(c - 'A' + 'a');
        } else if (c >= 'a' && c <= 'z') {
            word[dst++] = c;
        }
        /* any other character is dropped */
    }
    word[dst] = '\0';
}
释放所有结点内存
/*
 * Frees every node of the word list and leaves the header with an empty
 * list (L->next == NULL).
 */
void release()
{
    wordNode *cur = L->next;

    while (cur != NULL) {
        wordNode *following = cur->next;   /* save before the node is freed */

        free(cur);
        cur = following;
    }
    L->next = NULL;
}
ssh://git@git.coding.net:amberpass/cptjgx.git
https://git.coding.net/amberpass/cptjgx.git