英文文章词频统计:
功能:统计一篇英文文章中不同单词的个数及每个单词的出现频数并输出,之后按频数从高到低排序,输出频数最高的五个单词及其频数。
实现方法:使用C语言,用fopen函数打开txt文件,用fscanf函数逐个读入单词,用结构体wordNode存储单词及其频数,并以链表的形式连接在一起,最后使用归并排序按频数从高到低排序,输出频数最高的5个单词。
头文件
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
定义宏
/* Status codes for functions returning `status`: OK (0) on success, ERROR on failure. */
#define ERROR 1
#define OK 0
/* Capacity, in chars, of every word buffer (terminating NUL included). */
#define WORD_LENGTH 250
自定义数据类型
/* Integer status code type used as the return type of main and wordCount. */
typedef int status;

/* One entry of the singly linked word-frequency list. */
struct Node {
    char word[WORD_LENGTH]; /* NUL-terminated word text */
    int time;               /* number of occurrences counted so far */
    struct Node *next;      /* following entry; list traversals stop at NULL */
};

typedef struct Node wordNode;
定义全局变量
/* Head of the global word list; NULL while no word has been stored yet. */
wordNode *headNode = NULL;
声明所有使用的函数
/* Return the list node holding word, creating it (and incrementing *num) when absent. */
wordNode *wordSearch(char *word,int *num);
/* Add one occurrence to word's count; returns ERROR if its node is unavailable. */
status wordCount(char *word,int *num);
/* Print *num followed by every stored word with its count. */
void printCountList(int *num);
/* Print at most the first five entries of the list. */
void PrintFirstFiveTimes();
/* Sort the list at *head by count, highest first. */
void mergeSort(wordNode **head);
/* Split a list into a front half (*pre) and a back half (*next). */
void FrontBackSplit(wordNode *head,wordNode **pre,wordNode **next);
/* Clean up word in place (lower-casing, dropping non-letter characters). */
void wordJob(char word[]);
/* Merge two lists, each ordered by descending count, into one. */
wordNode *SortedMerge(wordNode *pre,wordNode *next);
/* Free every node of the global list. */
void release();
主函数
status main(int argc,char *argv[]) { char temp[WORD_LENGTH];//定义用以临时存放单词的数组 FILE *file; int count; int articleWordNum = 0;//定义统计结点个数的变量 int *num = &articleWordNum; if((file = fopen("F:\\zc\\c\\yjs\\file.txt", "r")) == NULL) { printf("文件读取失败!"); exit(1); } while((fscanf(file,"%s",temp))!= EOF) { wordJob(temp); count = wordCount(temp,num); } fclose(file); printf("\n输出所有单词的频数\n"); printCountList(num); printf("\n输出词频最高的5个词\n"); mergeSort(&headNode); //排序 PrintFirstFiveTimes(); release(); return 0; }
查找单词所在结点并返回其地址;若链表中不存在该单词,则新建结点
wordNode *wordSearch(char *word,int *num) { wordNode *node; wordNode *nextNode = headNode; wordNode *preNode = NULL; char a[WORD_LENGTH]; if(headNode == NULL) { node = (wordNode*)malloc(sizeof(wordNode)); strcpy(node->word, word); node->time = 0; *num+=1; headNode = node; return node; } while(nextNode != NULL) //查找匹配单词 { strcpy(a,nextNode->word); if(strcmp(a, word) == 0) { return nextNode; } preNode = nextNode; nextNode = nextNode->next; } if(nextNode == NULL) { node = (wordNode*)malloc(sizeof(wordNode)); strcpy(node->word, word); node->time = 0; node->next = headNode->next; headNode->next = node; *num+=1; return node; } else return nextNode; }
进行词频统计
status wordCount(char *word,int *num) { wordNode *tmpNode = NULL; tmpNode = wordSearch(word,num); //word所在的节点 if(tmpNode == NULL) { return ERROR; } tmpNode->time++; return 0; }
输出所有词频
/*
 * Print the total stored in *num, then every word of the global list with
 * its count, in list order. Prints a "no content" notice when the list is
 * empty.
 */
void printCountList(int *num)
{
    wordNode *cur;

    if (headNode == NULL) {
        printf("该文件无内容!");
        return;
    }

    printf("\n\t总计 %d \n", *num);
    for (cur = headNode; cur != NULL; cur = cur->next) {
        printf("\n\t%s:%d次\n", cur->word, cur->time);
    }
}
输出词频最高的5个词
/*
 * Print the first five entries of the global list (fewer if the list is
 * shorter), each with its count. Prints a "no content" notice when the list
 * is empty. The entries are the most frequent words only if the list has
 * been sorted by descending count beforehand.
 */
void PrintFirstFiveTimes()
{
    wordNode *cur = headNode;
    int shown;

    if (cur == NULL) {
        printf("该文件无内容!");
        return;
    }

    for (shown = 0; shown < 5 && cur != NULL; shown++) {
        printf("\n\t%s:%d次\n", cur->word, cur->time);
        cur = cur->next;
    }
}
对词频统计结果进行归并排序
/*
 * Merge-sort the list whose head pointer is *headnode and store the new head
 * back through headnode. The ordering is the one produced by SortedMerge.
 */
void mergeSort(wordNode **headnode)
{
    wordNode *list = *headnode;
    wordNode *front;
    wordNode *back;

    /* Only lists with at least two nodes need any work. */
    if (list != NULL && list->next != NULL) {
        FrontBackSplit(list, &front, &back);
        mergeSort(&front);
        mergeSort(&back);
        *headnode = SortedMerge(front, back);
    }
}
用快慢指针将链表从中间拆分为前后两个子链表
/*
 * Cut the list starting at source into two halves.
 *
 * *pre receives the front half (it starts at source) and *next the back
 * half. When the length is odd the extra node stays in the front half. For a
 * list of fewer than two nodes *pre is source and *next is NULL.
 */
void FrontBackSplit(wordNode *source,wordNode **pre,wordNode **next)
{
    wordNode *mid;
    wordNode *runner;

    if (source == NULL || source->next == NULL) {
        *pre = source;
        *next = NULL;
        return;
    }

    /* runner moves two nodes for every one of mid, so mid ends on the last
     * node of the front half when runner falls off the list. */
    mid = source;
    runner = source->next;
    while (runner != NULL && runner->next != NULL) {
        mid = mid->next;
        runner = runner->next->next;
    }

    *pre = source;
    *next = mid->next;
    mid->next = NULL;   /* detach the two halves */
}
按频数从高到低合并两个已排序的子链表,并返回合并后的头节点
/*
 * Merge two lists, each already ordered by descending count, into a single
 * list ordered the same way, and return its head.
 *
 * On equal counts the node from pre is placed first. Either argument may be
 * NULL, in which case the other list is returned unchanged. The nodes are
 * relinked in place; nothing is allocated or freed.
 *
 * The merge is iterative, so stack usage does not grow with list length.
 */
wordNode *SortedMerge(wordNode *pre,wordNode *next)
{
    wordNode *merged = NULL;
    wordNode **tail = &merged;   /* where the next chosen node gets linked */

    while (pre != NULL && next != NULL) {
        if (pre->time >= next->time) {
            *tail = pre;
            pre = pre->next;
        } else {
            *tail = next;
            next = next->next;
        }
        tail = &(*tail)->next;
    }

    /* At most one list still has nodes; append it as-is. */
    *tail = (pre != NULL) ? pre : next;
    return merged;
}
处理单词
/*
 * Normalise word in place: ASCII upper-case letters become lower-case and
 * every character that is not an ASCII letter is removed.
 *
 * The result is never longer than the input and may be the empty string
 * (for example when the token consisted only of digits or punctuation).
 */
void wordJob(char word[])
{
    size_t src;
    size_t dst = 0;

    /* Single pass: copy each kept character down to the write position, so
     * no character is ever skipped after a removal. */
    for (src = 0; word[src] != '\0'; src++) {
        char c = word[src];

        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        }
        if (c >= 'a' && c <= 'z') {
            word[dst++] = c;
        }
    }
    word[dst] = '\0';
}
释放所有结点内存
/*
 * Free every node of the global list and leave headNode set to NULL.
 * Does nothing when the list is already empty.
 */
void release()
{
    wordNode *cur = headNode;

    while (cur != NULL) {
        wordNode *following = cur->next;   /* read before the node is freed */
        free(cur);
        cur = following;
    }
    headNode = NULL;
}
git@git.coding.net:amberpass/Calculate_words.git
https://git.coding.net/amberpass/Calculate_words.git
程序运行结果: