词频统计
要求:
(1). 实现一个控制台程序,给定一段英文字符串,统计其中各个英文单词(4字符以上含4字符)的出现频率。 附加要求:读入一段文本文件,统计该文本文件中单词的频率。
(2). 性能分析:
- 对C++代码运行VS的性能分析工具,找出性能问题并进行优化。
- 对Java程序运行性能分析工具 NetBeans IDE 6.0,找出性能问题并进行优化。
预计完成时间:4H 实际完成时间:5-6H
github:https://github.com/yuanchenhui/zuoye:
#include <iostream> #include <ctype.h> #include <algorithm> #include <string> using namespace std; struct Word //单词结构体 { string Str; int Count=0; void exchange(Word &word) //单词交换,用于排序 { string tStr = word.Str; int tCount = word.Count; word.Str = Str; word.Count = Count; Str = tStr; Count = tCount; } }; void CalcCount(Word *words, string &newWord, int size) //词频统计 { int i = 0; for (; i < size; i++) { if (words[i].Str == newWord) { words[i].Count++; return; } else if (words[i].Str == "") break; } words[i].Str = newWord; words[i].Count = 1; } void SortWordDown(Word * words, int size) //根据词频降序排序 { for (int i = 0; i < size; i++) { for (int j = 0; j < size - 1; j++) { if (words[j].Count < words[j + 1].Count) { words[j].exchange(words[j + 1]); } } } } int main() { Word * words; string content; cout << "Word is case insensitive, i.e. “file123”, “123FILE” and “File” are considered the same word."; getline(cin, content); int wCount = 1; //记录单词总数 for (unsigned int i = 0; i < content.length(); i++) { if (isalnum(content[i]) == 0) //非字母数字 { wCount++; } } words = new Word[wCount]; string::size_type offset = content.find(‘ ‘); //单词分隔;size_type用以保存string对象的长度 while (offset != string::npos) { string wStr = content.substr(0, offset); //string.substr()返回一个从指定位置返回指定长度的字符串 if (wStr.length() < 4) //除去长度小于4的单词 { wCount--; content.erase(0, offset + 1); offset = content.find(‘ ‘); continue; } content.erase(0, offset + 1); //string.erase()删除从指定位置开始的指定长度的字符 transform(wStr.begin(), wStr.end(), wStr.begin(), ::tolower); CalcCount(words, wStr, wCount); offset = content.find(‘ ‘); } if (content.length() >= 4) { transform(content.begin(), content.end(), content.begin(), ::tolower); CalcCount(words, content, wCount); //计算最后一个单词 } else wCount--; for (int i = 0; i < wCount; i++) { if (words[i].Str == "") { wCount--; } } SortWordDown(words, wCount); cout << "词频统计:" << endl; for (int i = 0; i < wCount - 1; i++) { cout << words[i].Str << "频率:" << words[i].Count << "次" << endl; } cin.get(); delete[] words; return 0; }
运行结果: