自然语言理解 之 统计词频

统计词频,中文字符编码格式:GB2312。

  #include <algorithm>
  #include <cstdlib>
  #include <ctime>
  #include <fstream>
  #include <functional>
  #include <iostream>
  #include <map>
  #include <sstream>
  #include <string>
  #include <unordered_map>
  #include <vector>
 using namespace std;

 // Shorthand for one (n-gram, occurrence count) record and for an iterator over a
 // vector of such records.
 // clock_t is deliberately not re-declared here: <ctime> already provides it, and a
 // second `typedef long clock_t;` fails to compile wherever the library's clock_t is
 // not `long`.
 typedef pair<string, int> Pair_StrInt;
 typedef vector<Pair_StrInt>::iterator Vec_Pair_StrInt_Itr;

 // Fatal-error helpers: print a message to stderr and terminate with exit status 1.
 // Each one is wrapped in do { } while (0) so that it expands to exactly one statement
 // and therefore stays correct after an unbraced `if` or in front of an `else`.
 #define ERROR0 do { std::cerr << "Open error !!!" << std::endl; std::exit(1); } while (0)
 #define ERROR1 do { std::cerr << "无法识别 !!!" << std::endl; std::exit(1); } while (0)

 // Number of leading entries of each frequency table that the generator samples from.
 #define Lim 100

 string infile = "Ci.txt";                   // corpus read by InitText()
 string outfile1 = "out1.txt";               // frequency table of single characters
 string outfile2 = "out2.txt";               // frequency table of two-character runs
 string outfile3 = "out3.txt";               // frequency table of three-character runs
 string project_time = "project_time.txt";   // timing report written by Solve()

 // Entries loaded by makePoetry() and sampled by Poetry(); sized by Lim so the array
 // bounds and the `rand() % Lim` index range cannot drift apart.
 string One_strArr[Lim];
 string Two_strArr[Lim];
 string Three_strArr[Lim];

 ifstream fin;    // shared input stream, used by InitText()
 ofstream fout;   // shared output stream, used by myOutput()
 string Text;     // filtered corpus: the two-byte characters kept by InitText()

 // One poem template: makePoetry() concatenates "---", Chant and Rules into the pattern
 // string that Poetry() expands.
 struct myNode {
     std::string Chant;  // tune title, printed verbatim
     std::string Rules;  // layout pattern interpreted character by character by Poetry()
 };

 // Ordering used to sort frequency records: larger count first. Records with the same
 // count are ordered by ascending key, which makes the sorted result reproducible even
 // though the records are copied out of an unordered_map (unspecified iteration order)
 // and std::sort is not stable.
 // The parameter type is the same type as the file's Pair_StrInt typedef.
 bool Pair_StrInt_Cmp(const std::pair<std::string, int>& p0, const std::pair<std::string, int>& p1) {
     if (p0.second != p1.second) { return p0.second > p1.second; }
     return p0.first < p1.first;
 }

 // Scratch table shared by the n-gram counters: key = n-gram bytes, value = occurrences.
 std::unordered_map<std::string, int> StrInt_Hash;

 40 void InitText(string _infile) {
 41     fin.open(_infile);
 42     if (!fin) { ERROR0; }
 43 
 44     //////////////////////////////////////////////////////////////////////////
 45     // 将整个文件读入 string : 流迭代器
 46     std::ostringstream tmp;
 47     tmp << fin.rdbuf();
 48     string Text_tmp = tmp.str();
 49     //////////////////////////////////////////////////////////////////////////
 50 
 51     
 52     unsigned char Judge;
 53     string strTmp;
 54 
 55     int len = Text_tmp.size();
 56     for (int i = 0; i < len; ) {
 57         Judge = Text_tmp[i];
 58         if (Judge >= 0xB0 && Judge <= 0xF7) {
 59             strTmp = Text_tmp.substr(i, 2);
 60             i += 2;
 61             Text += strTmp;
 62         }
 63         else { ++i; }
 64     }
 65 
 66     fin.close();
 67     fin.clear();
 68 }
 69 
 70 // 输出到文件
 71 void myOutput(const vector<Pair_StrInt> &StrInt_Vec, string out) {
 72     fout.open(out);
 73     if (!fout) { ERROR0; }
 74 
 75     vector<Pair_StrInt>::const_iterator pair_itr;
 76     for (pair_itr = StrInt_Vec.begin(); pair_itr != StrInt_Vec.end(); ++pair_itr) {
 77         fout << pair_itr->first << "\t" << pair_itr->second << endl;
 78     }
 79 
 80     fout.close();
 81     fout.clear();
 82 }
 83 
 84 // 获取一个中文字的词频
 85 void getOneWord(string out1) {
 86     string strTmp;
 87 
 88     int str_len = Text.size();
 89     for (int i = 0; i < str_len; i += 2) {
 90         strTmp = Text.substr(i, 2);
 91         StrInt_Hash[strTmp] += 1;
 92     }
 93     
 94     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end());
 95     StrInt_Hash.clear();
 96     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);
 97 
 98     myOutput(StrInt_Vec, out1);
 99 
100     StrInt_Vec.clear();
101 }
102 
103 // 获取两个中文字的词频
104 void getTwoWord(string out2) {
105     string strTmp;
106 
107     int str_len = Text.size();
108     for (int i = 0; i < (str_len - 2); i += 2) {
109         strTmp = Text.substr(i, 4);
110         StrInt_Hash[strTmp] += 1;
111     }
112 
113     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end());
114     StrInt_Hash.clear();
115     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);
116 
117     myOutput(StrInt_Vec, out2);
118 
119     StrInt_Vec.clear();
120 }
121 
122 // 获取三个中文字的词频
123 void getThreeWord(string out3) {
124     string strTmp;
125 
126     int str_len = Text.size();
127     for (int i = 0; i < (str_len - 4); i += 2) {
128         strTmp = Text.substr(i, 6);
129         StrInt_Hash[strTmp] += 1;
130     }
131 
132     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end());
133     StrInt_Hash.clear();
134     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);
135 
136     myOutput(StrInt_Vec, out3);
137 
138     StrInt_Vec.clear();
139 }
140 
141 // 自动生成词
142 void Poetry(string _strTmp) {
143     int len = _strTmp.size();
144     int myRandom;
145     srand((unsigned)(time(NULL)));
146     for (int i = 0; i < len; ++i) {
147         switch (_strTmp[i])
148         {
149         case '2': {
150             myRandom = rand() % Lim;
151             cout << Two_strArr[myRandom];
152             break;
153         }
154         case '1': {
155             myRandom = rand() % Lim;
156             cout << One_strArr[myRandom];
157             break;
158         }
159         case '3': {
160             myRandom = rand() % Lim;
161             cout << Three_strArr[myRandom];
162             break;
163         }
164         case '0': {
165             cout << '\n';
166             break;
167         }
168         case '-': {
169             cout << "  ";
170             break;
171         }
172         default: {
173             cout << _strTmp.substr(i, 2);
174             ++i;
175             break;
176         }
177         }
178     }
179     cout << endl;
180 }
181 
182 // 生成词前的预处理
183 void makePoetry(string out1, string out2, string out3) {
184     ifstream fin1, fin2, fin3;
185     ofstream fout1, fout2, fout3;
186     fin1.open(out1);
187     if (!fin1) { ERROR0; }
188     fin2.open(out2);
189     if (!fin2) { ERROR0; }
190     fin3.open(out3);
191     if (!fin3) { ERROR0; }
192     string strTmp;
193     for (int i = 0; i < Lim; ++i) {
194         getline(fin1, strTmp);
195         One_strArr[i] = strTmp.substr(0, 2);
196         getline(fin2, strTmp);
197         Two_strArr[i] = strTmp.substr(0, 4);
198         getline(fin3, strTmp);
199         Three_strArr[i] = strTmp.substr(0, 6);
200     }
201 
202     myNode node0;
203     node0.Chant = "念奴娇";
204     node0.Rules = "·220-22,12,222。22,21:222。22,22,23。22,222。0-222,23,22。22,3222。22,23,22。22,222。0";
205 
206     string strTmp0 = "---" + node0.Chant + node0.Rules;
207     Poetry(strTmp0);
208     //system("pause");
209 }
210 
211 void Solve() {
212     
213     InitText(infile);
214 
215     ofstream fout;
216     fout.open(project_time);
217     clock_t myStart, myFinish;
218     double totaltime;
219     //////////////////////////////////////////////////////////////////////////
220     myStart = clock();
221     //////////////////////////////////////////////////////////////////////////
222     getOneWord(outfile1);
223     //////////////////////////////////////////////////////////////////////////
224     getTwoWord(outfile2);
225     /////////////////////////////////////////////////////////////////////////
226     getThreeWord(outfile3);
227     //////////////////////////////////////////////////////////////////////////
228 
229     myFinish = clock();
230     totaltime = (double)(myFinish - myStart) / CLOCKS_PER_SEC;
231 
232     fout << "运行时间为: " << totaltime << " 秒。" << endl;
233     fout.close();
234     fout.clear();
235     
236 
237     makePoetry(outfile1, outfile2, outfile3);
238 }
239 
240 int main() {
241     Solve();
242     return 0;
243 }
posted @ 2014-12-03 18:33  JmingS  阅读(1227)  评论(2)  编辑  收藏  举报