自然语言理解 之 统计词频
统计词频,中文字符编码格式:GB2312。
1 #include <iostream> 2 #include <fstream> 3 #include <algorithm> 4 #include <functional> 5 #include <string> 6 #include <vector> 7 #include <map> 8 #include <unordered_map> 9 #include <sstream> 10 #include <ctime> 11 using namespace std; 12 13 typedef long clock_t; 14 typedef pair<string, int> Pair_StrInt; 15 typedef vector<Pair_StrInt>::iterator Vec_Pair_StrInt_Itr; 16 #define ERROR0 cerr << "Open error !!!" << endl; exit(1); 17 #define ERROR1 cerr << "无法识别 !!!" << endl; exit(1); 18 #define Lim 100 19 20 string infile = "Ci.txt"; 21 string outfile1 = "out1.txt"; 22 string outfile2 = "out2.txt"; 23 string outfile3 = "out3.txt"; 24 string project_time = "project_time.txt"; 25 string One_strArr[100]; 26 string Two_strArr[100]; 27 string Three_strArr[100]; 28 ifstream fin; 29 ofstream fout; 30 string Text; 31 32 struct myNode { 33 string Chant; // 词牌名 34 string Rules; // 格式 35 }; 36 37 bool Pair_StrInt_Cmp(const Pair_StrInt& p0, const Pair_StrInt& p1) { return (p0.second > p1.second); } 38 unordered_map<string, int> StrInt_Hash; 39 40 void InitText(string _infile) { 41 fin.open(_infile); 42 if (!fin) { ERROR0; } 43 44 ////////////////////////////////////////////////////////////////////////// 45 // 将整个文件读入 string : 流迭代器 46 std::ostringstream tmp; 47 tmp << fin.rdbuf(); 48 string Text_tmp = tmp.str(); 49 ////////////////////////////////////////////////////////////////////////// 50 51 52 unsigned char Judge; 53 string strTmp; 54 55 int len = Text_tmp.size(); 56 for (int i = 0; i < len; ) { 57 Judge = Text_tmp[i]; 58 if (Judge >= 0xB0 && Judge <= 0xF7) { 59 strTmp = Text_tmp.substr(i, 2); 60 i += 2; 61 Text += strTmp; 62 } 63 else { ++i; } 64 } 65 66 fin.close(); 67 fin.clear(); 68 } 69 70 // 输出到文件 71 void myOutput(const vector<Pair_StrInt> &StrInt_Vec, string out) { 72 fout.open(out); 73 if (!fout) { ERROR0; } 74 75 vector<Pair_StrInt>::const_iterator pair_itr; 76 for (pair_itr = StrInt_Vec.begin(); pair_itr != StrInt_Vec.end(); ++pair_itr) { 77 fout << pair_itr->first 
<< "\t" << pair_itr->second << endl; 78 } 79 80 fout.close(); 81 fout.clear(); 82 } 83 84 // 获取一个中文字的词频 85 void getOneWord(string out1) { 86 string strTmp; 87 88 int str_len = Text.size(); 89 for (int i = 0; i < str_len; i += 2) { 90 strTmp = Text.substr(i, 2); 91 StrInt_Hash[strTmp] += 1; 92 } 93 94 vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end()); 95 StrInt_Hash.clear(); 96 std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp); 97 98 myOutput(StrInt_Vec, out1); 99 100 StrInt_Vec.clear(); 101 } 102 103 // 获取两个中文字的词频 104 void getTwoWord(string out2) { 105 string strTmp; 106 107 int str_len = Text.size(); 108 for (int i = 0; i < (str_len - 2); i += 2) { 109 strTmp = Text.substr(i, 4); 110 StrInt_Hash[strTmp] += 1; 111 } 112 113 vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end()); 114 StrInt_Hash.clear(); 115 std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp); 116 117 myOutput(StrInt_Vec, out2); 118 119 StrInt_Vec.clear(); 120 } 121 122 // 获取三个中文字的词频 123 void getThreeWord(string out3) { 124 string strTmp; 125 126 int str_len = Text.size(); 127 for (int i = 0; i < (str_len - 4); i += 2) { 128 strTmp = Text.substr(i, 6); 129 StrInt_Hash[strTmp] += 1; 130 } 131 132 vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end()); 133 StrInt_Hash.clear(); 134 std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp); 135 136 myOutput(StrInt_Vec, out3); 137 138 StrInt_Vec.clear(); 139 } 140 141 // 自动生成词 142 void Poetry(string _strTmp) { 143 int len = _strTmp.size(); 144 int myRandom; 145 srand((unsigned)(time(NULL))); 146 for (int i = 0; i < len; ++i) { 147 switch (_strTmp[i]) 148 { 149 case '2': { 150 myRandom = rand() % Lim; 151 cout << Two_strArr[myRandom]; 152 break; 153 } 154 case '1': { 155 myRandom = rand() % Lim; 156 cout << One_strArr[myRandom]; 157 break; 158 } 159 case '3': { 160 myRandom = rand() % Lim; 161 cout << Three_strArr[myRandom]; 162 break; 163 } 164 case '0': { 165 cout << 
'\n'; 166 break; 167 } 168 case '-': { 169 cout << " "; 170 break; 171 } 172 default: { 173 cout << _strTmp.substr(i, 2); 174 ++i; 175 break; 176 } 177 } 178 } 179 cout << endl; 180 } 181 182 // 生成词前的预处理 183 void makePoetry(string out1, string out2, string out3) { 184 ifstream fin1, fin2, fin3; 185 ofstream fout1, fout2, fout3; 186 fin1.open(out1); 187 if (!fin1) { ERROR0; } 188 fin2.open(out2); 189 if (!fin2) { ERROR0; } 190 fin3.open(out3); 191 if (!fin3) { ERROR0; } 192 string strTmp; 193 for (int i = 0; i < Lim; ++i) { 194 getline(fin1, strTmp); 195 One_strArr[i] = strTmp.substr(0, 2); 196 getline(fin2, strTmp); 197 Two_strArr[i] = strTmp.substr(0, 4); 198 getline(fin3, strTmp); 199 Three_strArr[i] = strTmp.substr(0, 6); 200 } 201 202 myNode node0; 203 node0.Chant = "念奴娇"; 204 node0.Rules = "·220-22,12,222。22,21:222。22,22,23。22,222。0-222,23,22。22,3222。22,23,22。22,222。0"; 205 206 string strTmp0 = "---" + node0.Chant + node0.Rules; 207 Poetry(strTmp0); 208 //system("pause"); 209 } 210 211 void Solve() { 212 213 InitText(infile); 214 215 ofstream fout; 216 fout.open(project_time); 217 clock_t myStart, myFinish; 218 double totaltime; 219 ////////////////////////////////////////////////////////////////////////// 220 myStart = clock(); 221 ////////////////////////////////////////////////////////////////////////// 222 getOneWord(outfile1); 223 ////////////////////////////////////////////////////////////////////////// 224 getTwoWord(outfile2); 225 ///////////////////////////////////////////////////////////////////////// 226 getThreeWord(outfile3); 227 ////////////////////////////////////////////////////////////////////////// 228 229 myFinish = clock(); 230 totaltime = (double)(myFinish - myStart) / CLOCKS_PER_SEC; 231 232 fout << "运行时间为: " << totaltime << " 秒。" << endl; 233 fout.close(); 234 fout.clear(); 235 236 237 makePoetry(outfile1, outfile2, outfile3); 238 } 239 240 int main() { 241 Solve(); 242 return 0; 243 }