C++里创建 Trie字典树(中文词典)(三)(联想)
萌新做词典第三篇,做得不好,还请指正,谢谢大佬!
今天把词典的联想做好了,也是比较low的,还改了之前的查询、遍历等代码。 Orz
一样地先放上运行结果:
1 test1 2 ID : 2 char : 件 word : 编程软件 3 ID : 3 char : 习 word : 编程学习 4 ID : 4 char : 站 word : 编程学习网站 5 ID : 1 char : 门 word : 编程入门 6 7 test2 8 ID : 5 char : 练 word : 编程训练 9 ID : 1 char : 门 word : 编程入门 10 ID : 3 char : 习 word : 编程学习 11 ID : 4 char : 站 word : 编程学习网站 12 ID : 2 char : 件 word : 编程软件 13 find ID : 3 word : 编程学习 14 15 associate "编程" : 16 find! 17 训练 18 入门 19 学习 20 学习网站 21 软件
测试用的test.cc
1 #include "Dictionary.h" 2 #include <iostream> 3 #include <string> 4 #include <vector> 5 using std::cout; 6 using std::endl; 7 using std::string; 8 using std::vector; 9 10 int test1() 11 { 12 ccx::Dictionary words; 13 string word1 = "编程入门"; 14 string word2 = "编程软件"; 15 string word3 = "编程学习"; 16 string word4 = "编程学习网站"; 17 18 words.push(word1); 19 words.push(word2); 20 words.push(word3); 21 words.push(word4); 22 23 words.resetIt(); 24 25 while(!words.isEnd()) 26 { 27 cout << "ID : " << words.getCurWordId() 28 << "\tchar : " << words.getCurChar() 29 << "\tword : " << words.getCurWord() << endl; 30 words.next(); 31 } 32 33 words.leading_out(); 34 return 0; 35 } 36 37 38 int test2() 39 { 40 ccx::Dictionary words; 41 words.leading_in(); 42 43 44 string word("编程训练"); 45 words.push(word); 46 words.resetIt(); 47 48 while(!words.isEnd()) 49 { 50 cout << "ID : " << words.getCurWordId() 51 << "\tchar : " << words.getCurChar() 52 << "\tword : " << words.getCurWord() << endl; 53 words.next(); 54 } 55 string tmp = "编程学习"; 56 int id = words.search(tmp); 57 if(-1 == id) 58 { 59 cout << "no such word like \"" << tmp << "\"" << endl; 60 }else{ 61 cout << "find ID : " << id 62 << "\tword : " << tmp << endl; 63 } 64 65 cout << endl; 66 cout << "associate \"编程\" : " << endl; 67 68 vector<string> data; 69 string temp = "编程"; 70 71 if(words.associate(temp, data)) 72 { 73 cout << "find!" << endl; 74 for(auto & elem : data) 75 { 76 cout << elem << endl; 77 } 78 }else{ 79 cout << "can't find" << endl; 80 } 81 82 83 return 0; 84 } 85 86 int main() 87 { 88 cout << "test1" << endl; 89 test1(); 90 cout << endl; 91 cout << "test2" << endl; 92 test2(); 93 cout << endl; 94 }
test1不变,test2 在导入后再插入一个词“编程训练”,发现ID是正常的。
然后在test2最后调用联想函数,传入“编程”,能够正常传出所有的字符串。
在做这个的时候,一开始想的很简单,就是拿传入的词去树中查找,找到最后一人字对应的节点,然后以那个节点为根进行遍历。然后就开开心心地去写了,结果写一部分就要对之前的代码进行更改,于是,这个接口越来越“肥”了:
Dictionary.h
1 #ifndef __DICTIONARY_H__ 2 #define __DICTIONARY_H__ 3 4 #include "DictionaryData.h" 5 #include "DictionaryConf.h" 6 7 #include <memory> 8 #include <vector> 9 #include <list> 10 11 namespace ccx{ 12 13 using std::shared_ptr; 14 using std::vector; 15 using std::list; 16 17 class Dictionary 18 { 19 typedef unordered_map<string, pDictElem>::iterator WordIt; 20 public: 21 Dictionary(); 22 void push(const string & word); 23 void push(vector<string> & words); 24 int search(const string & word); 25 bool associate(const string & word, vector<string> & data); 26 private: 27 void AddWord(const string & word, int wordId); 28 void splitWord(const string & word, vector<string> & characters);//把词拆成字 29 int search(vector<string> & data, pDictElem & pcur); 30 pDictElem _dictionary; 31 DictionaryConf _conf; 32 33 //遍历 34 public: 35 string getCurChar(); 36 string getCurWord(); 37 int getCurWordId(); 38 bool isEnd(); 39 void resetIt(); 40 void next(); 41 private: 42 void resetPoint(pDictElem pcur); 43 void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict); 44 void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict); 45 string getCurWord(list<WordIt> & stackWord); 46 47 pDictElem _pcur; 48 WordIt _itcur; 49 50 //用list实现栈,遍历时方便 51 list<WordIt> _stackWord; 52 list<pDictElem> _stackDict; 53 54 //导入导出 55 public: 56 void leading_in(); 57 void leading_out(); 58 }; 59 60 } 61 62 #endif
对几个原有的函数进行了重载,主要是为了能够复用一些代码,但是又想不到合适的新的函数名(英语不太好Orz)。
首先,是要能够查找并返回新的根结点,于是对search进行修改:
1 int Dictionary::search(vector<string> & characters, pDictElem & root) 2 { 3 vector<string>::iterator it_char; 4 it_char = characters.begin(); 5 root = _dictionary; 6 int i = 0; 7 for(; it_char != characters.end(); ++it_char, ++i) 8 { 9 WordIt it_word; 10 it_word = root->_words.find(*it_char); 11 12 if(it_word == root->_words.end()) 13 { 14 break; 15 }else{ 16 root = it_word->second; 17 } 18 } 19 return i; 20 }
形参第一项是分解后的字集,第二项是一个智能指针,指向某个节点。这里返回值改为了字集的第几项,有两个目的:
1、插入函数中可以方便地知道下一个要插入的是哪个字符
2、联想函数中可以判断字集中的字是否都存在于词典中
3、好吧,我没想到其它好办法,而且当时是想到上面两点就这么做了,后来发现,插入部分的代码根本就不用改
然后是重载search:
1 int Dictionary::search(const string & word) 2 { 3 pDictElem root = _dictionary; 4 vector<string> temp; 5 splitWord(word, temp); 6 7 int ret = search(temp, root); 8 int size = temp.size(); 9 if(ret != size) 10 { 11 return -1; 12 } 13 return root->_wordId; 14 }
在这里对字进行分解,并定义一个临时的根结点,这样做的目的是为了保护private中的根结点,并且可以在多线程环境中互不干扰。
能够找到“新的根”后,就要对它进行遍历了。如果只有单一线程或进程来使用它,这里可以直接把resetPoint(原来的)修改一下,设置指定结点就可以了:
1 void Dictionary::resetPoint(pDictElem pcur) 2 { 3 _pcur = pcur; 4 if(_stackDict.size()) 5 { 6 _stackDict.clear(); 7 } 8 if(_stackWord.size()) 9 { 10 _stackWord.clear(); 11 } 12 next(); 13 }
如果是这样,那前面也完全不用修改。由于这个词典最后是要应用到miniSearchEngin中,于是我对遍历部分的函数进行了修改:
1 void Dictionary::next() 2 { 3 next(_pcur, _stackWord, _stackDict); 4 } 5 6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 7 { 8 while(pcur) 9 { 10 nextWord(pcur, stackWord, stackDict); 11 if(!pcur || pcur->_wordId) 12 { 13 break; 14 } 15 } 16 } 17 18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 19 { 20 if(pcur) 21 { 22 if(pcur->_words.size()) 23 { 24 stackDict.push_back(pcur); 25 stackWord.push_back(pcur->_words.begin()); 26 pcur = stackWord.back()->second; 27 }else{ 28 ++(stackWord.back()); 29 } 30 while(stackWord.back() == stackDict.back()->_words.end()) 31 { 32 stackDict.pop_back(); 33 stackWord.pop_back(); 34 if(!stackDict.size()) 35 { 36 pcur = NULL; 37 } 38 ++(stackWord.back()); 39 } 40 if(pcur) 41 { 42 pcur = stackWord.back()->second; 43 } 44 } 45 }
next部分,改为传入参数,这样就可以在associate里定义临时的栈和智能指针等,遍历的时候与其它工作并没有任何关系。
同样地,getWord也要做相同的更改:
1 string Dictionary::getCurWord() 2 { 3 return getCurWord(_stackWord); 4 } 5 6 string Dictionary::getCurWord(list<WordIt> & stackWord) 7 { 8 string temp; 9 list<WordIt>::iterator it_word; 10 it_word = stackWord.begin(); 11 12 for(; it_word != stackWord.end(); ++it_word) 13 { 14 temp += (*it_word)->first; 15 } 16 return temp; 17 }
当然了,对外提供的接口都是不要传参的,其它的只能在内部使用,于是放入了private区。
终于可以开始写联想了0.0
1 bool Dictionary::associate(const string & word, vector<string> & data) 2 { 3 pDictElem root = _dictionary; 4 vector<string> temp; 5 splitWord(word, temp); 6 7 int ret = search(temp, root); 8 int size = temp.size(); 9 if(ret != size) 10 { 11 return false; 12 } 13 14 list<WordIt> stackWord; 15 list<pDictElem> stackDict; 16 next(root, stackWord, stackDict); 17 while(root) 18 { 19 string temp = getCurWord(stackWord); 20 data.push_back(temp); 21 next(root, stackWord, stackDict); 22 } 23 24 if(!data.size()) 25 { 26 return false; 27 } 28 return true; 29 }
返回bool类型,可以方便地判断是否联想成功,即以传入的词做为前缀,能否找到剩余部分(词典里有存)。于是乎,一个渣渣型号的词典就做好啦~~~
Dictionary.cc
1 #include "Dictionary.h" 2 #include <iostream> 3 #include <fstream> 4 #include <string> 5 #include <json/json.h> 6 7 namespace ccx{ 8 9 using std::endl; 10 using std::cout; 11 using std::pair; 12 using std::ofstream; 13 using std::ifstream; 14 15 Dictionary::Dictionary() 16 : _dictionary(new DictElem) 17 , _conf() 18 { 19 _dictionary->_wordId = 0; 20 _pcur = _dictionary; 21 } 22 23 void Dictionary::splitWord(const string & word, vector<string> & characters) 24 { 25 int num = word.size(); 26 int i = 0; 27 while(i < num) 28 { 29 int size = 1; 30 if(word[i] & 0x80) 31 { 32 char temp = word[i]; 33 temp <<= 1; 34 do{ 35 temp <<= 1; 36 ++size; 37 }while(temp & 0x80); 38 } 39 string subWord; 40 subWord = word.substr(i, size); 41 characters.push_back(subWord); 42 i += size; 43 } 44 } 45 46 void Dictionary::AddWord(const string & word, int wordId) 47 { 48 vector<string> characters; 49 splitWord(word, characters); 50 51 vector<string>::iterator it_char; 52 it_char = characters.begin(); 53 pDictElem root; 54 root = _dictionary; 55 for(; it_char != characters.end(); ++it_char) 56 { 57 WordIt it_word; 58 it_word = root->_words.find(*it_char); 59 60 if(it_word == root->_words.end()) 61 { 62 pair<string, pDictElem> temp; 63 temp.first = *it_char; 64 pDictElem dictemp(new DictElem); 65 dictemp->_word = *it_char; 66 dictemp->_wordId = 0; 67 temp.second = dictemp; 68 root->_words.insert(temp); 69 root = dictemp; 70 }else{ 71 root = it_word->second; 72 } 73 } 74 if(!root->_wordId) 75 { 76 root->_wordId = wordId; 77 } 78 } 79 80 void Dictionary::push(const string & word) 81 { 82 ++(_dictionary->_wordId); 83 AddWord(word, _dictionary->_wordId); 84 } 85 86 void Dictionary::push(vector<string> & words) 87 { 88 int size = words.size(); 89 for(int i = 0; i < size; ++i) 90 { 91 push(words[i]); 92 } 93 } 94 95 int Dictionary::search(const string & word) 96 { 97 pDictElem root = _dictionary; 98 vector<string> temp; 99 splitWord(word, temp); 100 101 int ret = search(temp, root); 102 int size = temp.size(); 103 if(ret != size) 104 { 105 return -1; 106 } 107 return root->_wordId; 108 } 109 110 int Dictionary::search(vector<string> & characters, pDictElem & root) 111 { 112 vector<string>::iterator it_char; 113 it_char = characters.begin(); 114 root = _dictionary; 115 int i = 0; 116 for(; it_char != characters.end(); ++it_char, ++i) 117 { 118 WordIt it_word; 119 it_word = root->_words.find(*it_char); 120 121 if(it_word == root->_words.end()) 122 { 123 break; 124 }else{ 125 root = it_word->second; 126 } 127 } 128 return i; 129 } 130 131 bool Dictionary::associate(const string & word, vector<string> & data) 132 { 133 pDictElem root = _dictionary; 134 vector<string> temp; 135 splitWord(word, temp); 136 137 int ret = search(temp, root); 138 int size = temp.size(); 139 if(ret != size) 140 { 141 return false; 142 } 143 144 list<WordIt> stackWord; 145 list<pDictElem> stackDict; 146 next(root, stackWord, stackDict); 147 while(root) 148 { 149 string temp = getCurWord(stackWord); 150 data.push_back(temp); 151 next(root, stackWord, stackDict); 152 } 153 154 if(!data.size()) 155 { 156 return false; 157 } 158 return true; 159 } 160 161 //遍历用 162 163 void Dictionary::resetPoint(pDictElem pcur) 164 { 165 _pcur = pcur; 166 if(_stackDict.size()) 167 { 168 _stackDict.clear(); 169 } 170 if(_stackWord.size()) 171 { 172 _stackWord.clear(); 173 } 174 next(); 175 } 176 177 void Dictionary::resetIt() 178 { 179 resetPoint(_dictionary); 180 } 181 182 void Dictionary::next() 183 { 184 next(_pcur, _stackWord, _stackDict); 185 } 186 187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 188 { 189 while(pcur) 190 { 191 nextWord(pcur, stackWord, stackDict); 192 if(!pcur || pcur->_wordId) 193 { 194 break; 195 } 196 } 197 } 198 199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 200 { 201 if(pcur) 202 { 203 if(pcur->_words.size()) 204 { 205 stackDict.push_back(pcur); 206 stackWord.push_back(pcur->_words.begin()); 207 pcur = stackWord.back()->second; 208 }else{ 209 ++(stackWord.back()); 210 } 211 while(stackWord.back() == stackDict.back()->_words.end()) 212 { 213 stackDict.pop_back(); 214 stackWord.pop_back(); 215 if(!stackDict.size()) 216 { 217 pcur = NULL; 218 } 219 ++(stackWord.back()); 220 } 221 if(pcur) 222 { 223 pcur = stackWord.back()->second; 224 } 225 } 226 } 227 228 string Dictionary::getCurChar() 229 { 230 return _pcur->_word; 231 } 232 233 int Dictionary::getCurWordId() 234 { 235 return _pcur->_wordId; 236 } 237 238 string Dictionary::getCurWord() 239 { 240 return getCurWord(_stackWord); 241 } 242 243 string Dictionary::getCurWord(list<WordIt> & stackWord) 244 { 245 string temp; 246 list<WordIt>::iterator it_word; 247 it_word = stackWord.begin(); 248 249 for(; it_word != stackWord.end(); ++it_word) 250 { 251 temp += (*it_word)->first; 252 } 253 return temp; 254 } 255 256 bool Dictionary::isEnd() 257 { 258 return _pcur == NULL; 259 } 260 261 void Dictionary::leading_in()//导入,失败没必要退出程序 262 { 263 ifstream ifs; 264 const char * path = _conf.getDictionaryPath().c_str(); 265 ifs.open(path); 266 if(!ifs.good()) 267 { 268 cout << "open Dictionary.json error(leading_in)" << endl; 269 }else{ 270 Json::Value root; 271 Json::Reader reader; 272 273 if(!reader.parse(ifs, root, false)) 274 { 275 cout << "json read Dictionary.json error" << endl; 276 }else{ 277 int size = root.size(); 278 for(int i = 0; i < size; ++i) 279 { 280 string word = root[i]["Word"].asString(); 281 int wordId = root[i]["WordId"].asInt(); 282 AddWord(word, wordId); 283 ++(_dictionary->_wordId); 284 } 285 } 286 } 287 } 288 289 void Dictionary::leading_out() 290 { 291 Json::Value root; 292 Json::FastWriter writer; 293 294 resetIt(); 295 296 while(!isEnd()) 297 { 298 Json::Value elem; 299 elem["Word"] = getCurWord(); 300 elem["WordId"] = getCurWordId(); 301 root.append(elem); 302 next(); 303 } 304 305 string words; 306 words = writer.write(root); 307 308 ofstream ofs; 309 const char * path = _conf.getDictionaryPath().c_str(); 310 ofs.open(path); 311 if(!ofs.good()) 312 { 313 cout << "open Dictionary.json error(leading_out)" << endl; 314 ofs.open("Dictionary.tmp"); 315 if(!ofs.good()) 316 { 317 exit(EXIT_FAILURE); 318 } 319 } 320 321 ofs << words; 322 ofs.close(); 323 } 324 325 }