Trie实现(C++)
参考了本文:http://www.cnblogs.com/xulb597/archive/2012/07/05/2578562.html
- 支持模糊搜索,比如,【bkmh】可以匹配【BuKaManHua】;
- 支持优先级,首字母、大小字母有更高的优先级。
亟需解决的问题:
- 目前搜索结果与关键词中字母的顺序无关,即【buk】可以匹配【BKManHua】
- 若条目中存在重复的任意关键字,即使不包含其他关键字,仍然能匹配上来
- 内存的占用:因为接触C++不久,内存管理一窍不通,一个1.65MB的文件(312964个单词),索引之后程序(VS2013编译,Release版本)的内存有68 860KB,
trie.hpp
#ifndef TRIE #define TRIE #include "LinkedList.hpp" #include "MatchInfo.hpp" #include <stdlib.h> #include <iostream> #include <string> #define BRANCH_SIZE 28 #define START_CHAR 'a' #define INDEX(x) (x == '?'? 27 : (x == '*'? 26 : x - START_CHAR)) class Trie { public: Trie() { rootNode = new Trie_Node(0); memset(nodeMap, NULL, sizeof(nodeMap)); memset(indexList, NULL, sizeof(indexList)); } ~Trie() { //delete rootNode; } void insert(const char *data, const int i) { bool flag_start = false,flag_capital = false; Trie_Node *location = rootNode; int pos = 0; while (*data) { char c = *data; // check wether it's capital and convert to lowwer case if so. if(c > 'A'-1 && c < 'Z'+1) { flag_capital = true; c += 32; }else{ flag_capital = false; } // map the char value to int which starts from 0 int index = INDEX(c); // skip invalid chars if(index < 0) { data++; pos++; continue; } // find next if(location->next[index] == NULL) { location->next[index] = getNode(index); } location = location->next[index]; // build MatchInfo and add it to the trie node's indexList MatchInfo *info = new MatchInfo(); info->itemindex = i; info->position = pos; // position of the char in the string info->priority = 1; // intial or capital char has a higher priority if(!flag_start) { flag_start = true; info->priority++; } if(flag_capital) info->priority++; if(indexList[index] == NULL) indexList[index] = new LinkedList<MatchInfo>(); indexList[index]->add(info); data++; pos++; } // end character has a higher priority //location->indexList->getCurrent()->value->priority++; } /*int match(const char *data) { Trie_Node *location = rootNode; while (*data && location) { location = location->next[INDEX(*data)]; data++; } return (location != NULL); }*/ /*void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength) { predicateIndexMap(data, indexMap, indexMapLength); int index; Trie_Node *location = rootNode; while (*data && location) { index = INDEX(*data); location = location->next[INDEX(*data)]; if(location != NULL) { fillIndexArray(indexMap, index); } data++; } }*/ void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength) { predicateIndexMap(data, indexMap, indexMapLength); int index; Trie_Node *location = nodeMap[INDEX(*data)]; do { index = INDEX(*data); if(location != NULL) { fillIndexArray(indexMap, index); } else { break; } data++; } while ((*data) && (location = nodeMap[index])); } /*void print() { print(rootNode); }*/ private: // // a list to record matche info of each char in indexed words. // it's for priority and fuzzy seaching. // LinkedList<MatchInfo>* indexList[BRANCH_SIZE]; struct Trie_Node { //int index; Trie_Node *next[BRANCH_SIZE]; Trie_Node(int _index) { //index = _index; memset(next, NULL, sizeof(next)); }; ~Trie_Node() { //delete indexList; for (int i = 0; i < BRANCH_SIZE; i++) { if(next[i]) delete next[i]; } } }; Trie_Node *rootNode; // // a map to hold all created Trie_Node. // Trie_Node *nodeMap[BRANCH_SIZE]; // // /*get a trie node from map.*/ // return a new Trie_Node; // index: (char - 'a') // Trie_Node *getNode(int index) { //return new Trie_Node(index); Trie_Node *tempNode = nodeMap[index]; if(tempNode == NULL) { tempNode = new Trie_Node(index); nodeMap[index] = tempNode; } return tempNode; } // // fill [indexMap] with priority of char at [index] // void fillIndexArray(int* indexMap, int index) { if(indexList[index] == NULL) indexList[index] = new LinkedList<MatchInfo>(); LinkedList<MatchInfo> *list = indexList[index]; Node<MatchInfo> *node = list->getRoot(); while (node) { int itemIndex = node->value->itemindex; if(indexMap[itemIndex] != -1) indexMap[itemIndex] += node->value->priority; node = node->next; } } // // keep moving node to next until it's itemindex in value has been changed. // node will set to NULL if reaches the end. // void moveToNextItemIndex(Node<MatchInfo> **node) { int index = (*node)->value->itemindex; if((*node)->next == NULL) (*node) = NULL; else { while ((*node)->value->itemindex == index) { (*node)=(*node)->next; if((*node) == NULL) break; } } } // // predicate whether an index in indexMap is impossiable to be matched. // It will be set to -1 if so. // void predicateIndexMap(const char* keyword, int* indexMap, size_t indexMapLength) { int *indexesMatched = new int[indexMapLength]; int keywordLength = strlen(keyword); unsigned int keywordRecords[BRANCH_SIZE]; size_t size = indexMapLength * sizeof(int); memset(indexesMatched, 0, size); memset(indexMap, -1, size); LinkedList<MatchInfo> *list; Node<MatchInfo> *match_node; int charIndex, index = 0; while (*keyword) { charIndex = INDEX(*keyword); if(keywordRecords[charIndex] == 1) { keyword++; continue; } keywordRecords[charIndex] = 1; list = indexList[charIndex]; if(list != NULL) { match_node = list->getRoot(); while (match_node != NULL) { indexesMatched[match_node->value->itemindex]++; match_node = match_node->next; //moveToNextItemIndex(&match_node); } } keyword++; } for (int i = 0; i < indexMapLength; i++) { if(indexesMatched[i] >= keywordLength) indexMap[i] = 0; } delete indexesMatched; } /*void print(Trie_Node* node) { char c; for (int i = 0; i < BRANCH_SIZE; i++) { if(node->next[i] != NULL)\n{ c = node->index + 'a'; printf("%c-", c); print(node->next[i]); } } }*/ }; #endif // TRIE
LinkedList.hpp
#ifndef LINKEDLIST #define LINKEDLIST #include <stdlib.h> #include <iostream> template <class T> struct Node { T* value; int index; Node *next; Node(T* _value, int _index) { value = _value; index = _index; } ~Node() { delete value; } }; template <class T> class LinkedList { public: int length; LinkedList() { length = 0; root = new Node<T>(NULL, 0); current = root; }; ~LinkedList() { Node<MatchInfo> *node = root; Node<MatchInfo> *tmp; while (node) { tmp = node->next; delete node; node = tmp; } }; void add(T *value) { if(length == 0) { root->value = value; root->index = 0; } else { current->next = new Node<T>(value, current->index + 1); current = current->next; } length++; current->next = NULL; }; Node<T> getAt(int index) { Node<T> *node = root; while (node) { if(node->index == index) return node; node = node->next; } return NULL; } Node<T> *getRoot() { return root; } Node<T> *getCurrent() { return current; } private: Node<T> *root,*current; }; #endif // LINKEDLIST
MatchInfo.hpp
#ifndef DEFINE_MatchInfo #define DEFINE_MatchInfo // // 字符的匹配信息 // struct MatchInfo { // 所在条目的序号 int itemindex; // 所在的位置 int position; // 优先级 int priority; }; #endif
SortBiTree.hpp
#ifndef DEFINE_SortBiTree #define DEFINE_SortBiTree #include <stdlib.h> #include <iostream> #include <string> template <class T> struct BTNode { int index; T value; BTNode<T> *left,*right; BTNode(int _i, T _v) { index = _i; value = _v; left = NULL; right = NULL; } }; template <class T> class SortBiTree { public: SortBiTree() { root = NULL; } ~SortBiTree() { } void add(int index, T value) { BTNode<T> *node = root; if(root == NULL) root = new BTNode<T>(index, value); else { add_iter(root, index, value); } } BTNode<T> *getMaxNode() { return maxNode; } BTNode<T> *getRootNode() { return root; } private: BTNode<T> *root,*minNode,*maxNode; void add_iter(BTNode<T>* node, int index, int value) { if(index > node->index) { if(node->left != NULL) { add_iter(node->left, index, value); } else { node->left = new BTNode<T>(index, value); maxNode = node->left; } } else { if(node->right != NULL) { add_iter(node->right, index, value); } else { node->right = new BTNode<T>(index, value); minNode = node->right; } } } }; #endif
Stopwatch.hpp
#ifndef STOPWATCH_DEF #define STOPWATCH_DEF #include <ctime> class Stopwatch { public: Stopwatch(){}; ~Stopwatch(){}; static Stopwatch* StartNew() { Stopwatch* sw = new Stopwatch(); sw->startTime = clock(); return sw; } double Stop() { double t = (double)(clock() - startTime)/CLOCKS_PER_SEC; delete this; return t; } private: clock_t startTime; }; #endif
main.cpp
#include <stdlib.h> #include <iostream> #include <string> #include <fstream> #include <sys/stat.h> #include "Stopwatch.hpp" #include "trie.hpp" #include "SortBiTree.hpp" using namespace std; int displayMaxItems = 10; unsigned long get_file_size(const char *path) { unsigned long filesize = -1; struct stat statbuff; if(stat(path, &statbuff) < 0){ return filesize; }else{ filesize = statbuff.st_size; } return filesize; } void readFromFile(const char* path, char** buff) { long length = get_file_size(path); // 取得文件大小 if(length == -1){ cerr << "content file is invalid!" << endl; system("quit"); return; } FILE *f = fopen(path, "r"); *buff = new char[length]; fread(*buff, sizeof(char), length, f); fclose(f); } void readInlineChars(char* source, char** buff) { *buff = new char[strlen(source)]; strcpy(*buff, source); } // // 用于显示结果 // TODO:使用迭代器 // void inorder_traverse(BTNode<int> *node, char* words[], int* indexMap, int threshold) { if(displayMaxItems == 0) return; if (NULL != node->left) { inorder_traverse(node->left, words, indexMap, threshold); } int confidence = indexMap[node->value]; if(confidence > threshold) { printf("%i\t%i\t%s\n", node->value, confidence, words[node->value]); displayMaxItems--; } if (NULL != node->right) { inorder_traverse(node->right, words, indexMap, threshold); } } void main(char* argv){ Trie t; SortBiTree<int> bt; char* cpy = new char[]; //===================================== // 注意,大文件请自行修改,去掉words //===================================== char *words[256]; printf("indexing..."); //readFromFile("contents.txt", &cpy); readFromFile("app_list.txt", &cpy); //readInlineChars("PlayShangDian PhotoshopDesigner pho Pho BuKaManHua BKManHua", &cpy); //readInlineChars("abc acc caa acb aaa abb", &cpy); //readInlineChars("aac aaa", &cpy); char *tk = strtok(cpy, " "); size_t index = 0; Stopwatch *sw = Stopwatch::StartNew(); t.insert(tk, index++); while (tk = strtok(NULL, " ")) { words[index] = tk; t.insert(tk, index++); } printf("%i word(s) have been indexed. [%lf seconds]\n", index, sw->Stop()); label_enter: printf("I'm searching for:\n>"); string input; const char *chars; while (true) { cin >> input; chars = input.data(); if(!isalpha(*chars) || (*chars <= 'Z' && *chars >= 'A')) { printf("only lowwer character is accepted!\n>"); } else break; } int *indexMap = new int[index]; sw = Stopwatch::StartNew(); printf("searching..."); t.fuzzy_match(chars, indexMap, index); printf("done. [%lf seconds]\n", sw->Stop()); label_change: bt = SortBiTree<int>(); printf("please input the threshold:\n>"); int threshold = 0; cin >> threshold; printf("calculating..."); int count = 0; for (int i = 0; i < index; i++) { int confidence = indexMap[i]; if(confidence > threshold) { count++; bt.add(confidence, i); } } printf(" [%i] words matched.\n", count); displayMaxItems = 10; if(count > displayMaxItems) printf("first 10 items are listed below.\n"); goto label_display; label_display: printf("------------------------------------------\n"); printf("index\tpriority\tcontent\n"); printf("------------------------------------------\n"); if(bt.getRootNode() != NULL) inorder_traverse(bt.getRootNode(), words, indexMap, threshold); else printf(" none \n"); /*it = &bt.getIterator(); while (it->hasNext()) { int i = it->next(); int confidence = indexMap[i]; if(confidence > threshold) { printf("%s\t%i\t%i\n",words[i], confidence, i); } }*/ printf("------------------------------------------\n"); label_menu: printf("now you may want to : \n[1].See them all.\n[2].Change thresgold\n[3].Change keyword.\n[q].Exit\n>"); char choise = 0; cin >> choise; switch (choise) { case '1': displayMaxItems = -1; goto label_display; break; case '2': goto label_change; break; case '3': goto label_enter; break; case 'q': break; default: break; } //delete indexMap; }
截图: