AC自动机
要学会AC自动机,我们必须知道什么是Trie,也就是字典树。最好对KMP算法也有些了解。Trie树和KMP算法我之前博客都有写过,感兴趣的可以看看。
简单叙述下问题,现在给出
"hsay";
"ah";
"sahe";
"he";
"say";
"herhb";
"aher";
"erhs"
共8个关键词,要问字符串"yasaherhsay"中这8个关键词有几个出现过。
答案是7:除了"herhb"之外,其余7个关键词都在该字符串中出现过。
这就是一个多模式匹配问题。
AC自动机算法分为3步:构造一棵Trie树,构造失败指针和模式匹配过程。
失败指针和KMP算法中的next函数(或称shift函数)的功能类似:当在某个节点上匹配失败时,失败指针指向Trie树中与当前已匹配部分具有最长公共后缀的另一个节点,这样文本指针无需回退就可以继续匹配。
上图解释了失败指针的作用。
// AC_automachine.cpp : Defines the entry point for the console application.
//
// Aho-Corasick style multi-pattern matching over the lowercase alphabet
// 'a'..'z'. Written as portable standard C++: the MSVC-only precompiled
// header ("stdafx.h") and the _tmain/_TCHAR entry point are replaced by
// standard headers and a plain main().
//
// Define AC_AUTOMATON_NO_MAIN before this file to compile the trie/matcher
// without the demo entry point (e.g. when embedding it in a test driver).

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Number of outgoing edges per trie node: one per letter 'a'..'z'.
const int MAXSIZE = 26;

// One node of the keyword trie.
struct TrieNode {
    TrieNode* next[MAXSIZE];      // child per letter, nullptr when absent
    TrieNode* parent;             // nullptr for the root
    std::vector<TrieNode*> fail;  // nodes spelling the proper suffixes of this
                                  // node's string that exist in the trie,
                                  // ordered from shortest to longest
    char p;                       // letter on the edge from parent ('\0' for root)
    int Num;                      // number of stored keywords passing through this node
    bool isword;                  // true when a keyword ends exactly here

    TrieNode() : parent(nullptr), p('\0'), Num(0), isword(false) {
        std::fill(next, next + MAXSIZE, static_cast<TrieNode*>(nullptr));
    }
};

// Keywords found so far. AC_automachine only counts a keyword the first time
// it is inserted here, so results accumulate across calls until the caller
// clears this set.
std::set<std::string> re;

// Maps 'a'..'z' to 0..25; returns -1 for any other character so callers never
// index TrieNode::next out of bounds.
static int letter_index(char c) {
    return (c >= 'a' && c <= 'z') ? (c - 'a') : -1;
}

// Rebuilds the string spelled by the path root -> node by walking the parent
// links upwards and reversing the collected letters.
static std::string spell_keyword(const TrieNode* root, const TrieNode* node) {
    std::string word;
    for (const TrieNode* cur = node; cur != nullptr && cur != root; cur = cur->parent) {
        word += cur->p;
    }
    std::reverse(word.begin(), word.end());
    return word;
}

// Creates an empty trie and returns its root. Release it with destroy_Trie().
TrieNode* initiate_Trie() {
    return new TrieNode;
}

// Frees every node of the trie rooted at `root` (nullptr is allowed).
void destroy_Trie(TrieNode* root) {
    if (root == nullptr) {
        return;
    }
    std::vector<TrieNode*> pending(1, root);
    while (!pending.empty()) {
        TrieNode* cur = pending.back();
        pending.pop_back();
        for (int i = 0; i < MAXSIZE; ++i) {
            if (cur->next[i] != nullptr) {
                pending.push_back(cur->next[i]);
            }
        }
        delete cur;
    }
}

// Returns true when `str` was stored as a complete keyword.
// Strings containing characters outside 'a'..'z' are never found.
bool search(TrieNode* root, const char* str) {
    if (root == nullptr || str == nullptr) {
        return false;
    }
    const TrieNode* node = root;
    for (; *str != '\0'; ++str) {
        const int k = letter_index(*str);
        if (k < 0 || node->next[k] == nullptr) {
            return false;
        }
        node = node->next[k];
    }
    return node->isword;
}

// Inserts keyword `str` into the trie and returns `root`.
// The trie is left untouched when the keyword is already present or when it
// contains a character outside 'a'..'z'.
TrieNode* build_Trie_singleword(TrieNode* root, const char* str) {
    if (root == nullptr || str == nullptr) {
        return root;
    }
    // Validate first so a bad character cannot leave a half-inserted word.
    for (const char* c = str; *c != '\0'; ++c) {
        if (letter_index(*c) < 0) {
            return root;
        }
    }
    if (search(root, str)) {
        return root;
    }

    root->Num += 1;
    TrieNode* node = root;
    for (; *str != '\0'; ++str) {
        TrieNode*& child = node->next[letter_index(*str)];
        if (child == nullptr) {
            child = new TrieNode;
            child->p = *str;
            child->parent = node;
        }
        child->Num += 1;
        node = child;
    }
    node->isword = true;
    return root;
}

// Recomputes node->fail: for every proper, non-empty suffix of the string
// spelled by `node`, the trie node spelling that suffix (if it exists).
static void rebuild_suffix_nodes(TrieNode* root, TrieNode* node) {
    const std::string path = spell_keyword(root, node);
    const std::size_t len = path.size();

    node->fail.clear();  // makes repeated calls idempotent
    for (std::size_t suffix_len = 1; suffix_len < len; ++suffix_len) {
        TrieNode* cur = root;
        for (std::size_t i = len - suffix_len; i < len && cur != nullptr; ++i) {
            const int k = letter_index(path[i]);
            cur = (k < 0) ? nullptr : cur->next[k];
        }
        if (cur != nullptr) {
            node->fail.push_back(cur);
        }
    }
}

// Builds the failure information for `node` and all of its descendants.
// Call as initiate_fail_pointer(root, root) after all keywords are inserted
// and before AC_automachine(); adding keywords afterwards requires a rebuild.
void initiate_fail_pointer(TrieNode* root, TrieNode* node) {
    if (root == nullptr || node == nullptr) {
        return;
    }
    // Explicit stack instead of recursion so depth is not limited by the
    // call stack.
    std::vector<TrieNode*> pending(1, node);
    while (!pending.empty()) {
        TrieNode* cur = pending.back();
        pending.pop_back();
        if (cur != root) {
            rebuild_suffix_nodes(root, cur);
        }
        for (int i = 0; i < MAXSIZE; ++i) {
            if (cur->next[i] != nullptr) {
                pending.push_back(cur->next[i]);
            }
        }
    }
}

// Records the keyword ending at `node` in `re`; returns 1 when it was not
// recorded before, otherwise 0.
static int record_if_word(TrieNode* root, TrieNode* node) {
    if (!node->isword) {
        return 0;
    }
    return re.insert(spell_keyword(root, node)).second ? 1 : 0;
}

// Scans `str` and records every keyword that occurs in it in `re`.
// Returns how many keywords were recorded for the first time by this call
// (keywords already present in `re` are not counted again).
// Characters outside 'a'..'z' cannot be part of a keyword; they reset the
// automaton to the root.
int AC_automachine(TrieNode* root, const char* str) {
    if (root == nullptr || str == nullptr) {
        return 0;
    }
    int count = 0;
    TrieNode* state = root;
    for (; *str != '\0'; ++str) {
        const int k = letter_index(*str);
        if (k < 0) {
            state = root;
            continue;
        }
        // On a mismatch fall back to the longest proper suffix that exists in
        // the trie (last entry of `fail`), or to the root when there is none.
        while (state != root && state->next[k] == nullptr) {
            state = state->fail.empty() ? root : state->fail.back();
        }
        if (state->next[k] != nullptr) {
            state = state->next[k];
        }
        if (state == root) {
            continue;
        }
        // Every keyword ending at this text position is either the current
        // state itself or one of its suffix nodes.
        count += record_if_word(root, state);
        for (std::size_t i = 0; i < state->fail.size(); ++i) {
            count += record_if_word(root, state->fail[i]);
        }
    }
    return count;
}

#ifndef AC_AUTOMATON_NO_MAIN
// Demo: counts how many of the eight keywords occur in "yasaherhsay".
int main() {
    const char* const keywords[] = {
        "hsay", "ah", "sahe", "he", "say", "herhb", "aher", "erhs"
    };

    TrieNode* root = initiate_Trie();
    for (std::size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); ++i) {
        root = build_Trie_singleword(root, keywords[i]);
    }
    initiate_fail_pointer(root, root);

    std::cout << AC_automachine(root, "yasaherhsay") << std::endl;

    destroy_Trie(root);
#ifdef _WIN32
    std::system("pause");  // keep the console window open, as before
#endif
    return 0;
}
#endif  // AC_AUTOMATON_NO_MAIN
版权声明: