AC自动机

要学会AC自动机,我们必须知道什么是Trie,也就是字典树。最好对KMP算法也有些了解。Trie树和KMP算法我之前博客都有写过,感兴趣的可以看看。


简单叙述下问题,现在给出
"hsay";
"ah";
"sahe";
"he";
"say";
"herhb";
"aher";
"erhs"

共8个关键词,要问字符串"yasaherhsay"中这8个关键词有几个出现过。

答案是7。

这就是一个多模式匹配问题。


AC自动机算法分为3步:构造一棵Trie树,构造失败指针和模式匹配过程。

失败指针和KMP算法中的next函数或称shift函数的功能类似。


                

上图(原文中的配图,此处未能显示)解释了失败指针的作用:当在某个结点处无法继续向下匹配时,失败指针指向Trie树中与当前已匹配字符串的某个后缀相同的结点,匹配可以从那里继续,而不必回到根结点重新开始。

// AC_automachine.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include<algorithm>
#include<cstdlib>
#include<cstring>
#include<iostream>
#include<set>
#include<string>
#include<vector>

using namespace std;

#define MAXSIZE 26  



struct TrieNode
{
	TrieNode* next[MAXSIZE];
	TrieNode*parent;
	vector<TrieNode*>fail;
	char p;
	int Num;
	bool isword;
};

set<string>re;// Result store: AC_automachine() inserts every distinct keyword it finds into this set

// Creates the root of an empty trie.
//
// Returns a node allocated with new that has no children, no parent, a
// keyword count of zero and is not marked as a word. Nothing in this
// function releases the node.
TrieNode*initiate_Trie()
{
	TrieNode*root = new TrieNode;
	for (int i = 0; i < MAXSIZE; i++)
		root->next[i] = NULL;
	root->parent = NULL;
	// The root is not reached through any letter, but give the field a
	// defined value so it is never read as indeterminate.
	root->p = '\0';
	root->Num = 0;
	root->isword = false;
	return root;
}

// Reports whether str was inserted into the trie as a complete keyword.
//
// root: root of the trie; a NULL root yields false.
// str:  NUL-terminated candidate; a NULL pointer yields false.
//
// Returns true only when every character of str follows an existing edge
// and the node reached is marked as a word. A character that does not map
// to one of the MAXSIZE child slots cannot be part of any keyword, so it
// yields false instead of indexing outside the child array.
bool search(TrieNode*root, char*str)
{
	if (root == NULL || str == NULL)
		return false;

	TrieNode*cur = root;
	for (; *str != '\0'; str++)
	{
		int k = *str - 'a';
		if (k < 0 || k >= MAXSIZE)
			return false;
		cur = cur->next[k];
		if (cur == NULL)
			return false;
	}
	// A mere prefix of a longer keyword does not count as a hit.
	return cur->isword;
}

// Inserts one keyword into the trie.
//
// root: root of the trie.
// str:  NUL-terminated keyword made of the letters 'a'..'z'.
//
// Returns root in every case. The trie is left untouched when root or str
// is NULL, when str is empty, when str contains a character that has no
// child slot, or when the keyword is already present. Otherwise the Num
// counter of the root and of every node on the keyword's path is increased
// by one, missing nodes are allocated with new, and the last node is marked
// as a word.
TrieNode*build_Trie_singleword(TrieNode*root, char*str)
{
	if (root == NULL || str == NULL || *str == '\0')
		return root;

	// Validate the whole keyword before changing anything, so a bad
	// character can neither index outside next[] nor leave a half-inserted
	// path with bumped counters behind.
	for (const char*c = str; *c != '\0'; c++)
	{
		int k = *c - 'a';
		if (k < 0 || k >= MAXSIZE)
			return root;
	}

	// Duplicates must not be counted twice.
	if (search(root, str))
		return root;

	root->Num = root->Num + 1;
	TrieNode*tn = root;
	for (; *str != '\0'; str++)
	{
		int k = *str - 'a';
		TrieNode*child = tn->next[k];
		if (child == NULL)
		{
			child = new TrieNode;
			for (int i = 0; i < MAXSIZE; i++)
				child->next[i] = NULL;
			child->p = *str;
			child->Num = 1;
			child->parent = tn;
			child->isword = false;
			tn->next[k] = child;
		}
		else
		{
			child->Num = child->Num + 1;
		}
		tn = child;
	}
	tn->isword = true;
	return root;
}

// Fills the fail list of node and of every node below it.
//
// root: root of the trie.
// node: subtree to process; pass root to process the whole trie.
//
// For each non-root node the letters on the path back to the root are
// gathered one at a time. After each letter, the suffix collected so far is
// looked up from the root; when the lookup succeeds on a node other than
// node itself, and that node is not yet listed, it is appended to
// node->fail. Suffixes are therefore tried from shortest to longest. Each
// visited non-root node also writes its letter to cout before its fail list
// is computed.
void initiate_fail_pointer(TrieNode*root, TrieNode*node)
{
	if (node != root)
	{
		cout << node->p;

		// Letters of the path, last letter first.
		vector<char>tail;
		for (TrieNode*anc = node; anc != root; anc = anc->parent)
		{
			tail.push_back(anc->p);

			// Spell the current suffix from the root, reading tail backwards.
			TrieNode*walker = root;
			bool whole = true;
			for (int idx = static_cast<int>(tail.size()) - 1; idx >= 0; idx--)
			{
				TrieNode*step = walker->next[tail[idx] - 'a'];
				if (step == NULL)
				{
					whole = false;
					break;
				}
				walker = step;
			}

			// The full path leads back to node itself, which is not a
			// proper suffix and must be left out.
			if (whole && walker != node
				&& find(node->fail.begin(), node->fail.end(), walker) == node->fail.end())
			{
				node->fail.push_back(walker);
			}
		}
	}

	for (int c = 0; c < MAXSIZE; c++)
	{
		TrieNode*child = node->next[c];
		if (child != NULL)
			initiate_fail_pointer(root, child);
	}
}


int AC_automachine(TrieNode*root, char*str)
{
	int count = 0;
	int len = strlen(str);
	int k = 0;
	

	while (k < len)
	{
		while (root->next[str[k] - 'a'] == NULL)
		{
			k++;
		}

		TrieNode*p,*node = root->next[str[k] - 'a'];
		p = NULL;
		while (node != NULL)
		{
			if (node->isword == true)
			{
				string aa;
				TrieNode*nn = node;
				while (nn != root)
				{
					aa += nn->p;
					nn = nn->parent;
				}
				std::reverse(aa.begin(), aa.end());
				if (re.find(aa) == re.end())
				{
					re.insert(aa);
					count++;
				}
			}
			if (!node->fail.empty())
			{
				for (int i = 0; i < node->fail.size(); i++)
					if (node->fail[i]->isword)
					{
						string aa;
						TrieNode*nn = node->fail[i];
						while (nn != root)
						{
							aa += nn->p;
							nn = nn->parent;
						}
						std::reverse(aa.begin(), aa.end());
						if (re.find(aa) == re.end())
						{
							re.insert(aa);
							count++;
						}
					}
			}
			k++;
			p = node;
			node = node->next[str[k] - 'a'];
		}

		k--;
		node = p;
		_ASSERT(node);
		if (node->fail.empty())
		{
			k++;
		}
		else
		{
			int max = 0;
			TrieNode*tn, *tp;
			tn = NULL;
			int kk;
			for (int i = 0; i < node->fail.size(); i++)
			{
				kk = 0;
				tp = node->fail[i];
				while (tp != NULL)
				{
					if (tp->isword)
					{
						string aa;
						TrieNode*nn = tp;
						while (nn != root)
						{
							aa += nn->p;
							nn = nn->parent;
						}
						std::reverse(aa.begin(), aa.end());
						if (re.find(aa) == re.end())
						{
							re.insert(aa);
							count++;
						}
					}
					if (!tp->fail.empty())
					{
						for (int i = 0; i < tp->fail.size(); i++)
							if (tp->fail[i]->isword)
							{
								string aa;
								TrieNode*nn = tp->fail[i];
								while (nn != root)
								{
									aa += nn->p;
									nn = nn->parent;
								}
								std::reverse(aa.begin(), aa.end());
								if (re.find(aa) == re.end())
								{
									re.insert(aa);
									count++;
								}
							}
					}
					kk++;
					p = tp;
					tp = tp->next[str[k + kk] - 'a'];
				}
				if (kk > max)
				{
					max = kk;
					tn = p;
					_ASSERT(tn);
				}
			}
			if (!tn->fail.empty())
			{
				int maxlen=0;
				for (int i = 0; i < tn->fail.size(); i++)
				{
					TrieNode*mm = tn->fail[i];
					int kkk = 0;
					while (mm != root)
					{
						mm = mm->parent;
						kkk++;
					}
					if (kkk > maxlen)
						maxlen = kkk;
				}
				k = k + kk - maxlen;
			}
			else
			{
				k = k + kk;
			}
		}//end of else
	}

	return count;

}


// Demo driver: builds the trie for eight keywords, fills the fail lists and
// prints how many of the keywords occur in the sample text.
int _tmain(int argc, _TCHAR* argv[])
{
	// The keywords and the text live in writable arrays because the trie
	// functions take char*, and a string literal does not convert to char*
	// in standard C++11 and later. The longest keyword has 5 letters, hence
	// 6 bytes per entry including the terminating NUL.
	char keywords[][6] = { "hsay", "ah", "sahe", "he", "say", "herhb", "aher", "erhs" };
	char text[] = "yasaherhsay";

	TrieNode*root = initiate_Trie();
	for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++)
		root = build_Trie_singleword(root, keywords[i]);

	initiate_fail_pointer(root, root);
	cout << endl;
	cout << AC_automachine(root, text) << endl;

	system("pause");
	return 0;
}



版权声明:

posted on 2015-08-10 00:58  moffis  阅读(164)  评论(0)  编辑  收藏  举报

导航