AC自动机

ac自动机

优点

多模式匹配

fail指针

i -> fail = j => word[j]是word[i]的最长后缀

示例

给定词组 {he,she,hers,his}

给定字符串:ahishers,判断 字符串是否包含以上单词

  1. 建立树状图,并记录节点保存的单词长度
  2. h节点和s节点都和root一样,后缀为空。层次遍历。找fail指针,一个节点A的fail指针,找他父节点B的fail指针(father-fail)指向的节点C,指向与A点相同的C点的叶子结点。如图比如5号节点,fa-fail指向root,root有一个和h相同的叶子结点2号节点,所以5fail指针指向2。6号节点,fa-fail指向2,同时2号节点也有个e边,所以6的fail指向3。此时3号节点记录了一个完整的单词长度,此时6号节点也存储进去。

C++代码

#include <bits/stdc++.h>

using namespace std;

#define DEBUG (1)

#define Len(x)              sizeof(x)/sizeof(x[0]) 

/**********************************************
    aho corasick
**********************************************/
#define ALPHABET   (26)

int cnt = 0;                // debug

typedef struct _NODE
{
    int name;               // debug
    vector<int> exist;
    _NODE *fail;
    _NODE *child[ALPHABET];
    _NODE()
    {
        name = 0;           // debug
        fail = NULL;
        memset(child, NULL, ALPHABET*sizeof(_NODE *));
    }
}NODE;

/**********************************************
    "he" "she" "hers" "his" 
    ahishers
     his
       she
        he
        hers
**********************************************/
void print_matching_result(const char *T, int start, int len)
{
    for(int i = 0; i < start; i++)
        printf(" ");
    for(int i = 0; i < len; i++)
        printf("%c",T[start+i]);
    printf("\n");
}

/**********************************************
    用于debug确认构造完成的状态机的[fail指针]和[exist信息]
    bfs遍历一遍trie,按顺序给节点命名,同时输出fail指向信息
**********************************************/
void print_automaton_info(NODE *tmp)
{
    printf("---------------------info----------------------\n");
    queue<NODE *> q;
    q.push(tmp);
    while(!q.empty())
    {
        NODE *tmp = q.front(); q.pop();
        tmp->name = cnt++;
        if(tmp->fail)
            printf("%d --fail--> %d, has %d word\n", tmp->name, tmp->fail->name, tmp->exist.size());
        for(int i = 0; i < ALPHABET; i++)
            if(tmp->child[i])
                q.push(tmp->child[i]);
    }
    printf("---------------------end----------------------\n");
}

void trie_insert(NODE *root, const char *word)
{
    NODE *tmp = root;

    for(int i = 0; i < strlen(word); i++)
    {
        int c = word[i]-'a';
        if(NULL == tmp->child[c])
            tmp->child[c] = new NODE();
        tmp = tmp->child[c];
    }
    tmp->exist.push_back(strlen(word));
}

void ac_build(NODE *root, const char *P[], int n)
{
    for(int i = 0; i < n; i++)
        trie_insert(root, P[i]);
    
    queue<NODE *> q;
    for(int i = 0; i < ALPHABET; i++)
    {
        if(root->child[i])
        {
            root->child[i]->fail = root;
            q.push(root->child[i]);
        }
    }

    while(!q.empty())
    {
        NODE *x = q.front(); q.pop();
        for(int i = 0; i < ALPHABET; i++)
        {
            if(x->child[i])
            {
                NODE *y = x->child[i], *fafail = x->fail;
                /*
                    x--fail-→fafail         x--fail-→fafail
                      ↘ⁱ             ==>     ↘ⁱ        ↘ⁱ      
                        y                      y--fail--→★
                */
                while(fafail && NULL == fafail->child[i])
                    fafail = fafail->fail;
                if(NULL == fafail)
                    y->fail = root;
                else
                    y->fail = fafail->child[i];

                if(y->fail->exist.size())
                    for(int j = 0; j < y->fail->exist.size(); j++)
                        y->exist.push_back(y->fail->exist[j]);
                q.push(y);
            }
        }
    }
}

void ac_query(NODE *root, const char *T)
{
    NODE *tmp = root;
    for(int i = 0; i < strlen(T); i++)
    {
        int c = T[i]-'a';
        while(NULL == tmp->child[c] && tmp->fail)
            tmp = tmp->fail;
        if(tmp->child[c])
            tmp = tmp->child[c];
        else
            continue;
        if(tmp->exist.size())
        {
            for(int j = 0; j < tmp->exist.size(); j++)
            {
                int len = tmp->exist[j];
                print_matching_result(T, i-len+1, len);
            }
        }
    }
}

void aho_corasick(const char *P[], int n, const char *T)
{
    printf("**********************************************\n");
    for(int i = 0; i < n; i++)
        printf("\"%s\" ", P[i]);
    printf("\n%s\n", T);

    NODE *root = new NODE();
    ac_build(root, P, n);
    ac_query(root, T);

    print_automaton_info(root);         // debug
}

int main()
{
    const char *P[] = {"he", "she", "hers", "his", "is"}; 
    const char *T = "ahishersheishiser";

    aho_corasick(P, Len(P), T);

    return 0;
}
posted @ 2020-09-27 20:29  Jimmyhe  阅读(105)  评论(0编辑  收藏  举报