KMP和AC自动机

一、在做编译原理龙书3.4节的课后习题时，看到了关于KMP的介绍，于是重新梳理了一遍next失效函数的构造过程和字符串的匹配过程

1、求next过程的关键概念：求最长公共前后缀（既是真前缀又是真后缀的最长子串）的长度。具体可以参考该博文：https://blog.csdn.net/qq_37174526/article/details/90141833

这里重新写了该算法

// Builds the optimized KMP failure ("next") table for the pattern `str`.
//
// The result has one entry per pattern character. Entry i is the pattern
// position to retry after a mismatch at position i; -1 means there is nothing
// left to retry, so the matcher moves on to the next text character and
// restarts at pattern position 0.
vector<int> getNext(const string& str) {
    const string::size_type len = str.length();
    vector<int> table(len, -1);

    int border = -1;  // length of the border currently being extended, -1 = none
    for (string::size_type pos = 0; pos + 1 < len; ) {
        if (border >= 0 && str[border] != str[pos]) {
            // The current border cannot be extended: fall back to a shorter one.
            border = table[border];
            continue;
        }

        ++pos;
        ++border;
        // Optimization over the plain table (which would store `border` here):
        // take the pattern "aba", whose plain table is -1 0 0. When the second
        // 'a' mismatches, jumping back to the first 'a' compares the same
        // character again and is bound to fail too. So when both characters
        // are equal, reuse the fallback of the earlier one and skip that
        // pointless comparison.
        table[pos] = (str[pos] == str[border]) ? table[border] : border;
    }
    return table;
}

// Finds the first occurrence of `key` inside `str` using KMP.
//
// Returns the zero-based offset in `str` where the first occurrence of `key`
// starts, or -1 when `key` does not occur. An empty `key` matches at offset 0.
int match(const string& key, const string& str) {
    const vector<int> next = getNext(key);

    // Work with signed lengths. `pre` becomes -1 whenever the failure table
    // says "give up on this text character"; comparing -1 against the unsigned
    // result of length() would convert it to a huge value, end the loop early
    // and wrongly report "not found" (e.g. for key "ab" in text "cab").
    const int keyLen = static_cast<int>(key.length());
    const int strLen = static_cast<int>(str.length());

    int pre = 0;    // current position in the pattern
    int index = 0;  // current position in the text
    while (index < strLen && pre < keyLen) {
        if (pre < 0 || key[pre] == str[index]) {
            // Either the characters agree, or nothing can be retried for this
            // text character: advance both (pre == -1 becomes 0).
            ++pre;
            ++index;
        } else {
            // Mismatch: keep the text position and retry a shorter prefix.
            pre = next[pre];
        }
    }

    // The whole pattern was consumed, so it ends right before `index`.
    if (pre >= keyLen) {
        return index - pre;
    }
    return -1;
}

// Demo driver: searches two sample texts for the same pattern and prints one
// result per line (the pattern occurs at offset 2 in the first text and does
// not occur in the second).
int main() {
    const char* const pattern = "ababaa";
    const char* const texts[] = { "abababaab", "abababbba" };
    const int textCount = sizeof(texts) / sizeof(texts[0]);

    for (int i = 0; i < textCount; ++i) {
        cout << match(pattern, texts[i]) << endl;
    }

    return 0;
}

View Code

二、编译原理龙书3.4.10的课后习题，编写AC自动机的失效函数

1、书上对于AC自动机的介绍比较少，这里参考了https://zhuanlan.zhihu.com/p/368184958 对于AC自动机的详细介绍

其失效函数的思路是：对每个节点，找到它所代表的字符串的最长真后缀，并且该后缀同时是某个匹配串的前缀；该节点的失效节点就是这个前缀对应的节点。

设某个节点对应的字符为c：先看其父节点的失效节点是否有字符为c的子节点，如果有，则该节点的失效节点就是父节点的失效节点的这个子节点；如果没有，则继续到失效节点的失效节点中查找，直到找到对应的子节点，或者已经到达根节点且根节点也没有对应的子节点（此时失效节点为根节点）。如下就是从父节点的失效节点开始、寻找字符为c的子节点的过程

// Starting at node `parentFailIndex`, walks the chain of fail links until a
// node with an outgoing edge labelled `c` is found.
//
// Returns the index of that edge's target node, or 0 (the root) when the walk
// reaches the root and the root has no edge labelled `c` either.
int getFailIndex(const vector<Node>& nodes, int parentFailIndex, char c) {
    int state = parentFailIndex;
    for (;;) {
        const map<char, int>& edges = nodes[state].children;
        const map<char, int>::const_iterator hit = edges.find(c);
        if (hit != edges.end()) {
            return hit->second;
        }

        // Nothing left to fall back to once the root itself has no such edge.
        if (state == 0) {
            return state;
        }

        state = nodes[state].fail;
    }
}

这里定义了该失效函数对应的结构体

// One state of the Aho-Corasick trie. Nodes live in a vector and refer to
// each other by index into that vector; index 0 is the root.
struct Node {
    map<char, int> children; // outgoing edges: character -> index of the child node
    int fail; // index of the node to fall back to on a mismatch (the root's fail is 0)
    char c; // character on the edge leading into this node ('\0' for the root)
    bool endFlag; // accept flag: true when some pattern string ends at this node
};

如下是整个失效函数的实现过程

vector<Node> createFailArray(const vector<string>& arr) {
    vector<Node> nodes(1);
    nodes[0].fail = 0;
    nodes[0].c = '\0';
    nodes[0].endFlag = false;

    vector<int> parents(arr.size(), 0);
    int maxlen = 0;
    for (int i = 0; i < arr.size(); ++i) {
        maxlen = MAX(maxlen, arr[i].length());
    }

    for (int i = 0; i < maxlen; ++i) {
        for (int j = 0; j < arr.size(); ++j) {
            if (arr[j].length() <= i)
                continue;

            int parentIndex = parents[j];
            char c = arr[j][i];

            map<char, int>& children = nodes[parentIndex].children;
            map<char, int>::iterator iter = children.find(c);
            if (iter == children.end()) {
                nodes.push_back(Node());
                Node& node = nodes.back();
                node.c = c;
                node.fail = getFailIndex(nodes, nodes[parentIndex].fail, c);
                node.endFlag = false;

                // 这里不使用上面获取的children，是因为nodes的push_back之后，可能之前的children会失效
                // 要等参数都构造完成之后，再将本节点加到父节点的子节点中，否则数据会异常
                nodes[parentIndex].children[c] = nodes.size() - 1;
                parents[j] = nodes.size() - 1;
            } else {
                parents[j] = iter->second;
            }

            if (i + 1 == arr[j].length()) {
                nodes[parents[j]].endFlag = true;
            }
        }
    }

    return nodes;
}

那么该如何使用失效函数呢？这里有个简单的例子：统计各个匹配串在给定字符串中出现的总次数（同一个匹配串出现多次会被重复计数）

// Counts the accepting nodes on the fail chain that starts at `index`: the
// node itself and every node reached by following fail links, stopping at the
// root (index 0), which is never counted.
int getEndCount(const vector<Node>& nodes, int index) {
    int hits = 0;
    for (int state = index; state > 0; state = nodes[state].fail) {
        if (nodes[state].endFlag) {
            ++hits;
        }
    }
    return hits;
}

// Feeds `str` through the automaton `nodes` one character at a time and sums,
// for every text position, the number of accepting nodes on the fail chain of
// the state reached there. A pattern that occurs several times is therefore
// counted once per occurrence.
int getAllMatchedCount(const vector<Node>& nodes, const string& str) {
    int total = 0;
    int state = 0;  // start at the root
    for (string::size_type pos = 0; pos < str.length(); ++pos) {
        state = getFailIndex(nodes, state, str[pos]);
        total += getEndCount(nodes, state);
    }
    return total;
}

该例子可以应用到 http://acm.hdu.edu.cn/showproblem.php?pid=2222

posted @ 2022-05-14 23:27 LCAC 阅读(81) 评论(0) 收藏举报

刷新页面返回顶部

KMP和AC自动机

公告