用C++后缀自动机实现最长公共子串算法,并封装成Python库

后缀自动机的C++代码转自https://e-maxx.ru/algo/suffix_automata,其余封装为自写。

 

在C++文件同级目录建立setup.py文件,代码如下:

#!/usr/bin/env python
"""Build script for the ``sam`` extension module, compiled from sam_lcs.cpp."""
# NOTE(review): distutils is deprecated (PEP 632) and was removed from the
# standard library in Python 3.12. On such interpreters this import fails;
# setuptools exposes the same ``setup``/``Extension`` names - confirm which
# Python versions must be supported before switching.
from distutils.core import setup, Extension

# Name of both the distribution and the importable extension module; it must
# match the PyInit_<name> function exported by the C++ source.
mod = "sam"
setup(name=mod, ext_modules=[Extension(mod, sources=['sam_lcs.cpp'])])

 

封装完后缀自动机的源码后,命令行编译、安装、卸载,安装后即可在Python里import调用:

# Compile the extension in place (output goes to ./build).
python setup.py build
# Install through pip so that the installed files are recorded and can be
# removed again later.
python -m pip install .
# setup.py has no "uninstall" command; removal is done by pip.
python -m pip uninstall sam

 

包装模块的C++函数编写如下:

// Python.h goes first: it may define feature-test macros that affect the
// standard headers included after it.
#include <Python.h>

#include <cstddef>
#include <exception>
#include <map>
#include <new>
#include <string>
#include <vector>

using namespace std;

// One state of the suffix automaton.
struct state
{
    int len;                   // length of the longest string that reaches this state
    int link;                  // suffix link (index into st); -1 for the initial state
    std::map<char, int> next;  // outgoing transitions: byte -> index of target state
};

// Number of input bytes whose automaton is kept allocated between calls.
// Inputs longer than this are still handled (storage grows on demand); the
// extra memory is simply released again on the next sa_init().
const int MAXLEN = 100000;

// State storage. Grows as needed, so the input length is no longer limited by
// a fixed-size array. Always address states by index: push_back may reallocate.
std::vector<state> st;
int sz = 0;    // number of states currently in use (always equals st.size())
int last = 0;  // index of the state that corresponds to the whole text added so far

// Appends a fresh state with the given length and suffix link, returns its index.
static int sa_new_state(int len, int link)
{
    st.push_back(state());
    st.back().len = len;
    st.back().link = link;
    return sz++;
}

// Resets the automaton to a single initial state (len 0, link -1).
// Only the states that were actually used are destroyed, so the cost is
// proportional to the previous input rather than to a fixed maximum.
void sa_init()
{
    if (st.capacity() > static_cast<std::size_t>(MAXLEN) * 2)
        std::vector<state>().swap(st);  // give back memory held after a huge input
    else
        st.clear();
    sz = 0;
    last = sa_new_state(0, -1);
}

// Extends the automaton with one more byte `c` of the text.
void sa_extend(char c)
{
    // Be safe if the caller forgot sa_init(): there must be an initial state.
    if (st.empty())
        sa_init();

    const int cur = sa_new_state(st[last].len + 1, 0);

    // Walk the suffix links and add the missing transition on `c`.
    int p = last;
    while (p != -1 && st[p].next.count(c) == 0)
    {
        st[p].next[c] = cur;
        p = st[p].link;
    }

    if (p == -1)
    {
        // No state on the path had a transition on `c`.
        st[cur].link = 0;
    }
    else
    {
        const int q = st[p].next[c];
        if (st[p].len + 1 == st[q].len)
        {
            st[cur].link = q;
        }
        else
        {
            // Split q: the clone takes over q's transitions and suffix link
            // but with the shorter length st[p].len + 1.
            const int clone = sa_new_state(st[p].len + 1, st[q].link);
            st[clone].next = st[q].next;
            for (; p != -1 && st[p].next[c] == q; p = st[p].link)
                st[p].next[c] = clone;
            st[q].link = clone;
            st[cur].link = clone;
        }
    }
    last = cur;
}

// Returns a longest common substring of `s` and `t`, compared byte by byte.
// The automaton is built from `s`, then `t` is streamed through it. When
// several substrings share the maximum length, the one that ends first in `t`
// is returned. Returns an empty string when nothing is shared, including when
// either input is empty.
std::string lcs(std::string s, std::string t)
{
    sa_init();
    // An automaton over n bytes has fewer than 2n + 1 states; reserving up
    // front avoids copying states while the storage grows.
    st.reserve(2 * s.size() + 1);
    for (std::string::size_type i = 0; i < s.size(); ++i)
        sa_extend(s[i]);

    int v = 0;                           // current automaton state
    int l = 0;                           // length of the current match
    int best = 0;                        // longest match seen so far
    std::string::size_type bestpos = 0;  // index in t of the last byte of that match
    for (std::string::size_type i = 0; i < t.size(); ++i)
    {
        const char c = t[i];
        // Shorten the current match until it can be extended by c
        // (or until we are back at the initial state).
        while (v != 0 && st[v].next.count(c) == 0)
        {
            v = st[v].link;
            l = st[v].len;
        }
        const std::map<char, int>::iterator hit = st[v].next.find(c);
        if (hit != st[v].next.end())
        {
            v = hit->second;
            ++l;
        }
        if (l > best)
        {
            best = l;
            bestpos = i;
        }
    }

    // Without this guard an empty `t` would call substr(1, 0) on an empty
    // string, which throws std::out_of_range.
    if (best == 0)
        return std::string();

    const std::string::size_type n = static_cast<std::string::size_type>(best);
    return t.substr(bestpos + 1 - n, n);
}

// Python entry point: sam.lcs(s, t) -> str.
// Parses two str arguments (received as UTF-8 encoded C strings), runs lcs()
// on their bytes and returns the result as a Python str. Returns NULL with a
// Python exception set when argument parsing fails or when the C++ code throws.
static PyObject *sam_lcs(PyObject *self, PyObject *args)
{
    (void)self;  // module-level function: the module object is not needed

    const char *s_utf8 = NULL;
    const char *t_utf8 = NULL;
    if (!PyArg_ParseTuple(args, "ss", &s_utf8, &t_utf8))
        return NULL;

    // A C++ exception must never propagate into the interpreter's C frames,
    // so everything that can throw is translated into a Python exception.
    try
    {
        const std::string result = lcs(s_utf8, t_utf8);
        // The match is found on bytes, so it may begin or end in the middle of
        // a multi-byte UTF-8 character. "ignore" drops such partial characters
        // at the edges instead of failing with UnicodeDecodeError.
        return PyUnicode_DecodeUTF8(result.data(),
                                    static_cast<Py_ssize_t>(result.size()),
                                    "ignore");
    }
    catch (const std::bad_alloc &)
    {
        return PyErr_NoMemory();
    }
    catch (const std::exception &e)
    {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        return NULL;
    }
}

// Method table of the module: a single function "lcs" implemented by
// sam_lcs, followed by the all-zero entry that terminates the table.
static PyMethodDef sam_lcs_Methods[] = {
    {
        "lcs",         // name visible from Python
        sam_lcs,       // C++ implementation
        METH_VARARGS,  // arguments arrive as a tuple
        "Get a longest common string of two strings with SAM",
    },
    {NULL, NULL, 0, NULL},
};

static struct PyModuleDef sam = {
    PyModuleDef_HEAD_INIT,
    "sam",
    "SAM",
    -1,
    sam_lcs_Methods};

// Module initialization function for the "sam" module: creates the module
// object from the `sam` definition and hands back whatever PyModule_Create
// returns.
PyMODINIT_FUNC PyInit_sam(void)
{
    PyObject *module = PyModule_Create(&sam);
    return module;
}

 

编译安装完成后,就可以在Python里调用了

 

posted @ 2019-03-08 16:53  谢耳朵的派森笔记  阅读(392)  评论(0)  编辑  收藏  举报