【字符串】字符串多项式哈希

根据不同的需求要改的地方比较多。注意:如果不需要防hack可以关闭anti_hack;如果有多个字符串共享同一个哈希算法(需要互相比较哈希值),要让它们使用同一组经过anti_hack打乱后的 b、p、m,否则不同实例算出的哈希值无法互相比较。

哈希值为64位有符号整数

进行2次哈希,输出为long long的版本。

基本上只维护这个版本了,不太想维护其他版本的。

字符串的多项式哈希的公式

\[hashcode = \sum\limits_{i=1}^{n} (s[i] + b)\cdot p^{n - i} \mod m \]

这三个数字的意义:

  1. \(b\): 如果 s[i] 值域为 \([0, 10^6]\) 的非负整数,那么如果 \(b\) 取 \(0\),不同长度的 \([0], [0,0], [0,0,0]\) 哈希之后得到的结果是一样的,永远都是 \([0]\)。与之相同,虽然 \(b\) 的取值为小的正整数,但仍然可能存在值域为负数的 \(s[i]\) 使得 \(s[i] + b\) 恰好都为 \(0\),这时候无法区分不同长度的 \([s[i]], [s[i],s[i]], [s[i],s[i],s[i]]\) 。可以通过选取不同的两个 \(b\) 进行两次哈希来彻底避免这种问题,这也是下文为什么要选择两个小的正整数 \(b\) 。通常,对于常见的字符串为小写英文字符的题目来说,常见取的 \(b\) 为 b = 0 (不做处理)或者 b = - 'a' + 1 (让'a'表示第1个字符,'b'表示第2个字符,'z'表示第26个字符)
  2. \(p\): \(p\) 要选取一个质数,并且这个 \(p\) 的取值范围要大于最大的 \(s[i] + b\) ,试想一种常见的错误实现,比如 b = - 'a' + 1, p = 7 虽然 \(p\) 是质数,但是无法区分下列的字符串:
h    // hashcode = 'h' - 'a' + 1 = 8
aa   // hashcode = ('a' - 'a' + 1) * 7 + ('a' - 'a' + 1) = 8

如果p选择合数会发生什么,这个我不太会分析。对于随机生成的长度为50,值域为小写a到小写z的“无重复”字符串集合(集合大小为t=50000),在b = -'a' + 1且m=1e9+7的前提下。对于不同的p的取值,碰撞的次数如下:
d为p的因子个数,d=2相当于p为质数,cnt表示有多少个这样的p,sumsame表示这些p发生的碰撞次数之和,P = sumsame / cnt 表示平均每个p发生的碰撞次数。

p的取值为30~80。

d = 2, cnt = 12, sumsame = 7, P = 0.583333333333
d = 3, cnt = 1, sumsame = 3, P = 3.000000000000
d = 4, cnt = 15, sumsame = 25, P = 1.666666666667
d = 6, cnt = 9, sumsame = 7, P = 0.777777777778
d = 7, cnt = 1, sumsame = 0, P = 0.000000000000
d = 8, cnt = 8, sumsame = 13, P = 1.625000000000
d = 9, cnt = 1, sumsame = 1, P = 1.000000000000
d = 10, cnt = 2, sumsame = 5, P = 2.500000000000
d = 12, cnt = 2, sumsame = 1, P = 0.500000000000

除去cnt比较少的部分,选取质数达到的碰撞概率总体是比合数低一些,但是看上去不是很明显。关于这一点可能硕士的时候找找论文研究一下为什么吧。

把合数全部合并到d=3里面,大概是这样:
t = 10000
d = 2, cnt = 12, sumsame = 0, P = 0.000000000000
d = 3, cnt = 39, sumsame = 0, P = 0.000000000000
t = 20000
d = 2, cnt = 12, sumsame = 1, P = 0.083333333333
d = 3, cnt = 39, sumsame = 5, P = 0.128205128205
t = 30000
d = 2, cnt = 12, sumsame = 1, P = 0.083333333333
d = 3, cnt = 39, sumsame = 23, P = 0.589743589744
t = 40000
d = 2, cnt = 12, sumsame = 2, P = 0.166666666667
d = 3, cnt = 39, sumsame = 36, P = 0.923076923077
t = 50000
d = 2, cnt = 12, sumsame = 7, P = 0.583333333333
d = 3, cnt = 39, sumsame = 55, P = 1.410256410256

这下可以看出来,在p为合数的时候发生碰撞的期望次数是比p为质数的时候明显高很多的。不过在t进一步增大到1e5之后,无论p是质数还是合数,期望碰撞次数都达到了4-5次,这种情况下哈希已经可以说是不可用的状态了,没有什么比较的意义。

  3. \(m\): \(m\) 通常要选择一个远远超过 \(s[i]\) 值域也远远超过 \(p\) 的大质数,常见的比如 998244353 或者 1e9+7 都是可以的。不过选择小众又随机的质数就不容易在有hack赛制的比赛中挂掉。当 \(p,m\) 都是质数时,某个字符串的哈希值与另一个字符串碰撞的概率大约是 \(\frac{1}{m}\) ,假设有n种不同的字符串,那么他们之间没有任何碰撞的概率如下表:
```plaintext
m = 1000000000
n = 1000, P = 0.999499625062
n = 2000, P = 0.998000999334
n = 5000, P = 0.987575310972
n = 10000, P = 0.951224509803
n = 20000, P = 0.818721474089
n = 50000, P = 0.286491665363
n = 100000, P = 0.006736487196
n = 200000, P = 0.000000002058
n = 500000, P = 0.000000000000
n = 1000000, P = 0.000000000000
```

可以看到其实在n=2e4开始,哈希碰撞已经是不能忽视的问题了。n=2e5开始几乎一定会发生至少1次碰撞(不碰撞的概率只有1e-8)。虽然碰撞个几次不一定会影响最终答案。
其实,使得碰撞至少发生一次的概率超过50%的n,大概是根号m这个量级(生日悖论)。

而常见的题目中, \(m = 1e9, len(s) = 1e6\) ,这种情况可以说是一定会发生碰撞了,而且碰撞的次数已经不能忽视。

而两次哈希的碰撞概率如下表

m = 1000000000000000000.000000000000
n = 1000, P = 0.999999999999
n = 2000, P = 0.999999999998
n = 5000, P = 0.999999999987
n = 10000, P = 0.999999999950
n = 20000, P = 0.999999999800
n = 50000, P = 0.999999998750
n = 100000, P = 0.999999995000
n = 200000, P = 0.999999980000
n = 500000, P = 0.999999875000
n = 1000000, P = 0.999999500000
n = 2000000, P = 0.999998000001
n = 5000000, P = 0.999987500076
n = 10000000, P = 0.999950001245
n = 20000000, P = 0.999800019989

可以看到哪怕范围到2e7,无碰撞概率还是高达99.98%,几乎不可能会发生问题(而且哪怕真的运气不好发生了0.02%的小概率事件,这个碰撞也不一定会影响答案)。

注释掉的rev开头的字段为回文哈希,使用方法主要是rev_same判断两个子串是不是把其中一个反转之后与另一个相等。

/**
 * Single-modulus polynomial rolling hash over a 1-indexed C string.
 *
 * With the constants installed by set_const(), init() builds
 *   h[i]    = (h[i - 1] * p + (s[i] + b)) mod m,   h[0] = 0
 *   plen[i] = p^i mod m
 * Every stored value is kept inside [0, m), which is the precondition of the
 * add_mod / sub_mod / mul_mod helpers below.
 */
struct BaseHash {

    int b, p, m;                  // additive offset, base, modulus
    std::vector<int> h, plen;     // prefix hashes and powers of p
    std::vector<int> rev_h;       // reverse-direction hashes; only filled if enabled in init()

    // (x + y) mod m. Both operands must already lie in [0, m).
    inline int add_mod (const int &x, const int &y, const int &m) const {
        return x + y >= m ? x + y - m : x + y;
    }

    // (x - y) mod m. Both operands must already lie in [0, m).
    inline int sub_mod (const int &x, const int &y, const int &m) const {
        return x - y < 0 ? x - y + m : x - y;
    }

    // (x * y) mod m for non-negative operands; the product is formed in 64 bits.
    inline int mul_mod (const int &x, const int &y, const int &m)  const {
        ll res = 1LL * x * y;
        if (res >= m) {
            // Only pay for the division when a reduction is actually needed.
            res %= m;
        }
        return static_cast<int> (res);
    }

    // Reduces an arbitrary (possibly negative) value into [0, m).
    inline int norm_mod (const int &x, const int &m) const {
        const int r = x % m;      // in C++ the remainder keeps the sign of x
        return r < 0 ? r + m : r;
    }

    // Installs the offset b, the base p and the modulus m. Call before init().
    void set_const (int b, int p, int m) {
        this -> b = b, this -> p = p, this -> m = m;
    }

    /**
     * Builds the prefix-hash and power tables for the text s[1..len], where
     * len = strlen(s + 1); s[0] is ignored.
     *
     * The per-character term s[i] + b is reduced into [0, m) first. Without
     * that step a negative term (a signed char >= 0x80, or a negative b)
     * violates the helpers' precondition and substring hashes stop being
     * position independent.
     */
    void init (const char* s) {
        const int len = static_cast<int> (std::strlen (s + 1));
        h.resize (len + 2), plen.resize (len + 2);
        h[0] = 0, plen[0] = 1;
        for (int i = 1; i <= len; ++i) {
            h[i] = add_mod (mul_mod (h[i - 1], p, m), norm_mod (s[i] + b, m), m);
            plen[i] = mul_mod (plen[i - 1], p, m);
        }
        // To use the palindrome (reverse) hash, enable the lines below;
        // nothing else needs to change.
//        rev_h.resize (len + 2);
//        rev_h[len + 1] = 0;
//        for (int i = len; i >= 1; --i) {
//            rev_h[i] = add_mod (mul_mod (rev_h[i + 1], p, m), norm_mod (s[i] + b, m), m);
//        }
    }

    /**
     * Hash of the substring s[l..r] (1-indexed, inclusive), in [0, m).
     * Returns 0 for an empty range (l > r). Requires 1 <= l and r <= len.
     */
    int code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        int res = sub_mod (h[r], mul_mod (h[l - 1], plen[r - l + 1], m), m);
//        printf ("[l, r] = [%2d, %2d], hashcode = %8X\n", l, r, res);
        return res;
    }

    /**
     * Hash of s[l..r] read from right to left, in [0, m).
     * Returns 0 for an empty range. Reads rev_h, so it is only valid when the
     * reverse table has been enabled in init().
     */
    int rev_code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        int res = sub_mod (rev_h[l], mul_mod (rev_h[r + 1], plen[r - l + 1], m), m);
//        printf ("[l, r] = [%2d, %2d], rev_hashcode = %8X\n", l, r, res);
        return res;
    }

};

/**
 * Multi-modulus hash: runs CHECK_TIMES independent BaseHash instances over
 * the same 1-indexed C string and packs their results into one 64-bit value.
 *
 * The (b, p, m) triples come from the candidate tables below; anti_hack()
 * randomises which candidates end up in the first CHECK_TIMES slots.
 */
struct MultiHash {

    static const int MAX_CHECK_TIMES = 8;
    // Candidate additive offsets.
    int b[MAX_CHECK_TIMES] = {
        11, 13, 17, 19,
        23, 29, 31, 37
    };
    // Candidate bases.
    int p[MAX_CHECK_TIMES] = {
        307, 311, 313, 317,
        331, 337, 347, 349
    };
    // Candidate moduli.
    int m[MAX_CHECK_TIMES] = {
        998244353, 998244389, 998244391, 998244397,
        998244407, 998244431, 998244433, 998244473
    };

    static const int CHECK_TIMES = 2;
    BaseHash bh[CHECK_TIMES];

    /**
     * Randomises the order of the candidate tables.
     *
     * The shuffle runs once per program and its result is shared by every
     * MultiHash instance. Previously only the first instance to call init()
     * was shuffled while later ones kept the default order, so hashes
     * produced by different instances could not be compared.
     */
    void anti_hack() {
        static bool shuffled = false;
        static int shared_b[MAX_CHECK_TIMES];
        static int shared_p[MAX_CHECK_TIMES];
        static int shared_m[MAX_CHECK_TIMES];
        if (!shuffled) {
            shuffled = true;
            std::copy (b, b + MAX_CHECK_TIMES, shared_b);
            std::copy (p, p + MAX_CHECK_TIMES, shared_p);
            std::copy (m, m + MAX_CHECK_TIMES, shared_m);
            // Seed from the wall clock; only the low 32 bits are used.
            const long long ticks = std::chrono::system_clock::now().time_since_epoch().count();
            std::mt19937 rnd (static_cast<unsigned int> (ticks));
            std::shuffle (shared_b, shared_b + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_p, shared_p + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_m, shared_m + MAX_CHECK_TIMES, rnd);
        }
        std::copy (shared_b, shared_b + MAX_CHECK_TIMES, b);
        std::copy (shared_p, shared_p + MAX_CHECK_TIMES, p);
        std::copy (shared_m, shared_m + MAX_CHECK_TIMES, m);
    }

    // Builds all component hashes for the 1-indexed C string s (s[0] is ignored).
    void init (char *s) {
        anti_hack();
        for (int i = 0; i < CHECK_TIMES; ++i) {
            bh[i].set_const (b[i], p[i], m[i]);
            bh[i].init (s);
        }
    }

    /**
     * Packed hash of s[l..r]: each component is shifted in as a 32-bit field,
     * so with CHECK_TIMES == 2 the first component lands in the high half.
     */
    ll code (const int &l, const int &r) const {
        ll res = 0;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res = (res << 32) ^ (bh[i].code (l, r));
        }
//        printf ("[l, r] = [%2d, %2d], hashcode = %8X\n", l, r, res);
        return res;
    }

    /**
     * Packed hash of s[l..r] read right to left. Delegates to
     * BaseHash::rev_code, so the reverse table must be enabled there.
     */
    ll rev_code (const int &l, const int &r) const {
        ll res = 0;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res = (res << 32) ^ (bh[i].rev_code (l, r));
        }
//        printf ("[l, r] = [%2d, %2d], rev_hashcode = %8X\n", l, r, res);
        return res;
    }

    // True when s[l1..r1] and s[l2..r2] agree under every component hash.
    bool same (int l1, int r1, int l2, int r2) const {
        for (int i = 0; i < CHECK_TIMES; ++i) {
            if (bh[i].code (l1, r1) != bh[i].code (l2, r2)) {
                return false;
            }
        }
        return true;
    }

    // True when s[l1..r1] equals the reversal of s[l2..r2] under every component hash.
    bool rev_same (int l1, int r1, int l2, int r2) const {
        for (int i = 0; i < CHECK_TIMES; ++i) {
            if (bh[i].code (l1, r1) != bh[i].rev_code (l2, r2)) {
                return false;
            }
        }
        return true;
    }

} mh;

省一半空间的版本

不储存质数p的各次幂(plen数组),作为代价,每次查询子串的哈希值都要用快速幂现算,时间复杂度变为O(log n)(不推荐)

/**
 * Space-saving polynomial rolling hash over a 1-indexed C string.
 *
 * Only the prefix hashes are stored; powers of p are recomputed on demand by
 * calc_plen(), so each substring query costs O(log length) multiplications.
 */
struct BaseHash {

    int b, p, m;          // additive offset, base, modulus
    std::vector<int> h;   // prefix hashes, each kept in [0, m)

    // Installs the offset b, the base p and the modulus m. Call before init().
    void set_const (int b, int p, int m) {
        this -> b = b, this -> p = p, this -> m = m;
    }

    /**
     * Builds h[0..len] for the text s[1..len], len = strlen(s + 1).
     *
     * The term s[i] + b is reduced into [0, m) before use. A negative term
     * (signed char >= 0x80 or a negative b) could otherwise make a prefix
     * hash negative, after which code() may return a value outside [0, m).
     */
    void init (const char* s) {
        const int len = static_cast<int> (std::strlen (s + 1));
        h.resize (len + 2);
        h[0] = 0;
        for (int i = 1; i <= len; ++i) {
            int term = (s[i] + b) % m;   // remainder keeps the sign of the dividend
            if (term < 0) {
                term += m;
            }
            h[i] = static_cast<int> ((1LL * h[i - 1] * p + term) % m);
        }
    }

    // p^len mod m by binary exponentiation; returns 1 for len <= 0.
    ll calc_plen (int len) const {
        ll res = 1;
        // len > 0 (rather than len != 0) keeps a negative argument from looping forever.
        for (ll tp = p; len > 0; tp = tp * tp % m, len >>= 1) {
            if (len & 1) {
                res = res * tp % m;
            }
        }
        return res;
    }

    /**
     * Hash of s[l..r] (1-indexed, inclusive), in [0, m).
     * Returns 0 for an empty range (l > r). Requires 1 <= l and r <= len.
     */
    ll code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        ll res = (h[r] - h[l - 1] * calc_plen (r - l + 1) % m + m) % m;
//        printf ("[l, r] = [%d, %d], hashcode = %lld\n", l, r, res);
        return res;
    }

};

一次性使用的版本,也就是最基础的字符串哈希

非常不推荐使用,会碰撞到头皮发麻

/**
 * One-shot polynomial hash with fixed constants: hashes an entire
 * 1-indexed C string in a single pass and keeps no tables.
 */
struct BaseHash {

    int b = 11, p = 113, m = 1000000021;   // additive offset, base, modulus
    std::vector<int> h;                    // not used by init()

    /**
     * Folds s[1], s[2], ... up to the terminating '\0' into
     *   acc = (acc * p + (s[i] + b)) % m
     * and returns the final accumulator. s[0] is ignored.
     */
    int init (char* s) {
        int acc = 0;
        for (const char* cur = s + 1; *cur != '\0'; ++cur) {
            const long long next = 1LL * acc * p + (*cur + b);
            acc = next % m;
        }
        return acc;
    }

} bh;

哈希值为32位整数的列表

最多支持4次哈希的重写的版本:

/**
 * Polynomial rolling hash over a 1-indexed C string, backed by fixed-size
 * arrays. Each character contributes s[i] - 'a' + b.
 *
 * Capacity: init() writes h[0..len] and plen[0..len], so len must be < LEN.
 */
struct BaseHash {

    static const int LEN = 200000 + 10;
    int b, p, m;                // additive offset, base, modulus
    int h[LEN], plen[LEN];      // prefix hashes and powers of p, each in [0, m)

    // Installs the offset b, the base p and the modulus m. Call before init().
    void set_const (int b, int p, int m) {
        this -> b = b, this -> p = p, this -> m = m;
    }

    /**
     * Builds the tables for s[1..len]; s[0] is ignored.
     *
     * s[i] - 'a' + b is negative for characters sufficiently far below 'a'
     * (digits, uppercase letters, ...). The term is reduced into [0, m) so
     * prefix hashes never go negative and code() always returns the
     * canonical residue.
     */
    void init (const char* s, int len) {
        h[0] = 0, plen[0] = 1;
        for (int i = 1; i <= len; ++i) {
            int term = (s[i] - 'a' + b) % m;   // remainder keeps the sign of the dividend
            if (term < 0) {
                term += m;
            }
            h[i] = static_cast<int> ((1LL * h[i - 1] * p + term) % m);
            plen[i] = static_cast<int> ((1LL * plen[i - 1] * p) % m);
        }
    }

    // Alternative to the plen table: compute p^len mod m on demand.
//    ll calc_plen (int len) {
//        ll res = 1, tp = p;
//        for (ll tp = p; len; tp = tp * tp % m, len >>= 1) {
//            if (len & 1) {
//                res = res * tp % m;
//            }
//        }
//        return res;
//    }

    /**
     * Hash of s[l..r] (1-indexed, inclusive), in [0, m).
     * Returns 0 for an empty range (l > r). Requires 1 <= l and r <= len.
     */
    ll code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        int res = (h[r] - 1LL * h[l - 1] * plen[r - l + 1] % m + m) % m;
//        ll res = (h[r] - h[l - 1] * calc_plen (r - l + 1) % m + m) % m;
//        D3 (l, r, res);
        return res;
    }

};

/**
 * Multi-modulus hash whose result is a small struct of 32-bit components
 * (one per BaseHash) instead of a packed integer.
 *
 * The (b, p, m) triples come from the candidate tables below; anti_hack()
 * randomises which candidates end up in the first CHECK_TIMES slots.
 */
struct MultiHash {

    static const int MAX_CHECK_TIMES = 4;
    int b[MAX_CHECK_TIMES] = {11, 13, 17, 19};          // candidate additive offsets
    int p[MAX_CHECK_TIMES] = {113, 127, 131, 137};      // candidate bases
    int m[MAX_CHECK_TIMES] = { (int) 1e9 + 21, (int) 1e9 + 33, (int) 1e9 + 87, (int) 1e9 + 93};  // candidate moduli

    static const int CHECK_TIMES = 2;
    BaseHash bh[CHECK_TIMES];

    // Hash value: one component per BaseHash, comparable and orderable.
    struct Code {
        int c[CHECK_TIMES];
        // Lexicographic order over the components, so Code can key ordered containers.
        bool operator< (const Code& t) const {
            for (int i = 0; i < CHECK_TIMES; ++i) {
                if (c[i] != t.c[i]) {
                    return c[i] < t.c[i];
                }
            }
            return false;
        }
        // Equal only when every component matches.
        bool operator== (const Code& t) const {
            for (int i = 0; i < CHECK_TIMES; ++i) {
                if (c[i] != t.c[i]) {
                    return false;
                }
            }
            return true;
        }
    };

    /**
     * Randomises the order of the candidate tables.
     *
     * The shuffle runs once per program and its result is shared by every
     * MultiHash instance. Previously only the first instance to call init()
     * was shuffled while later ones kept the default order, so hashes
     * produced by different instances could not be compared.
     */
    void anti_hack() {
        static bool shuffled = false;
        static int shared_b[MAX_CHECK_TIMES];
        static int shared_p[MAX_CHECK_TIMES];
        static int shared_m[MAX_CHECK_TIMES];
        if (!shuffled) {
            shuffled = true;
            std::copy (b, b + MAX_CHECK_TIMES, shared_b);
            std::copy (p, p + MAX_CHECK_TIMES, shared_p);
            std::copy (m, m + MAX_CHECK_TIMES, shared_m);
            // Seed from the wall clock; only the low 32 bits are used.
            const long long ticks = std::chrono::system_clock::now().time_since_epoch().count();
            std::mt19937 rnd (static_cast<unsigned int> (ticks));
            std::shuffle (shared_b, shared_b + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_p, shared_p + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_m, shared_m + MAX_CHECK_TIMES, rnd);
        }
        std::copy (shared_b, shared_b + MAX_CHECK_TIMES, b);
        std::copy (shared_p, shared_p + MAX_CHECK_TIMES, p);
        std::copy (shared_m, shared_m + MAX_CHECK_TIMES, m);
    }

    // Builds all component hashes for s[1..len] (s[0] is ignored).
    void init (char *s, int len) {
        anti_hack();
        for (int i = 0; i < CHECK_TIMES; ++i) {
            bh[i].set_const (b[i], p[i], m[i]);
            bh[i].init (s, len);
        }
    }

    // Component-wise hash of s[l..r] (1-indexed, inclusive).
    Code code (const int &l, const int &r) const {
        Code res;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res.c[i] = bh[i].code (l, r);
        }
//        printf ("[l, r] = [%d, %d], hashcode = [", l, r);
//        for (int i = 0; i < CHECK_TIMES; ++i) {
//            printf ("%d%s", res.c[i], i != CHECK_TIMES - 1 ? ", " : "]\n") ;
//        }
        return res;
    }

    // True when s[l1..r1] and s[l2..r2] have identical component hashes.
    bool same (int l1, int r1, int l2, int r2) const {
        return code (l1, r1) == code (l2, r2);
    }

} mh;

使用方法:读入n和s,s从1开始计数,n为s的长度。
mh.init(s, n)
如果需要缓存某个子串的哈希值,auto f = mh.code(l, r)

字符串类型为std::string

字符串类型为std::string,下标从0开始,专门给Leetcode用的版本

#ifndef ll
#define ll long long
#endif

/**
 * Polynomial rolling hash over a std::string with 0-indexed queries.
 * Each character contributes s[i] - 'a' + b.
 *
 * Internally h[i] is the hash of the first i characters and
 * plen[i] = p^i mod m; both are kept in [0, m).
 */
struct BaseHash {

    int b, p, m;                  // additive offset, base, modulus
    std::vector<int> h, plen;     // prefix hashes and powers of p

    // Installs the offset b, the base p and the modulus m. Call before init().
    void set_const (int b, int p, int m) {
        this -> b = b, this -> p = p, this -> m = m;
    }

    /**
     * Builds the tables for the whole string s.
     *
     * s[i] - 'a' + b is negative for characters sufficiently far below 'a'
     * (digits, uppercase letters, ...). The term is reduced into [0, m) so
     * prefix hashes never go negative and code() always returns the
     * canonical residue.
     */
    void init (const std::string &s) {
        const int len = static_cast<int> (s.length());
        h.resize (len + 2), plen.resize (len + 2);
        h[0] = 0, plen[0] = 1;
        for (int i = 1; i <= len; ++i) {
            int term = (s[i - 1] - 'a' + b) % m;   // remainder keeps the sign of the dividend
            if (term < 0) {
                term += m;
            }
            h[i] = static_cast<int> ((1LL * h[i - 1] * p + term) % m);
            plen[i] = static_cast<int> ((1LL * plen[i - 1] * p) % m);
        }
    }

    /**
     * Hash of s[l..r] (0-indexed, inclusive), in [0, m).
     * Returns 0 for an empty range (l > r). Requires 0 <= l and r < s.length().
     */
    ll code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        int res = (h[r + 1] - 1LL * h[l] * plen[r - l + 1] % m + m) % m;
//        D3 (l, r, res);
        return res;
    }

};

/**
 * Multi-modulus hash over a std::string with 0-indexed queries; the
 * component hashes are packed into one 64-bit value.
 *
 * The (b, p, m) triples come from the candidate tables below; anti_hack()
 * randomises which candidates end up in the first CHECK_TIMES slots.
 */
struct MultiHash {

    static const int MAX_CHECK_TIMES = 4;
    int b[MAX_CHECK_TIMES] = {11, 13, 17, 19};          // candidate additive offsets
    int p[MAX_CHECK_TIMES] = {113, 127, 131, 137};      // candidate bases
    int m[MAX_CHECK_TIMES] = { (int) 1e9 + 21, (int) 1e9 + 33, (int) 1e9 + 87, (int) 1e9 + 93};  // candidate moduli

    static const int CHECK_TIMES = 2;
    BaseHash bh[CHECK_TIMES];

    /**
     * Randomises the order of the candidate tables.
     *
     * The shuffle runs once per program and its result is shared by every
     * MultiHash instance. Previously only the first instance to call init()
     * was shuffled while later ones kept the default order, so hashes
     * produced by different instances could not be compared.
     */
    void anti_hack() {
        static bool shuffled = false;
        static int shared_b[MAX_CHECK_TIMES];
        static int shared_p[MAX_CHECK_TIMES];
        static int shared_m[MAX_CHECK_TIMES];
        if (!shuffled) {
            shuffled = true;
            std::copy (b, b + MAX_CHECK_TIMES, shared_b);
            std::copy (p, p + MAX_CHECK_TIMES, shared_p);
            std::copy (m, m + MAX_CHECK_TIMES, shared_m);
            // Seed from the wall clock; only the low 32 bits are used.
            const long long ticks = std::chrono::system_clock::now().time_since_epoch().count();
            std::mt19937 rnd (static_cast<unsigned int> (ticks));
            std::shuffle (shared_b, shared_b + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_p, shared_p + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_m, shared_m + MAX_CHECK_TIMES, rnd);
        }
        std::copy (shared_b, shared_b + MAX_CHECK_TIMES, b);
        std::copy (shared_p, shared_p + MAX_CHECK_TIMES, p);
        std::copy (shared_m, shared_m + MAX_CHECK_TIMES, m);
    }

    // Builds all component hashes for the string s.
    void init (const string &s) {
        anti_hack();
        for (int i = 0; i < CHECK_TIMES; ++i) {
            bh[i].set_const (b[i], p[i], m[i]);
            bh[i].init (s);
        }
    }

    /**
     * Packed hash of s[l..r] (0-indexed, inclusive): each component is
     * shifted in as a 32-bit field, so with CHECK_TIMES == 2 the first
     * component lands in the high half.
     */
    ll code (const int &l, const int &r) const {
        ll res = 0;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res = (res << 32) ^ (bh[i].code (l, r));
        }
//        printf ("[l, r] = [%d, %d], hashcode = [%lld]\n", l, r, res);
        return res;
    }

    // True when s[l1..r1] and s[l2..r2] have the same packed hash.
    bool same (int l1, int r1, int l2, int r2) const {
        return code (l1, r1) == code (l2, r2);
    }

} mh;

一些特殊版本的字符串哈希

线段树维护的版本

要实现将两个字符串前后拼在一起并且计算出他们新的哈希值的方法

对于每个线段树的节点,存储其哈希值,然后向上合并时让左串的哈希值乘以plen[右串长度] + 右串的哈希值即可

TODO 应该用处不是很大,到时候再改算了

回文串的版本

存储相反的子串的哈希值,记为rev_h,生成的哈希值为rev_code,就是多了几句。大体上都是一模一样的。rev_same(a,b,x,y)表示子串[a,b]和反转之后的子串[x,y]是相等的(注意a,b,x,y都是指在原串中的下标)

所以判断某个字符串[l,r]是否是回文串就是rev_same(l,r,l,r)

由于回文子串的长度有区分奇数和偶数的单调性,所以也可以用二分法通过manacher的模板。

这里的乘法等应该复制上文中的判断+取模。先不写了。

/**
 * Polynomial rolling hash over a 1-indexed C string with forward and
 * reverse tables, for palindrome-style queries.
 *
 *   h[i]     = hash of s[1..i]
 *   rev_h[i] = hash of s[len], s[len - 1], ..., s[i]
 *   plen[i]  = p^i mod m
 * Every stored value is kept in [0, m).
 */
struct BaseHash {

    int b, p, m;                  // additive offset, base, modulus
    std::vector<int> h, plen;     // forward prefix hashes and powers of p
    std::vector<int> rev_h;       // reverse-direction hashes

    // Installs the offset b, the base p and the modulus m. Call before init().
    void set_const (int b, int p, int m) {
        this -> b = b, this -> p = p, this -> m = m;
    }

    // Reduces s-character + b into [0, m); the raw sum is negative for
    // signed chars >= 0x80 or for a negative b.
    int term_of (char ch) const {
        const int r = (ch + b) % m;   // remainder keeps the sign of the dividend
        return r < 0 ? r + m : r;
    }

    /**
     * Builds all three tables for s[1..len], len = strlen(s + 1); s[0] is
     * ignored. Terms are normalised so that no prefix hash goes negative and
     * code() / rev_code() always return the canonical residue.
     */
    void init (const char* s) {
        const int len = static_cast<int> (std::strlen (s + 1));
        h.resize (len + 2), plen.resize (len + 2);
        h[0] = 0, plen[0] = 1;
        for (int i = 1; i <= len; ++i) {
            h[i] = static_cast<int> ((1LL * h[i - 1] * p + term_of (s[i])) % m);
            plen[i] = static_cast<int> ((1LL * plen[i - 1] * p) % m);
        }
        rev_h.resize (len + 2);
        rev_h[len + 1] = 0;
        for (int i = len; i >= 1; --i) {
            rev_h[i] = static_cast<int> ((1LL * rev_h[i + 1] * p + term_of (s[i])) % m);
        }
    }

    /**
     * Hash of s[l..r] (1-indexed, inclusive) read left to right, in [0, m).
     * Returns 0 for an empty range (l > r). Requires 1 <= l and r <= len.
     */
    ll code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        ll res = (h[r] - 1LL * h[l - 1] * plen[r - l + 1] % m + m) % m;
//        printf ("[l, r] = [%2d, %2d], hashcode = %8X\n", l, r, res);
        return res;
    }

    /**
     * Hash of s[l..r] read right to left, in [0, m); equals code() of the
     * reversed substring. Returns 0 for an empty range.
     */
    ll rev_code (const int &l, const int &r) const {
        if (l > r) {
            return 0;
        }
        ll res = (rev_h[l] - 1LL * rev_h[r + 1] * plen[r - l + 1] % m + m) % m;
//        printf ("[l, r] = [%2d, %2d], rev_hashcode = %8X\n", l, r, res);
        return res;
    }

};

/**
 * Multi-modulus hash with forward and reverse queries over a 1-indexed
 * C string; component hashes are packed into one 64-bit value.
 *
 * The (b, p, m) triples come from the candidate tables below; anti_hack()
 * randomises which candidates end up in the first CHECK_TIMES slots.
 */
struct MultiHash {

    static const int MAX_CHECK_TIMES = 8;
    // Candidate additive offsets.
    int b[MAX_CHECK_TIMES] = {
        11, 13, 17, 19,
        23, 29, 31, 37
    };
    // Candidate bases.
    int p[MAX_CHECK_TIMES] = {
        113, 127, 131, 137,
        139, 149, 151, 157
    };
    // Candidate moduli.
    int m[MAX_CHECK_TIMES] = {
        998244353, 998244389, 998244391, 998244397,
        998244407, 998244431, 998244433, 998244473
    };

    static const int CHECK_TIMES = 2;
    BaseHash bh[CHECK_TIMES];

    /**
     * Randomises the order of the candidate tables.
     *
     * The shuffle runs once per program and its result is shared by every
     * MultiHash instance. Previously only the first instance to call init()
     * was shuffled while later ones kept the default order, so hashes
     * produced by different instances could not be compared.
     */
    void anti_hack() {
        static bool shuffled = false;
        static int shared_b[MAX_CHECK_TIMES];
        static int shared_p[MAX_CHECK_TIMES];
        static int shared_m[MAX_CHECK_TIMES];
        if (!shuffled) {
            shuffled = true;
            std::copy (b, b + MAX_CHECK_TIMES, shared_b);
            std::copy (p, p + MAX_CHECK_TIMES, shared_p);
            std::copy (m, m + MAX_CHECK_TIMES, shared_m);
            // Seed from the wall clock; only the low 32 bits are used.
            const long long ticks = std::chrono::system_clock::now().time_since_epoch().count();
            std::mt19937 rnd (static_cast<unsigned int> (ticks));
            std::shuffle (shared_b, shared_b + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_p, shared_p + MAX_CHECK_TIMES, rnd);
            std::shuffle (shared_m, shared_m + MAX_CHECK_TIMES, rnd);
        }
        std::copy (shared_b, shared_b + MAX_CHECK_TIMES, b);
        std::copy (shared_p, shared_p + MAX_CHECK_TIMES, p);
        std::copy (shared_m, shared_m + MAX_CHECK_TIMES, m);
    }

    // Builds all component hashes for the 1-indexed C string s (s[0] is ignored).
    void init (char *s) {
        anti_hack();
        for (int i = 0; i < CHECK_TIMES; ++i) {
            bh[i].set_const (b[i], p[i], m[i]);
            bh[i].init (s);
        }
    }

    /**
     * Packed hash of s[l..r] read left to right: each component is shifted
     * in as a 32-bit field, so with CHECK_TIMES == 2 the first component
     * lands in the high half.
     */
    ll code (const int &l, const int &r) const {
        ll res = 0;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res = (res << 32) ^ (bh[i].code (l, r));
        }
//        printf ("[l, r] = [%2d, %2d], hashcode = %8X\n", l, r, res);
        return res;
    }

    // Packed hash of s[l..r] read right to left, laid out like code().
    ll rev_code (const int &l, const int &r) const {
        ll res = 0;
        for (int i = 0; i < CHECK_TIMES; ++i) {
            res = (res << 32) ^ (bh[i].rev_code (l, r));
        }
//        printf ("[l, r] = [%2d, %2d], rev_hashcode = %8X\n", l, r, res);
        return res;
    }

    // True when s[l1..r1] and s[l2..r2] have the same packed hash.
    bool same (int l1, int r1, int l2, int r2) const {
        return code (l1, r1) == code (l2, r2);
    }

    // True when s[l1..r1] hashes equal to the reversal of s[l2..r2];
    // rev_same(l, r, l, r) therefore tests whether s[l..r] is a palindrome.
    bool rev_same (int l1, int r1, int l2, int r2) const {
        return code (l1, r1) == rev_code (l2, r2);
    }

} mh;
posted @ 2020-11-30 12:54  purinliang  阅读(437)  评论(0编辑  收藏  举报