FFT在字符串匹配上的应用
多项式乘法:
设多项式:
$$A = a_0 + a_1 * x + a_2 * x^2 + ... + a_n * x^n$$
$$B = b_0 + b_1 * x + b_2 * x^2 + ... + b_n * x^n$$
则$C = A * B$的第$p$项($C$的项数为$n+m$)可以表示为:$$C_p = \sum_{i = 1}^{p}(a_i * b_{p - i})x^p$$
通过$FFT$,我们可以将复杂度从$O(n^2)$优化为$O(nlogn)$
FFT与字符串匹配
问题:给定模式串$A$(长度为$m$),文本串$B$(长度为$n$),需要求出所有的位置$p$,满足$B$串从第$p$个字符开始的连续$m$个字符,与$A$串完全相同。
我们定义完全匹配函数$P(x) = \sum_{i = 0}^{m - 1}[A(i) - B(x - m + 1 + i)]$,若$P(x) = 0$则称$B$以第$x$位结束的连续$m$位,与$A$完全匹配。
但是这个完全匹配函数有一些问题,在这种规则下,$ab$与$ba$也是完全匹配的,为了避免这种正负号相加为0的情况,我们给它加上一个平方符号,变成:
$$P(x) = \sum_{i = 0}^{m - 1}[A(i) - B(x - m + 1 + i)]^2$$
这样还是不能转换成多项式乘法,我们再把$A$翻转一下,设翻转后的串为$S$,于是:
$$P(x) = \sum_{i = 0}^{m - 1}[S(m - i - 1) - B(x - m + 1 + i)]^2$$
把式子展开:
$$P(x) = \sum_{i = 0}^{m - 1}S(m - i - 1)^2 + \sum_{i = 0}^{m - 1}B(x - m + 1 + i)^2 - 2 * \sum_{i = 0}^{m - 1}S(m - 1 - i) * B(x - m + 1 + i)$$
第一项是一个定值,直接$O(m)$预处理即可。第二项可以$O(n)$预处理前缀和。第三项中,两个小括号里的东西加起来刚好等于$x$,于是第三项可以写成$-2 * \sum_{i + j = x}S(i) * B(j)$
我们设$T = \sum_{i = 0}^{m - 1}S(m - i - 1)^2$,$f(x) = \sum_{i = 0}^x B(i)^2$,$g(x) = \sum_{i + j = x}S(i) * B(j)$,于是:
$$P(x) = T + f(x) - f(x - m) - 2 * g(x)$$
利用$FFT$计算$g$函数即可,复杂度为$O(nlogn)$
1 void FFT_Match(char *s1, char *s2, int m, int n) 2 { 3 for(int i = 0; i < m; ++i) A[i].r = s1[i] - 'a' + 1; 4 for(int i = 0; i < n; ++i) B[i].r = s2[i] - 'a' + 1; 5 reverse(A, A + m); double T = 0; 6 for(int i = 0; i < m; ++i) T += A[i].r * A[i].r; 7 f[0] = B[0].r * B[0].r; 8 for(int i = 1; i < n; ++i) f[i] = f[i - 1] + B[i].r * B[i].r; 9 FFT(A, len, 1); FFT(B, len, 1); 10 for(int i = 0; i < len; ++i) g[i] = A[i] * B[i]; 11 FFT(g, len, -1); 12 for(int x = m - 1; x < n; ++x) 13 { 14 double P; 15 if(x == m - 1) P = T + f[x] - 2 * g[x].r; 16 else P = T + f[x] - f[x - m] - 2 * g[x].r; 17 if(fabs(P) < EPS) printf("%d ", x - m + 2); 18 } 19 }
通配符匹配
显然用$FFT$进行一般的字符串匹配不是一个明智的选择,无论是从编程复杂度,时间复杂度,空间复杂度,$FFT$都远不如$KMP$优秀,那为什么我们还要学习这种算法呢?
来看一道题:残缺的字符串。遇到这种题目,$KMP$就无能为力了,但是$FFT$仍然能够进行匹配
按照套路,我们先设完全匹配函数,我们令通配符为0,$$P(x) = \sum_{i = 0}^{m - 1}[A(i) - B(x - m + 1 + i)]^2 * A(i) * B(x - m + 1 + i)$$
同样令$S = reverse(A)$,则:$$P(x) = \sum_{i = 0}^{m - 1}[S(m - 1 - i) - B(x - m + 1 + i)]^2 * S(m - 1 - i) * B(x - m + 1 + i)$$
展开算一下:$$\sum_{i = 0}^{m - 1}S(m - 1 - i)^3 * B(x - m + 1 + i)) + \sum_{i = 0}^{m - 1}S(m - 1 - i) * B(x - m + 1 + i))^3 - \sum_{i = 0}^{m - 1}S(m - 1 - i)^2 * B(x - m + 1 + i))^2$$
每一项中的两个小括号加起来都为$x$,可以写成:
$$\sum_{i + j = x}S(i)^3 * B(j) + \sum_{i + j = x}S(i) * B(j)^3 + \sum_{i + j = x}S(i)^2 * B(j)^2$$
$FFT$算卷积即可。注意数组越界问题(最好全部开成300000 * 4),还有最后$IDFT$的时候要记得除以$n$。
1 #include<iostream> 2 #include<cstdio> 3 #include<cmath> 4 #include<vector> 5 #include<algorithm> 6 using namespace std; 7 8 const int MAXN = 300010 << 2; 9 const double Pi = acos(-1); 10 11 struct complex 12 { 13 double x, y; 14 complex(double xx = 0, double yy = 0) {x = xx, y = yy;} 15 }; 16 complex operator + (complex a, complex b) 17 {return complex(a.x + b.x, a.y + b.y);} 18 complex operator - (complex a, complex b) 19 {return complex(a.x - b.x, a.y - b.y);} 20 complex operator * (complex a, complex b) 21 {return complex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);} 22 23 int n, m, r[MAXN]; 24 25 void FFT(complex *f, int op) 26 { 27 for(int i = 0; i < n; ++i) 28 if(i < r[i]) swap(f[i], f[r[i]]); 29 for(int p = 2; p <= n; p <<= 1) 30 { 31 int len = p >> 1; 32 complex tmp(cos(Pi / len), op * sin(Pi / len)); 33 for(int k = 0; k < n; k += p) 34 { 35 complex buf(1, 0); 36 for(int l = k; l < k + len; ++l) 37 { 38 complex t = buf * f[len + l]; 39 f[len + l] = f[l] - t; 40 f[l] = f[l] + t; 41 buf = buf * tmp; 42 } 43 } 44 } 45 if(op == -1) for(int i = 0; i < n; ++i) f[i].x /= n; 46 } 47 48 int M, N, A[MAXN], B[MAXN]; 49 char s1[MAXN], s2[MAXN]; 50 complex a[MAXN], b[MAXN], p[MAXN]; 51 vector<int> ans; 52 53 int main() 54 { 55 scanf("%d %d", &M, &N); 56 scanf("%s %s", s1, s2); reverse(s1, s1 + M); 57 for(int i = 0; i < M; ++i) A[i] = (s1[i] != '*') ? (s1[i] - 'a' + 1) : 0; 58 for(int i = 0; i < N; ++i) B[i] = (s2[i] != '*') ? (s2[i] - 'a' + 1) : 0; 59 60 n = N - 1, m = M - 1; for(m += n, n = 1; n <= m; n <<= 1); 61 for(int i = 0; i < n; ++i) r[i] = (r[i >> 1] >> 1) | ((i & 1) ? n >> 1 : 0); 62 63 for(int i = 0; i < n; ++i) a[i] = (complex){A[i] * A[i] * A[i], 0}, b[i] = (complex){B[i], 0}; 64 FFT(a, 1); FFT(b, 1); 65 for(int i = 0; i < n; ++i) p[i] = p[i] + a[i] * b[i]; 66 67 for(int i = 0; i < n; ++i) a[i] = (complex){A[i], 0}, b[i] = (complex){B[i] * B[i] * B[i], 0}; 68 FFT(a, 1); FFT(b, 1); 69 for(int i = 0; i < n; ++i) p[i] = p[i] + a[i] * b[i]; 70 71 for(int i = 0; i < n; ++i) a[i] = (complex){A[i] * A[i], 0}, b[i] = (complex){B[i] * B[i], 0}; 72 FFT(a, 1); FFT(b, 1); 73 for(int i = 0; i < n; ++i) p[i] = p[i] - a[i] * b[i] * complex(2, 0); 74 75 FFT(p, -1); 76 for(int i = M - 1; i < N; ++i) if(fabs(p[i].x) < 0.5) ans.push_back(i - M + 2); 77 printf("%d\n", ans.size()); 78 for(int i = 0; i < ans.size(); ++i) printf("%d%c", ans[i], i == ans.size() - 1 ? '\n' : ' '); 79 return 0; 80 }