「双串最长公共子串」SP1811 LCS - Longest Common Substring

知识点: SAM,SA,单调栈,Hash

原题面 Luogu 来自 poj 的双倍经验

简述

给定两字符串 \(S_1, S_2\),求它们的最长公共子串长度。
\(|S_1|,|S_2|\le 2.5\times 10^5\)
294ms,1.46GB。

分析

以下将介绍四种效率不同的写法。
图片为按最优解排序后的结果,从上到下依次为 SAM、SA、自己 YY 的 SA 和 Hash。

efficient

Hash

二分答案枚举最长公共子串长度 \(mid\)
Check 时,将一个串所有长度为 \(mid\) 的子串的 hash 值扔到 hash 表里。
枚举另一个串所有长度为 \(mid\) 的子串,检查在 hash 表中是否存在即可。

理论复杂度 \(O(n\log n)\),但取模运算过多,所以常数巨大,实际运行效率是四种方法中最低的。

注意写单 hash 或用 set 都会被卡。

自己 YY 的 SA 做法

只有两个字符串,考虑 SA。
\(S_1\) 加个终止符,\(S_2\) 拼接到 \(S_1\) 后面,跑 SA 求出 \(\operatorname{height}\)
问题变为 求前半段后缀 与 后半段后缀 \(\operatorname{lcp}\) 的最大值。

考虑一个很直观的暴力:

\(i\) 为后半段的后缀,则必有 \(i>|S_1|+1\)
对于一个后缀 \(i>|S_1|+1\),设 \(l_i\) 为后缀排序后 最大\(i\)前半段 的后缀的 排名
即有 \(sa_{l_i}\le |S_1|,\ l+i< rk_i\),且 \(\forall l_i<k\le rk_i, sa_k>|S_1|+1\) 成立。
类似地,设 \(r_i\) 为后缀排序后 最小\(i\)前半段 的后缀的 排名

考虑 \(\operatorname{lcp}\) 的单调性。
对于一个后半段的后缀 \(i>|S_1|+1\),满足 \(\operatorname{lcp}(i,j)\) 最大的 \(j\le|S_1|\),显然为 \(l_i\)\(r_i\),有

\[\max\{\operatorname{lcp}(i,j)\} = \max\{\operatorname{lcp}(l_i, i), \operatorname{lcp}(i, r_i)\} \]

则有:

\[ans = \max_{i=|S_1|+2}^{|S_1|+|S_2|+1}\max\{\operatorname{lcp}(l_i, i), \operatorname{lcp}(i, r_i)\} \]

先预处理,对 \(\operatorname{height}\) 建立 st 表。
\(l_i,r_i\) 可通过单调栈简单求得,计算答案时枚举后半段后缀,\(O(1)\) 查询 \(\operatorname{lcp}\) 即可。

总复杂度 \(O(n\log n)\) 级别。

一些细节:
\(l_i<1\) 时,该 \(l_i\) 不作出贡献,因为不存在这样的后缀。
\(r_i>|S|\) 时也没有贡献,这样的后缀已经属于后半段了。

SA

上面的算法二是自己 YY 的结果,看了题解之后,学习了以下这种,理论上最优的算法。

对于 \(sa_i,sa_{i-1}\),其 \(\operatorname{lcp} = \operatorname{height}_i\)
考虑 \(\operatorname{lcp}\) 的单调性,有一个显然的结论:

最长公共子串为:所有满足 \(sa_{i-1}, sa_i\) 分属 前/后 半段的 \(\operatorname{height}_i\) 的最大值。
即作为答案的 \(\operatorname{lcp}(l_i,i)\) (或 \(\operatorname{lcp}(i, r_i)\)),一定有 \(l_i=rk_{i}-1\)\(r_i=rk_i+1\)

证明考虑反证法。
\(ans=\operatorname{lcp}(l_i, i)\),且 \(l_i < rk_i-1\)
\(\operatorname{lcp}(l_i,i)=\min\limits_{j=l_i+1}^{rk_i}\operatorname{height}_j\),可知对于 \(\forall l_i<j<rk_i\)\(\operatorname{lcp}(l_i, sa_j)\ge \operatorname{lcp}(l_i,i) = ans\),取它们作为答案,答案不会变劣。
反证原结论成立,\(ans = \operatorname{lcp}(i,r_i)\) 同理。

SAM

对第一个串建 SAM,用第二个串从起始节点开始,在 SAM 上进行匹配。

若当前状态为 \(x\),如果有对应字符 \(s_i\) 的转移,直接转移即可,匹配长度 \(+1\)
如果没有对应转移,转移到 \(\operatorname{link}(x)\),匹配长度 \(=\operatorname{len}(x)+1\) 检查有无对应转移,若没有则继续转移到 \(\operatorname{link}(\operatorname{link}(x))\),直到存在对应转移。
若始终找不到对应转移,则从根开始重新匹配。

跳 parnet 树相当于失配指针,继续利用了已匹配的部分。
匹配过程中匹配的最长长度即为答案。

复杂度 \(O(n)\),实际运行效率也非常高。

代码

SAM

//知识点:SAM
/*
By:Luckyblock
*/
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstring>
#define ll long long
const int kMaxn = 3e5 + 10;
const int kMaxm = 26;
//=============================================================
char S[kMaxn];
int n, k, ans, last = 1, node_num = 1;
int ch[kMaxn << 1][kMaxm], len[kMaxn <<1], link[kMaxn << 1];
//=============================================================
inline int read() {
  int f = 1, w = 0;
  char ch = getchar();
  for (; !isdigit(ch); ch = getchar())
    if (ch == '-') f = -1;
  for (; isdigit(ch); ch = getchar()) w = (w << 3) + (w << 1) + (ch ^ '0');
  return f * w;
}
void GetMax(int &fir, int sec) {
  if (sec > fir) fir = sec;
}
void Insert(int c_) {
  int p = last, now = last = ++ node_num;
  len[now] = len[p] + 1;
  for (; p && ! ch[p][c_]; p = link[p]) ch[p][c_] = now;
  if (! p) {link[now] = 1; return ;} 
  int q = ch[p][c_];
  if (len[q] == len[p] + 1) {link[now] = q; return ;}
  int newq = ++ node_num;
  memcpy(ch[newq], ch[q], sizeof(ch[q]));  
  link[newq] = link[q], len[newq] = len[p] + 1; 
  link[q] = link[now] = newq; 
  for (; p && ch[p][c_] == q; p = link[p]) ch[p][c_] = newq;
}
void Work() {
  scanf("%s", S + 1);
  int n = strlen(S + 1), now = 1, l = 0;
  for (int i = 1; i <= n; ++ i) {
    while (now && ! ch[now][S[i] - 'a']) {
      now = link[now];
      l = len[now]; 
    }
    if (! now) {
      now = 1;
      l = 0;
      continue ;
    }
    ++ l;
    now = ch[now][S[i] - 'a'];
    if (l > k) GetMax(ans, l);
  }
}
//=============================================================
int main() {
//  k = read();
  scanf("%s", S + 1); n = strlen(S + 1);
  for (int i = 1; i <= n; ++ i) Insert(S[i] - 'a');
  Work();
  printf("%d", ans);
  return 0; 
}

SA

//知识点:SA
/*
By:Luckyblock
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ctype.h>
#define ll long long
const int kMaxn = 5e5 + 10;
//=============================================================
char S[kMaxn];
int n1, n, m, ans, cnt[kMaxn], id[kMaxn], rkid[kMaxn];
int sa[kMaxn], rk[kMaxn << 1], oldrk[kMaxn << 1], height[kMaxn];
int MaxHeight[kMaxn][20], Log2[kMaxn];
//=============================================================
inline int read() {
  int f = 1, w = 0; char ch = getchar();
  for (; !isdigit(ch); ch = getchar()) if (ch == '-') f = -1;
  for (; isdigit(ch); ch = getchar()) w = (w << 3) + (w << 1) + (ch ^ '0');
  return f * w;
}
void GetMax(int &fir, int sec) {
  if (sec > fir) fir = sec;
}
bool cmp(int x, int y, int w) { //判断两个子串是否相等。
  return oldrk[x] == oldrk[y] && 
         oldrk[x + w] == oldrk[y + w]; 
}
void GetHeight() {
  for (int i = 1, k = 0; i <= n; ++ i) {
    if (rk[i] == 1) k = 0;
    else {
      if (k > 0) k --;
      int j = sa[rk[i] - 1];
      while (i + k <= n && j + k <= n && 
             S[i + k] == S[j + k]) {
        ++ k;
      }
    }
    height[rk[i]] = k;
  }
}
void SuffixSort() {
  m = 300;
  for (int i = 1; i <= n; ++ i) ++ cnt[rk[i] = S[i]];
  for (int i = 1; i <= m; ++ i) cnt[i] += cnt[i - 1];
  for (int i = n; i >= 1; -- i) sa[cnt[rk[i]] --] = i;
  for (int p, w = 1; w < n; w <<= 1) {
    p = 0;
    for (int i = n; i > n - w; -- i) id[++ p] = i;
    for (int i = 1; i <= n; ++ i) {
      if (sa[i] > w) id[++ p] = sa[i] - w;
    }
    memset(cnt, 0, sizeof (cnt));
    for (int i = 1; i <= n; ++ i) ++ cnt[(rkid[i] = rk[id[i]])];
    for (int i = 1; i <= m; ++ i) cnt[i] += cnt[i - 1];
    for (int i = n; i >= 1; -- i) sa[cnt[rkid[i]] --] = id[i];
    std ::swap(rk, oldrk);
    m = 0;
    for (int i = 1; i <= n; ++ i) {
      m += (cmp(sa[i], sa[i - 1], w) ^ 1);
      rk[sa[i]] = m;
    }
  }
  GetHeight();
}
bool Judge(int x, int y) {
  return (x <= n1 && y > n1) || (x > n1 && y < n1);
}
//=============================================================
int main() {
  scanf("%s", S + 1); n1 = strlen(S + 1);
  S[n1 + 1] = 'z' + 1;
  scanf("%s", S + n1 + 1 + 1); n = strlen(S + 1);
  SuffixSort();
  for (int i = 2; i <= n; ++ i) {
    if (Judge(sa[i - 1], sa[i])) GetMax(ans, height[i]);
  }
  printf("%d", ans);
  return 0;
}

自己 YY 的 SA

//知识点:SA,单调栈
/*
By:Luckyblock
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ctype.h>
#define ll long long
const int kMaxn = 5e5 + 10;
//=============================================================
char S[kMaxn];
int n1, n, m, ans, cnt[kMaxn], id[kMaxn], rkid[kMaxn];
int sa[kMaxn], rk[kMaxn << 1], oldrk[kMaxn << 1], height[kMaxn];
int MaxHeight[kMaxn][20], Log2[kMaxn];
int top, st[kMaxn], L[kMaxn], R[kMaxn];
//=============================================================
inline int read() {
  int f = 1, w = 0; char ch = getchar();
  for (; !isdigit(ch); ch = getchar()) if (ch == '-') f = -1;
  for (; isdigit(ch); ch = getchar()) w = (w << 3) + (w << 1) + (ch ^ '0');
  return f * w;
}
void GetMax(int &fir, int sec) {
  if (sec > fir) fir = sec;
}
bool cmp(int x, int y, int w) {
  return oldrk[x] == oldrk[y] && 
         oldrk[x + w] == oldrk[y + w]; 
}
void GetHeight() {
  for (int i = 1, k = 0; i <= n; ++ i) {
    if (rk[i] == 1) k = 0;
    else {
      if (k > 0) k --;
      int j = sa[rk[i] - 1];
      while (i + k <= n && j + k <= n && 
             S[i + k] == S[j + k]) {
        ++ k;
      }
    }
    height[rk[i]] = k;
  }
}
int Query(int l_, int r_) {
  int k = Log2[r_ - l_ + 1];
  return std :: min(MaxHeight[l_][k], MaxHeight[r_ - (1 << k) + 1][k]);
}
void MakeSt() {
  for (int i = 2; i <= n; ++ i) MaxHeight[i][0] = height[i];
  for (int i = 2; i <= n; ++ i) {
    Log2[i] = Log2[i - 1] + ((1 << Log2[i - 1] + 1) == i);
  }
  for (int j = 1; j < 20; ++ j) {
    for (int i = 1; i + (1 << j) - 1 <= n; ++ i) {
      MaxHeight[i][j] = std :: min(MaxHeight[i][j - 1], 
                                   MaxHeight[i + (1 << j - 1)][j - 1]);
    }
  }
}
void SuffixSort() {
  m = 300;
  for (int i = 1; i <= n; ++ i) ++ cnt[rk[i] = S[i]];
  for (int i = 1; i <= m; ++ i) cnt[i] += cnt[i - 1];
  for (int i = n; i >= 1; -- i) sa[cnt[rk[i]] --] = i;
  for (int p, w = 1; w < n; w <<= 1) {
    p = 0;
    for (int i = n; i > n - w; -- i) id[++ p] = i;
    for (int i = 1; i <= n; ++ i) {
      if (sa[i] > w) id[++ p] = sa[i] - w;
    }
    memset(cnt, 0, sizeof (cnt));
    for (int i = 1; i <= n; ++ i) ++ cnt[(rkid[i] = rk[id[i]])];
    for (int i = 1; i <= m; ++ i) cnt[i] += cnt[i - 1];
    for (int i = n; i >= 1; -- i) sa[cnt[rkid[i]] --] = id[i];
    std ::swap(rk, oldrk);
    m = 0;
    for (int i = 1; i <= n; ++ i) {
      m += (cmp(sa[i], sa[i - 1], w) ^ 1);
      rk[sa[i]] = m;
    }
  }
  GetHeight();
  MakeSt();
}
//=============================================================
int main() {
  scanf("%s", S + 1); n1 = strlen(S + 1);
  S[n1 + 1] = 'z' + 1;
  scanf("%s", S + n1 + 1 + 1); n = strlen(S + 1);
  SuffixSort();
  //注意 l,r 存的是排名,枚举时按照字典序枚举后缀。
  for (int i = 1, now = 0; i <= n; ++ i) {
    if (sa[i] == n1 + 1) continue ;
    if (sa[i] > n1 + 1) {
      L[i] = now, st[++ top] = i;
      continue ;
    }
    for (; top; top --) R[st[top]] = i;
    now = i;
  }
  for (; top; top --) R[st[top]] = n1 + 1;
  for (int i = 1; i <= n; ++ i) {
    if (sa[i] <= n1 + 1) continue ;
    if (L[i]) GetMax(ans, Query(L[i] + 1, i));
    if (R[i] <= n1 + 1) GetMax(ans, Query(i + 1, R[i]));
  }
  printf("%d", ans);
  return 0;
}
/*
aabcab
adadeaf

aabcab{adadeaf
abcab{adadeaf
ab{adadeaf
adadeaf
adeaf
af
bcab{adadeaf
b{adadeaf
cab{adadeaf
dadeaf
deaf
eaf
f
{adadeaf
*/

Hash

//知识点:二分答案,Hash
/*
By:Luckyblock
*/
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstring>
#define LL long long
const int kMaxn = 3e5 + 10;
const LL kMod1 = 998244353;
const LL kMod2 = 1e9 + 9;
const LL kBase = 1145141;
//=============================================================
int n1, n2, ans, e_num, head[kBase + 10], ne[kMaxn << 1];
char s1[kMaxn], s2[kMaxn];
LL pow1[kMaxn], pow2[kMaxn];
LL has11[kMaxn], has12[kMaxn], has21[kMaxn], has22[kMaxn];
std::pair <LL, LL> val[kMaxn << 1];
//=============================================================
inline int read() {
  int f = 1, w = 0;
  char ch = getchar();
  for (; !isdigit(ch); ch = getchar())
    if (ch == '-') f = -1;
  for (; isdigit(ch); ch = getchar()) w = (w << 3) + (w << 1) + (ch ^ '0');
  return f * w;
}
void Chkmax(int &fir_, int sec_) {
  if (sec_ > fir_) fir_ = sec_;
}
void Chkmin(int &fir_, int sec_) {
  if (sec_ < fir_) fir_ = sec_;
}
void Insert(LL val_, LL v1_, LL v2_) {
  int pos = val_ % kBase + 1;
  for (int i = head[pos]; i; i = ne[i]) {
    if (val[i].first == v1_ && val[i].second == v2_) return ;
  }
  val[++ e_num] = std::make_pair(v1_, v2_);
  ne[e_num] = head[pos];
  head[pos] = e_num;
}
bool Count(LL val_, LL v1_, LL v2_) {
  int pos = val_ % kBase + 1;
  for (int i = head[pos]; i; i = ne[i]) {
    if (val[i].first == v1_ && val[i].second == v2_) return true; 
  }
  return false;
}
void Prepare() {
  scanf("%s", s1 + 1);
  scanf("%s", s2 + 1);
  n1 = strlen(s1 + 1);
  n2 = strlen(s2 + 1);
  pow1[0] = pow2[0] = 1;
  for (int i = 1; i < std::max(n1, n2); ++ i) {
    pow1[i] = pow1[i - 1] * kBase % kMod1;
    pow2[i] = pow2[i - 1] * kBase % kMod2;
  }
  for (int i = 1; i <= n1; ++ i) {
    has11[i] = (has11[i - 1] * kBase + s1[i]) % kMod1;
    has12[i] = (has12[i - 1] * kBase + s1[i]) % kMod2;
  }
  for (int i = 1; i <= n2; ++ i) {
    has21[i] = (has21[i - 1] * kBase + s2[i]) % kMod1;
    has22[i] = (has22[i - 1] * kBase + s2[i]) % kMod2;
  }
}
bool Check(int lth_) {
  e_num = 0;
  memset(head, 0, sizeof (head));
  for (int l = 1, r = lth_; r <= n1; ++ l, ++ r) {
    LL now_has11 = ((has11[r] - has11[l - 1] * pow1[r - l + 1] % kMod1) + kMod1) % kMod1;
    LL now_has12 = ((has12[r] - has12[l - 1] * pow2[r - l + 1] % kMod2) + kMod2) % kMod2;
    Insert(now_has11 * kMod2 + now_has12, now_has11, now_has12);
  }
  for (int l = 1, r = lth_; r <= n2; ++ l, ++ r) {
    LL now_has21 = ((has21[r] - has21[l - 1] * pow1[r - l + 1] % kMod1) + kMod1) % kMod1;
    LL now_has22 = ((has22[r] - has22[l - 1] * pow2[r - l + 1] % kMod2) + kMod2) % kMod2;
    if (Count(now_has21 * kMod2 + now_has22, now_has21, now_has22)) return true;
  }
  return false;
}
//=============================================================
int main() {
  // freopen("A.txt", "r", stdin);
  Prepare();
  for (int l = 1, r = std::min(n1, n2); l <= r; ) {
    int mid = (l + r) >> 1;
    if (Check(mid)) {
      ans = mid;
      l = mid + 1;
    } else {
      r = mid - 1;
    }
  }
  printf("%d\n", ans);
  return 0;
}
/*
opawmfawklmiosjcas1145141919810asopdfjawmfwaiofhauifhnawf
opawmdawlmioaszhcsan1145141919810bopdjawmdaw
*/
posted @ 2020-08-18 19:43  Luckyblock  阅读(364)  评论(0编辑  收藏  举报