SAM 二次学习笔记

SAM

review

板子

先复习一下 SAM 的构建过程吧。SAM 本质上是以 endpos 树为基础生成的一个 DAG。构建思路是这样的：对于一个字符串 \(w\)，先新建一个点 \(np\)，其对应的位置集合是 \(\{n\}\)，它的 \(len\) 显然应该等于当时的字符串长度（或者说是 \(len(last)+1\)）。但是此时原串的后缀不一定都只出现了一次，一个后缀出现多次当且仅当后缀丢弃最后一个元素之后仍然在次后缀中出现多次，也就是说一个点的位置集合包括 \(n-1\) 并且有 \(w\) 的出边。所以构建方法只需要沿着 \(p\) 往上走，走到第一个如此的点，这样的点加上 \(w\) 大概率是 \(np\) 的父亲，因为它的 \(endpos\) 集合是最小的（毕竟越往上越大）。此时判断出去的点 \(q\)，由于呢我们知道的是 \(p+w\) 在后缀中出现，而 \(q\) 的 \(len\) 可能不等于 \(len(p)+1\)，也就是说 \(q\) 中不一定所有的子串都是当前的后缀，所以需要分化出来一个 \(nq\)。\(nq\) 由于相较于 \(np\) 多了一些位置，而相较于 \(q\) 又多了一个 \(n\)，所以它同时是 \(q\) 和 \(np\) 的父亲。代码：

// Appends character c to the suffix automaton: creates the state for the whole
// new prefix, then walks the suffix-link chain of the previous last state to
// add transitions and pick the new state's link, cloning a state when needed.
inline void insert(int c){
	const int cur=++cnt;
	int up=lastCnt;
	lastCnt=cur;
	f[cur]=1;
	t[cur].len=t[up].len+1;
	// Every state on the chain that lacks a c-transition now reaches cur.
	while(up&&t[up].nxt[c]==0){
		t[up].nxt[c]=cur;
		up=t[up].fa;
	}
	if(up==0){
		// Walked past the root: the root (state 1) becomes the link target.
		t[cur].fa=1;
		return;
	}
	const int old=t[up].nxt[c];
	if(t[old].len==t[up].len+1){
		t[cur].fa=old;
		return;
	}
	// old is too long to serve as the link: split off a clone with the
	// shorter len that inherits old's transitions and link.
	const int clone=++cnt;
	t[clone]=t[old];
	t[clone].len=t[up].len+1;
	t[old].fa=clone;
	t[cur].fa=clone;
	while(up&&t[up].nxt[c]==old){
		t[up].nxt[c]=clone;
		up=t[up].fa;
	}
}

一些技巧

然后有些技巧。主要就是为了减小常数用的，就是由于一个点的父亲的 \(len\) 一定比自己要短，然后 \(nxt\) 其实也是一个道理，所以我们可以构建完 SAM 之后按 \(len\) 粗略地给节点排序，这样一来在 DP 的时候就只需要从后往前就可以了，就不需要用 dfs 了，减小了一些常数。

板子部分

P3804

题意：求子串出现次数乘长度的最大值。

根据 endpos 的定义可以在每个第一次插入的字符串处给 \(f\) 为 \(1\)，剩下的点的 \(f\) 值可以由孩子们去更新。然后就可以暴力的去找 \(len_x\times f_x\) 的最大值了，复杂度线性。

// Extends the automaton by one character c. The freshly created state gets
// f = 1; cloned states keep whatever f[] already holds for their index.
inline void insert(int c){
	int prev=lastCnt;
	const int fresh=++cnt;
	lastCnt=fresh;
	t[fresh].len=t[prev].len+1;
	f[fresh]=1;
	for(;prev!=0;prev=t[prev].fa){
		if(t[prev].nxt[c]!=0)break;
		t[prev].nxt[c]=fresh;
	}
	if(prev==0){
		t[fresh].fa=1;
		return;
	}
	const int target=t[prev].nxt[c];
	if(t[target].len==t[prev].len+1){
		t[fresh].fa=target;
		return;
	}
	// Clone target with the shorter length and re-hang both states below it.
	const int copy=++cnt;
	t[copy]=t[target];
	t[copy].len=t[prev].len+1;
	t[target].fa=copy;
	t[fresh].fa=copy;
	for(;prev!=0;prev=t[prev].fa){
		if(t[prev].nxt[c]!=target)break;
		t[prev].nxt[c]=copy;
	}
}
int m;
char w[N];
// Post-order walk over the tree stored in head/e: sums the children's f into
// f[wh], then updates ans with len*f for every node whose f is not 1.
inline void solve(int wh){
	for(int i=head[wh];i!=0;i=e[i].nxt){
		const int child=e[i].t;
		solve(child);
		f[wh]+=f[child];
	}
	if(f[wh]!=1){
		ans=max(ans,(ll)t[wh].len*f[wh]);
	}
}
// Reads the string, builds the automaton, links every non-root state under
// its suffix-link parent, and prints the best len*f found by solve.
signed main(){
	scanf("%s",w+1);
	m=strlen(w+1);
	for(int pos=1;pos<=m;++pos){
		insert(w[pos]-'a');
	}
	for(int node=2;node<=cnt;++node){
		add(t[node].fa,node);
	}
	solve(1);
	printf("%lld\n",ans);
}

P4070

题意：求每个前缀本质不同的子串个数。

你考虑 SA 是怎么解决这个问题的。式子 \(\text{ans}=\dfrac{m(m+1)}{2}-\sum \text{height}_x\) 的含义其实就是求每个后缀的所有前缀中究竟有多少个在之前出现过。SAM 也可以用相似的方法，考虑当前前缀的所有后缀中之前从来没有出现过的串有多少，转化成问当前 \(\text{endpos}=\{n\}\) 的串有多少，直接在原来的基础上加上 \(len_{np}-len_{fa(np)}\) 即可。

// Adds one symbol c to the automaton (no occurrence counter in this variant).
inline void insert(int c){
	const int created=++cnt;
	int walker=lastCnt;
	lastCnt=created;
	t[created].len=t[walker].len+1;
	while(walker&&t[walker].nxt[c]==0){
		t[walker].nxt[c]=created;
		walker=t[walker].fa;
	}
	if(!walker){
		t[created].fa=1;
		return;
	}
	const int hit=t[walker].nxt[c];
	if(t[hit].len==t[walker].len+1){
		t[created].fa=hit;
		return;
	}
	// hit holds strings longer than len(walker)+1, so split it.
	const int split=++cnt;
	t[split]=t[hit];
	t[split].len=t[walker].len+1;
	t[hit].fa=split;
	t[created].fa=split;
	while(walker&&t[walker].nxt[c]==hit){
		t[walker].nxt[c]=split;
		walker=t[walker].fa;
	}
}
// After each inserted symbol, adds len(last)-len(fa(last)) to the running
// total and prints it.
signed main(){
	int m,in;
	read(m);
	while(m--){
		read(in);
		insert(in);
		const int tail=lastCnt;
		ans+=t[tail].len-t[t[tail].fa].len;
		printf("%lld\n",ans);
	}
}

P3975

题意：求第 \(k\) 小的子串。有去重和不去重的两种问法。

考虑 SAM 任意从根出发的路径都是原串子串的性质，可以考虑 DP 出从某个节点出发的串的个数，记为 \(g_{x}\)。显然 \(g_{x}=f_{x}+\sum\limits_{y\in son_x}g_y\)，而 \(f\) 则可以分成两类，如果去重的话答案就是 \(1\)，否则就是 \(|\text{endpos}_x|\)。然后就利用这玩意去剪枝地遍历即可。注意 \(g\) 要和 \(\text{inf}\) 动态取更小值，虽然说理论上界不会特别大。

// Appends character c; the new prefix state starts with f = 1.
void insert(int c){
	const int cur=++cnt;
	int up=lastCnt;
	lastCnt=cur;
	f[cur]=1;
	t[cur].len=t[up].len+1;
	for(;up&&!t[up].nxt[c];up=t[up].fa){
		t[up].nxt[c]=cur;
	}
	if(up==0){
		t[cur].fa=1;
		return;
	}
	const int old=t[up].nxt[c];
	if(t[old].len==t[up].len+1){
		t[cur].fa=old;
		return;
	}
	// Split old: the clone takes over the strings of length <= len(up)+1.
	const int clone=++cnt;
	t[clone]=t[old];
	t[clone].len=t[up].len+1;
	t[old].fa=clone;
	t[cur].fa=clone;
	for(;up&&t[up].nxt[c]==old;up=t[up].fa){
		t[up].nxt[c]=clone;
	}
}
int pl[N],id[N];
char ans[N];int nowCnt;
void print(int wh){
	if(wh!=1&&f[wh]>=n){printf("%s",ans+1);exit(0);}
	if(wh^1)n-=f[wh];
	for(int i=0,th;i<26;i++){
		if((th=t[wh].nxt[i])==0)continue;
		if(g[th]>=n)ans[++nowCnt]=(char)(i+'a'),print(th);
		else n-=g[th];
	}
}
signed main(){
	scanf("%s",w+1);m=strlen(w+1);
	for(int i=1;i<=m;i++)insert(w[i]-'a');
	read(type);read(n);
	for(int i=1;i<=cnt;i++)pl[t[i].len]++;
	for(int i=1;i<=m;i++)pl[i]+=pl[i-1];
	for(int i=cnt;i;i--)id[pl[t[i].len]--]=i;
	if(type)for(int x=cnt,wh;x;x--)wh=id[x],f[t[wh].fa]+=f[wh];
	else for(int i=1;i<=cnt;i++)f[i]=1;
	for(int x=cnt;x;x--){
		int wh=id[x];g[wh]=f[wh];
		for(int i=0,th;i<26;i++){
			if((th=t[wh].nxt[i])==0)continue;
			g[wh]=min(inf,g[wh]+g[th]);
		}
	}
	print(1);puts("-1");
}

P5341

题意：找出那些出现特定次数的子串。

非常非常板子。在 \(\text{endpos}\) 树上求出 \(f\)，然后满足这个 \(f\) 值的长度区间应该是 \((len(fa),len(x)]\)，差分维护一下即可。

// Appends character w to the automaton; the new prefix state gets f = 1.
inline void insert(int w){
	int prev=lastCnt;
	const int fresh=++nowCnt;
	lastCnt=fresh;
	t[fresh].len=t[prev].len+1;
	f[fresh]=1;
	while(prev&&t[prev].nxt[w]==0){
		t[prev].nxt[w]=fresh;
		prev=t[prev].fa;
	}
	if(prev==0){
		t[fresh].fa=1;
		return;
	}
	const int target=t[prev].nxt[w];
	if(t[target].len==t[prev].len+1){
		t[fresh].fa=target;
		return;
	}
	// Clone target at the shorter length and hang both states under the clone.
	const int copy=++nowCnt;
	t[copy]=t[target];
	t[copy].len=t[prev].len+1;
	t[target].fa=copy;
	t[fresh].fa=copy;
	while(prev&&t[prev].nxt[w]==target){
		t[prev].nxt[w]=copy;
		prev=t[prev].fa;
	}
}
int pl[N],id[N],want,c[N];char w[N];
void solve(){
	for(int i=1;i<=nowCnt;i++)t[i]=newone;
	nowCnt=lastCnt=1;
	scanf("%s",w+1);read(want);int m=strlen(w+1);
	memset(f,0,sizeof(f));
	memset(c,0,sizeof(c));
	memset(pl,0,sizeof(pl));
	for(int i=1;w[i];i++)insert(w[i]-'a');
	for(int i=1;i<=nowCnt;i++)++pl[t[i].len];
	for(int i=1;i<=m;i++)pl[i]+=pl[i-1];
	for(int i=nowCnt;i;i--)id[pl[t[i].len]--]=i;
	for(int i=nowCnt;i;i--){
		int wh=id[i];f[t[wh].fa]+=f[wh];
		if(f[wh]==want)c[t[t[wh].fa].len+1]++,c[t[wh].len+1]--;
	}
	int ans=0;
	for(int i=1;i<=m;i++)c[i]+=c[i-1],c[i]>=c[ans]&&(ans=i);
	printf("%d\n",c[ans]?ans:-1);
}

P4341

题意：按字典序输出出现超过一次的子串的出现次数。

还是按套路求出 \(f\) 值之后在自动机上按字典序遍历输出即可。

// Appends character w to the automaton; the new prefix state gets f = 1.
inline void insert(int w){
	int np=++nowCnt,p=lastCnt;lastCnt=nowCnt;
	f[np]=1;t[np].len=t[p].len+1;
	// The p&& guard stops the walk at the top of the link chain. Without it
	// the loop only terminated by writing a transition into sentinel state 0.
	for(;p&&t[p].nxt[w]==0;p=t[p].fa)t[p].nxt[w]=np;
	if(p==0)return t[np].fa=1,void();
	int q=t[p].nxt[w];
	if(t[q].len==t[p].len+1)return t[np].fa=q,void();
	// Split q: the clone nq keeps q's transitions and link with a shorter len.
	int nq=++nowCnt;t[nq]=t[q];t[nq].len=t[p].len+1;
	t[q].fa=t[np].fa=nq;
	for(;p&&t[p].nxt[w]==q;p=t[p].fa)t[p].nxt[w]=nq;
}
inline void dfs(int wh){
	if(wh!=1&&f[wh]!=1)printf("%d\n",f[wh]);
	if(t[wh].nxt[0])dfs(t[wh].nxt[0]);
	if(t[wh].nxt[1])dfs(t[wh].nxt[1]);
}
signed main(){
	read(m);scanf("%s",w+1);
	for(int i=1;i<=m;i++)insert(w[i]-'0');
	for(int i=1;i<=nowCnt;i++)pl[t[i].len]++;
	for(int i=1;i<=m;i++)pl[i]+=pl[i-1];
	for(int i=nowCnt;i;i--)id[pl[t[i].len]--]=i;
	for(int i=nowCnt,wh;i;i--)wh=id[i],f[t[wh].fa]+=f[wh];
	dfs(1);
}

LCS - Longest Common Substring

题意：给定两个串，求最长公共子串长度。

等价于对于 \(s,t\)，求 \(s\) 的每个前缀的最长后缀使得该后缀是 \(t\) 的子串。可以考虑在 \(t\) 上遍历 \(s\)，如果当前点有对应出边，则说明可以通过这条出边使得后缀拓展一个字符，也就是长度加一；如果没有，则一直向上跳，跳到有出边或者空点的情况。如果是空点，那么把长度重载为 \(0\)，否则跳到下一个节点并把长度重载为之前那个节点的 \(len\) 加一。复杂度线性。

// Appends character w to the automaton (structure only, no counters).
inline void insert(int w){
	const int created=++nowCnt;
	int walker=lastCnt;
	lastCnt=created;
	t[created].len=t[walker].len+1;
	for(;walker&&!t[walker].nxt[w];walker=t[walker].fa){
		t[walker].nxt[w]=created;
	}
	if(!walker){
		t[created].fa=1;
		return;
	}
	const int hit=t[walker].nxt[w];
	if(t[hit].len==t[walker].len+1){
		t[created].fa=hit;
		return;
	}
	// Split hit at length len(walker)+1.
	const int split=++nowCnt;
	t[split]=t[hit];
	t[split].len=t[walker].len+1;
	t[hit].fa=split;
	t[created].fa=split;
	for(;walker&&t[walker].nxt[w]==hit;walker=t[walker].fa){
		t[walker].nxt[w]=split;
	}
}
// Builds the automaton of the first string, then feeds the second string
// through it while tracking the current matched length; prints the maximum.
signed main(){
	scanf("%s",w+1);
	for(int pos=1;w[pos];++pos)insert(w[pos]-'a');
	scanf("%s",w+1);
	int ans=0;
	int wh=1;
	int len=0;
	for(int pos=1;w[pos];++pos){
		const int now=w[pos]-'a';
		if(t[wh].nxt[now]){
			// Direct extension of the current match.
			wh=t[wh].nxt[now];
			++len;
		}else{
			// Shorten the match along suffix links until the character fits.
			while(wh&&t[wh].nxt[now]==0)wh=t[wh].fa;
			if(wh==0){
				wh=1;
				len=0;
			}else{
				len=t[wh].len+1;
				wh=t[wh].nxt[now];
			}
		}
		ans=max(ans,len);
	}
	printf("%d\n",ans);
}

应用

Security

题意：给定字符串 \(S\)，每次询问给定 \(l,r,t\)，求严格大于 \(t\) 且是 \(S_{l\dots r}\) 子串的最小串。

首先答案应该是这样的一个形式：先是一个 \(S\) 的前缀，然后在这个前缀的后面拼上一个尽量小的字符。这个前缀显然也要是中间那段的子串，所以大可直接找到最大的 \(len\)，使得 \(S\) 长度为 \(len\) 的前缀是子串。这样一来，我们就只需要找到最大的 \(p\le len\)，使得存在字符 \(c>t_{p+1}\) 并且 \(t_{1\dots p}+c\) 是原串的子串，枚举即可。

然后问题就只剩下了如何快速判断一个子串是否是某个区间的子串。可以离线之后用妙的方法去做，也可以对于 SAM 上每个节点维护一个集合，也就是 endpos。但考虑到这玩意可能很大（比如一个全是 a 的串，建出来的 endpos 集合就非常非常大），没法直接用 set 来维护，所以采用了线段树合并的方式。需要注意的是和普通的线段树合并不同的是，由于被合并的线段树也会被查询，所以要采用新建节点而不是单纯偏加的方式。而这就可能带来空间上的问题，可以感性证明一下。由于 endpos 树的性质，所以合并的线段树的值域之间有交集为空的性质，所以需要合并的地方其实不会太多。

还有一个点是说如何判断一个串到底是否合法。假设当前的串长度为 \(x\)，那么首先这个串的结束点不能大于 \(r\)，同时这个串的起始点不能小于 \(l\)，也就是结束点不能小于 \(l+x-1\)，在查询的时候要注意一点。另外在我的写法下要注意在找最后加的那个点时 \(l\) 的限制要加一。然后就没有什么了。

// Dynamically allocated segment tree over positions. A node exists only if
// its interval contains at least one stored position; index 0 means "empty".
namespace Tree{
	struct node{int left,right;}t[N*40];
	int cnt;
	// Inserts position pl into the tree rooted at wh covering [l,r].
	// Returns the root, allocating it if wh was 0.
	inline int insert(int wh,int l,int r,int pl){
		if(wh==0)wh=++cnt;
		if(l==r)return wh;
		const int half=(l+r)>>1;
		if(pl<=half)t[wh].left=insert(t[wh].left,l,half,pl);
		else t[wh].right=insert(t[wh].right,half+1,r,pl);
		return wh;
	}
	// Returns a tree holding the union of x and y without modifying either:
	// a fresh node is allocated wherever both inputs are non-empty, so trees
	// that were merged into a parent can still be queried afterwards.
	inline int merge(int x,int y,int l,int r){
		if(x==0||y==0)return x+y;
		// Both leaves exist: either one represents the position. The original
		// returned x+y here, which is not a meaningful node index.
		if(l==r)return x;
		const int wh=++cnt;
		const int half=(l+r)>>1;
		t[wh].left=merge(t[x].left,t[y].left,l,half);
		t[wh].right=merge(t[x].right,t[y].right,half+1,r);
		return wh;
	}
	// Reports whether the tree rooted at wh stores any position in [wl,wr].
	inline bool work(int wh,int l,int r,int wl,int wr){
		if(wh==0)return false;
		if(wl<=l&&r<=wr)return true;
		const int half=(l+r)>>1;
		if(wl<=half&&work(t[wh].left,l,half,wl,wr))return true;
		if(wr>half&&work(t[wh].right,half+1,r,wl,wr))return true;
		return false;
	}
}
namespace SAM{
	struct node{int nxt[26],fa,len;}t[N];
	int nowCnt=1,lastCnt=1;int rt[N];
	inline void insert(int w){
		int np=++nowCnt,p=lastCnt;lastCnt=nowCnt;
		t[np].len=t[p].len+1;
		rt[np]=Tree::insert(rt[np],1,m,t[np].len);
		for(;p&&t[p].nxt[w]==0;p=t[p].fa)t[p].nxt[w]=np;
		if(p==0)return t[np].fa=1,void();
		int q=t[p].nxt[w];
		if(t[q].len==t[p].len+1)return t[np].fa=q,void();
		int nq=++nowCnt;t[nq]=t[q];t[nq].len=t[p].len+1;
		t[np].fa=t[q].fa=nq;
		for(;p&&t[p].nxt[w]==q;p=t[p].fa)t[p].nxt[w]=nq;
	}
	int pl[N],id[N];
	inline void build(){
		for(int i=1;i<=nowCnt;i++)pl[t[i].len]++;
		for(int i=1;i<=m;i++)pl[i]+=pl[i-1];
		for(int i=nowCnt;i;i--)id[pl[t[i].len]--]=i;
		for(int i=nowCnt;i;i--){
			int wh=id[i];
			rt[t[wh].fa]=Tree::merge(rt[t[wh].fa],rt[wh],1,m);
		}
	}
	inline bool get(int wh,int l,int r){
		if(wh==0||l>r)return false;
		return Tree::work(rt[wh],1,m,l,r);
	}
}
int pl[N];
// Answers one query (l, r, pattern in w). For every matched prefix length i,
// pl[i] is the smallest letter greater than w[i+1] that can follow that
// prefix inside the range, or -1. The longest usable prefix gives the answer.
inline void solve(){
	int l,r;
	read(l);
	read(r);
	scanf("%s",w+1);
	int len=strlen(w+1);
	int wh=1;
	int able=0;
	for(int i=0;i<=len;++i)pl[i]=-1;
	// Empty prefix: look for a first letter greater than w[1].
	for(int j=max(w[1]-'a',-1)+1;j<26;++j){
		if(SAM::get(SAM::t[wh].nxt[j],l,r)){
			pl[0]=j;
			break;
		}
	}
	for(int i=1;i<=len;++i){
		const int now=w[i]-'a';
		wh=SAM::t[wh].nxt[now];
		// The prefix of length i must end no earlier than l+i-1.
		if(!SAM::get(wh,l+i-1,r))break;
		// w[i+1] is '\0' at i == len, which makes the scan start from 'a'.
		for(int j=max(w[i+1]-'a',-1)+1;j<26;++j){
			if(SAM::get(SAM::t[wh].nxt[j],l+i,r)){
				pl[i]=j;
				break;
			}
		}
		able=i;
	}
	while(able>=0&&pl[able]==-1)--able;
	if(able<0){
		puts("-1");
		return;
	}
	for(int i=1;i<=able;++i)putchar(w[i]);
	putchar('a'+pl[able]);
	putchar('\n');
}
// Builds the automaton and position trees for the text, then answers n queries.
signed main(){
	scanf("%s",w+1);
	m=strlen(w+1);
	for(int pos=1;pos<=m;++pos){
		SAM::insert(w[pos]-'a');
	}
	SAM::build();
	read(n);
	while(n--){
		solve();
	}
}

P4770

题意：给定串 \(S\)，每次询问给出 \(T,l,r\)，求有多少 \(T\) 本质不同的子串不是 \(S_{l\dots r}\) 的子串。

答案应该是 \(T\) 的不同子串个数减去在 \(S\) 中的数量。于是对于 \(T\) 的一个前缀对应的节点 \(x\)，这个前缀贡献的子串个数显然是 \(len_x-len_{fa}\) 个，假设该前缀和 \(S\) 的匹配度为 \(t\)，那么答案应该就是 \(len_x-\max(len_{fa},t)\)。于是问题就转变成了如何求 \(t\)。

参照上面两道题的方法。首先是如何求的问题，可以和上面一样地逐个遍历 \(T\) 中字符，如果当前在 \(S\) 的后缀自动机中的节点是 \(x\)，如果 \(x\) 有对应字符的出边，那么 \(len\leftarrow len+1,x\leftarrow son_w\)；否则就尝试减少长度，和原问题不同的是，由于这个问题下有可能是该节点可以匹配但由于长度限制被卡了，所以要做的是尝试 \(len\leftarrow len-1\)，如果可以匹配了那很好，如果已经到达当前节点的 \(len\) 的下界（也就是 \(len_{fa}\)）了都还没匹配上就跳到父亲。复杂度均摊应该是对的。至于如何判断一个节点合法，就是和上面那道题一样，对 \(S\) 中每个节点用线段树合并维护 endpos 集合，到时候区间查询即可。

不是很懂题解为什么要维护区间最大值以及为什么要对每个节点都计算答案，因为我只对每个 \(T\) 前缀计算答案交上去是可以过的。而且感觉上面的复杂度没有问题啊，为啥还有人提问题捏。

// Dynamically allocated segment tree over positions; index 0 means "empty".
namespace Tree{
	struct node{int left,right;}t[N*40];
	int cnt;
	// Inserts position pl into the tree rooted at wh covering [l,r] and
	// returns the root (allocated on demand).
	inline int insert(int wh,int l,int r,int pl){
		if(wh==0)wh=++cnt;
		if(l==r)return wh;
		const int half=(l+r)>>1;
		if(pl<=half)t[wh].left=insert(t[wh].left,l,half,pl);
		else t[wh].right=insert(t[wh].right,half+1,r,pl);
		return wh;
	}
	// Non-destructive union of x and y: fresh nodes are created wherever both
	// inputs are non-empty, so the inputs stay valid for later queries.
	inline int merge(int x,int y,int l,int r){
		if(x==0||y==0)return x+y;
		// Both leaves exist: return one of them. The original returned x+y,
		// which is not a meaningful node index.
		if(l==r)return x;
		const int wh=++cnt;
		const int half=(l+r)>>1;
		t[wh].left=merge(t[x].left,t[y].left,l,half);
		t[wh].right=merge(t[x].right,t[y].right,half+1,r);
		return wh;
	}
	// Reports whether any stored position lies in [wl,wr].
	inline bool work(int wh,int l,int r,int wl,int wr){
		if(wh==0)return false;
		if(wl<=l&&r<=wr)return true;
		const int half=(l+r)>>1;
		if(wl<=half&&work(t[wh].left,l,half,wl,wr))return true;
		if(wr>half&&work(t[wh].right,half+1,r,wl,wr))return true;
		return false;
	}
}
// Shared 1-indexed input buffer: SAM::build reads the text into it and
// sam::main later overwrites it with each query string.
char w[N];
namespace SAM{
	int m;
	struct node{int nxt[26],fa,len;}t[N];
	int cnt=1,last=1,rt[N];
	inline void insert(int w){
		int np=++cnt,p=last;last=cnt;t[np].len=t[p].len+1;
		rt[np]=Tree::insert(0,1,m,t[np].len);
		for(;p&&t[p].nxt[w]==0;p=t[p].fa)t[p].nxt[w]=np;
		if(p==0)return t[np].fa=1,void();int q=t[p].nxt[w];
		if(t[q].len==t[p].len+1)return t[np].fa=q,void();
		int nq=++cnt;t[nq]=t[q];t[nq].len=t[p].len+1;t[np].fa=t[q].fa=nq;
		for(;p&&t[p].nxt[w]==q;p=t[p].fa)t[p].nxt[w]=nq;
	}
	int pl[N],id[N];
	inline void build(){
		scanf("%s",w+1);m=strlen(w+1);
		for(int i=1;i<=m;i++)insert(w[i]-'a');
		for(int i=1;i<=cnt;i++)pl[t[i].len]++;
		for(int i=1;i<=m;i++)pl[i]+=pl[i-1];
		for(int i=cnt;i;i--)id[pl[t[i].len]--]=i;
		for(int i=cnt;i>1;i--){int wh=id[i];rt[t[wh].fa]=Tree::merge(rt[t[wh].fa],rt[wh],1,m);}
	}
	inline bool check(int wh,int l,int r){
		if(wh==0||l>r)return false;
		return Tree::work(rt[wh],1,m,l,r);
	}
}

// Per-query automaton of T plus the matching of T against S[l..r].
namespace sam{
	struct node{
		int nxt[26],fa,len;
	}t[N],newone;
	int cnt,last,p[N],m;
	// Clears the states used by the previous query and resets to a lone root.
	inline void init(){
		for(int s=1;s<=cnt;++s){
			t[s]=newone;
			p[s]=0;
		}
		cnt=last=1;
	}
	// Appends character w to T's automaton.
	inline void insert(int w){
		const int cur=++cnt;
		int up=last;
		last=cur;
		t[cur].len=t[up].len+1;
		while(up&&t[up].nxt[w]==0){
			t[up].nxt[w]=cur;
			up=t[up].fa;
		}
		if(up==0){
			t[cur].fa=1;
			return;
		}
		const int old=t[up].nxt[w];
		if(t[old].len==t[up].len+1){
			t[cur].fa=old;
			return;
		}
		const int clone=++cnt;
		t[clone]=t[old];
		t[clone].len=t[up].len+1;
		t[old].fa=clone;
		t[cur].fa=clone;
		while(up&&t[up].nxt[w]==old){
			t[up].nxt[w]=clone;
			up=t[up].fa;
		}
	}
	int l,r;
	// Advances the match (state wh in S's automaton, matched length len) by
	// character w. The length is shortened one step at a time, moving to the
	// suffix-link parent when it reaches the parent's len, until the extended
	// string has an end position in [l+len, r] or the length drops to 0.
	inline void find(int &wh,int &len,int w){
		for(;;){
			const int to=SAM::t[wh].nxt[w];
			if(SAM::check(to,l+len,r)){
				++len;
				wh=to;
				return;
			}
			if(len==0)return;
			--len;
			const int up=SAM::t[wh].fa;
			if(len==SAM::t[up].len)wh=up;
		}
	}
	// Handles one query: reads T, l, r and prints the accumulated answer.
	void main(){
		init();
		scanf("%s",w+1);
		read(l);
		read(r);
		m=strlen(w+1);
		long long ans=0;
		int wh=1;
		for(int i=1;i<=m;++i){
			const int c=w[i]-'a';
			p[i]=p[i-1];
			find(wh,p[i],c);
			insert(c);
			// Suffixes of this prefix longer than both the link length and
			// the matched length are counted.
			const int lower=max(t[t[last].fa].len,p[t[last].len]);
			ans+=max(0,t[last].len-lower);
		}
		printf("%lld\n",ans);
	}
}
// Builds the automaton of S once, then processes each query.
signed main(){
	SAM::build();
	int test;
	read(test);
	while(test--){
		sam::main();
	}
}

P5284

题意：给定字符串 \(S\) 以及两类区间，再给定一些从第一类区间连向第二类区间的边，一个第二类区间能连向一个第一类区间当且仅当前者是后者的前缀。每个第一类区间的价值是区间长度，求这张图上的最长路（或判断无限长）。

首先有一个非常关键的问题是说，如何利用 SAM 快速判断一个串是否是另一个串的子串。可以弱化问题成为求两个后缀的最长公共前缀。后缀数组的做法非常简单，用 height 数组 RMQ 即可。而 SAM 上也可以处理这个问题，只不过需要转化一下，转化成求两个前缀的最长公共后缀。这个公共后缀既然在两个地方都出现了，就说明这个后缀对应的节点的 endpos 应该是包括了 \(x\) 和 \(y\) 的。而由于 endpos 的性质，这个点一定是两个点的公共祖先，要找最长的就只需要深度最大即可（毕竟在 endpos 树上越深的节点 \(len\) 越大），也就是两点的 LCA。

重新审视一下“前者是后者的前缀”这一限制条件。可以使用上面的那种思路，把一个区间对应的字符串映射到 SAM 上的一个节点，具体方法是先建立反串的 SAM，对于区间 \([l,r]\)，可以对 \(l\) 对应的节点向上倍增，找到最浅的点使得 \(len(x)\ge len',len(fa(x))<len'\)。由上面的思路可以知道所有映射到该节点及其子孙的节点包含了这个点。然后就可以遍历这棵树，加边，最后尝试着给生成的有向图拓扑排序即可。

然后就是如何优化建边。上面也说了，由于一个节点一定是它子孙对应节点的前缀，所以如果一个 \(B\) 类串能向一个 \(A\) 连边，那么它的所有祖先都是可以的。于是就只需要顺着 endpos 树的结构去连边即可，需要注意的是由于一个节点可能对应多个串，而同节点的串之间又肯定存在着包含关系，所以只需要按长度排序，同长度 \(B\) 在前 \(A\) 在后即可。

#include<bits/stdc++.h>
//#define feyn
#define LL long long
const int N=400010;
using namespace std;
// Reads one decimal integer from stdin into wh. Characters before the first
// digit are skipped; a '-' among them makes the result negative.
// If the input ends before any digit is seen, wh is left as 0 (the original
// looped forever in that case).
inline void read(int &wh){
	wh=0;
	int sign=1;
	int ch=getchar();
	while(ch!=EOF&&(ch<'0'||ch>'9')){
		if(ch=='-')sign=-1;
		ch=getchar();
	}
	while(ch>='0'&&ch<='9'){
		wh=wh*10+(ch-'0');
		ch=getchar();
	}
	wh*=sign;
}

// Table filled once in main via lg[i]=lg[i>>1]+1; Tree::dfs and Tree::get
// index it by node depth to bound the number of binary-lifting levels.
int lg[N];

// Directed graph over the query strings; main() runs a topological longest
// path where node i contributes weight v[i].
namespace G{
	int cnt;
	struct edge{int t,nxt;}e[N<<1];
	int head[N],esum;
	// Adds the edge fr -> to; endpoint 0 means "no node" and is ignored.
	// The endpoints are tested separately: the original fr*to==0 overflowed
	// int for large ids (e.g. 131072*32768) and silently dropped real edges.
	inline void add(int fr,int to){
		if(fr==0||to==0)return;
		++esum;
		e[esum].t=to;
		e[esum].nxt=head[fr];
		head[fr]=esum;
	}
	int v[N],q[N],ll,rr,d[N];LL f[N];
	// Resets all per-test-case state.
	void init(){
		memset(head,0,sizeof(head));
		memset(v,0,sizeof(v));
		memset(d,0,sizeof(d));
		memset(f,0,sizeof(f));
		esum=0;
	}
	// Kahn-style topological pass computing the heaviest path. Prints -1 if
	// some node never reaches in-degree 0 (a cycle), otherwise the maximum f.
	void main(){
		for(int i=1;i<=esum;++i)++d[e[i].t];
		ll=1;
		rr=0;
		for(int i=1;i<=cnt;++i){
			if(d[i]==0)q[++rr]=i;
		}
		while(ll<=rr){
			const int wh=q[ll++];
			f[wh]+=v[wh];
			for(int i=head[wh];i;i=e[i].nxt){
				const int th=e[i].t;
				--d[th];
				f[th]=max(f[th],f[wh]);
				if(d[th]==0)q[++rr]=th;
			}
		}
		for(int i=1;i<=cnt;++i){
			if(d[i]){
				puts("-1");
				return;
			}
		}
		LL ans=0;
		for(int i=1;i<=cnt;++i)ans=max(ans,f[i]);
		printf("%lld\n",ans);
	}
}

// Suffix-link tree of the reversed string's automaton. Each query interval is
// located on a tree node, and edges between the intervals are added to G.
namespace Tree{
	int cnt;
	struct edge{int t,nxt;}e[N];
	int head[N],esum;
	// Adds the tree edge fr -> to; endpoint 0 (the root's parent) is ignored.
	// Endpoints are tested separately because fr*to overflows int for large ids.
	inline void add(int fr,int to){
		if(fr==0||to==0)return;
		++esum;
		e[esum].t=to;
		e[esum].nxt=head[fr];
		head[fr]=esum;
	}
	int id[N],bg[N],ed[N],pl[N];
	int nxt[N][23],d[N],len[N];
	// Fills depth d[] and the binary-lifting table nxt[][] below wh.
	void dfs(int wh,int fa){
		d[wh]=d[fa]+1;nxt[wh][0]=fa;
		for(int i=1;i<=lg[d[wh]];i++)nxt[wh][i]=nxt[nxt[wh][i-1]][i-1];
		for(int i=head[wh];i;i=e[i].nxt)dfs(e[i].t,wh);
	}
	// Lifts wh to its highest ancestor whose len is still at least lenn.
	inline int get(int wh,int lenn){
		for(int i=lg[d[wh]];i>=0;i--)
			if(len[nxt[wh][i]]>=lenn)wh=nxt[wh][i];
		return wh;
	}
	// Resets all per-test-case arrays.
	inline void init(){
		memset(head,0,sizeof(head));esum=0;
		memset(pl,0,sizeof(pl));
		memset(d,0,sizeof(d));
		memset(len,0,sizeof(len));
		memset(bg,0,sizeof(bg));
		memset(ed,0,sizeof(ed));
		memset(nxt,0,sizeof(nxt));
		memset(id,0,sizeof(id));
	}
	// One query string: wh = tree node, op = true for the second group,
	// len = string length, id = node index in G.
	struct node{int wh,op,len,id;}a[N];
	// Orders by tree node, then length, then second-group entries first.
	// The last line must be a strict ordering: the original `return x.op`
	// made two equal second-group entries each compare less than the other,
	// which is undefined behaviour for std::sort.
	inline bool operator <(node x,node y){
		if(x.wh!=y.wh)return x.wh<y.wh;
		if(x.len!=y.len)return x.len<y.len;
		return x.op&&!y.op;
	}
	// Walks the tree. pl[wh] is the most recent second-group string on the
	// path from the root; every string on a node receives an edge from it.
	void solve(int wh){
		pl[wh]=pl[nxt[wh][0]];
		for(int i=bg[wh];i<=ed[wh];i++){
			G::add(pl[wh],a[i].id);
			if(a[i].op)pl[wh]=a[i].id;
		}
		for(int i=head[wh];i;i=e[i].nxt)solve(e[i].t);
	}
	// Reads both groups of intervals and the explicit edges, and builds G.
	void main(){
		dfs(1,0);int m,n,l,r;
		read(m);
		for(int i=1;i<=m;i++){
			read(l);read(r);int nowPl=get(id[l],r-l+1);
			G::v[i]=r-l+1;
			a[i]=(node){nowPl,false,r-l+1,i};
		}
		read(n);
		for(int i=m+1;i<=m+n;i++){
			read(l);read(r);int nowPl=get(id[l],r-l+1);
			a[i]=(node){nowPl,true,r-l+1,i};
		}
		const int total=m+n;
		sort(a+1,a+total+1);
		for(int i=1,j=0;i<=cnt;i++){
			bg[i]=j+1;
			// Stop at the last entry of this test case: a[] is not cleared
			// between cases, so a[total+1] may hold stale data.
			while(j<total&&a[j+1].wh==i)j++;
			ed[i]=j;
		}
		solve(1);
		int num;read(num);
		while(num--){
			read(l);read(r);
			G::add(l,r+m);
		}
		G::cnt=n+m;
	}
}

// Suffix automaton of the reversed input string; it feeds the suffix-link
// tree and the per-position state ids into Tree.
namespace SAM{
	struct node{int nxt[26],len,fa;}t[N],newone;
	int cnt,last;
	// Clears previously used states and resets to a lone root.
	inline void init(){
		for(int s=1;s<=cnt;++s)t[s]=newone;
		cnt=last=1;
	}
	// Appends character w to the automaton.
	inline void insert(int w){
		const int cur=++cnt;
		int up=last;
		last=cur;
		t[cur].len=t[up].len+1;
		while(up&&t[up].nxt[w]==0){
			t[up].nxt[w]=cur;
			up=t[up].fa;
		}
		if(up==0){
			t[cur].fa=1;
			return;
		}
		const int old=t[up].nxt[w];
		if(t[old].len==t[up].len+1){
			t[cur].fa=old;
			return;
		}
		const int clone=++cnt;
		t[clone]=t[old];
		t[clone].len=t[up].len+1;
		t[old].fa=clone;
		t[cur].fa=clone;
		while(up&&t[up].nxt[w]==old){
			t[up].nxt[w]=clone;
			up=t[up].fa;
		}
	}
	char w[N];int nowLen;
	// Reads the string and inserts it back to front, so Tree::id[i] is the
	// state created for the suffix starting at i. Then exports the link tree.
	void main(){
		scanf("%s",w+1);
		nowLen=strlen(w+1);
		for(int pos=nowLen;pos>=1;--pos){
			insert(w[pos]-'a');
			Tree::id[pos]=last;
		}
		for(int s=1;s<=cnt;++s){
			Tree::add(t[s].fa,s);
			Tree::len[s]=t[s].len;
		}
		Tree::cnt=G::cnt=cnt;
	}
}

// Runs one test case: reset all three modules, then run them in order.
void Main(){
	SAM::init();
	Tree::init();
	G::init();
	SAM::main();
	Tree::main();
	G::main();
}

// NOTE(review): not referenced anywhere in the visible code of this program.
char ww[N];

// Fills the lg table, then runs every test case.
signed main(){

	#ifdef feyn
	freopen("in.txt","r",stdin);
	#endif

	// lg[i] is defined from lg[i>>1]; lg[0] ends up as 1 because it is
	// computed from its own zero-initialised value.
	for(int k=0;k<N;++k){
		lg[k]=lg[k>>1]+1;
	}
	int Test;
	read(Test);
	while(Test--){
		Main();
	}

	return 0;
}

Cool Slogans

题意：给定一个串 \(S\)，希望构造一个尽可能长的序列 \(t\)，使得 \(t_i\) 是 \(S\) 的子串，并且 \(t_{i-1}\) 在 \(t_i\) 中至少出现两次，求序列长度。

首先有结论说，构造出来的合法序列 \(t\) 应该有一个性质，就是 \(t_{i-1}\) 同时是 \(t_i\) 的前缀和后缀。如果不是这样的话，去掉多余部分并不影响前者在后者中的出现次数，却会使得后面更容易让 \(t_i\) 出现更多次，所以肯定不劣。而这让我们想到了 endpos 树，因为在这棵树上有一个结论，一个节点的 endpos 集合一定包含于它祖先的，所以说一个节点祖先的串一定是该节点的串的后缀，相当于天然地满足了一个条件。所以想到可以在 endpos 树上进行 DP。

然后要用到一个结论：在 endpos 树上，若 \(p\) 是 \(q\) 的祖先，则 \(p\) 中所有字符串在 \(longest(q)\)（下面称为 \(s\)）中出现次数与出现位置相同。证明如下（摘自 iostream 博客）：

有串 \(1\)：abcb，串 \(2\)：babcb，串 \(s\)：abcbabcb。考虑反证：假设这里 \(s\) 的后缀 \(1,2\) 均为 \(p\) 节点表示的串，\(1\) 成功匹配而 \(2\) 不行。因为 \(2\)，所以显然还存在着这个串：babcbabcb（记为串 \(3\)）。又因为 \(s\) 表示的已经是 \(q\) 的最长串了，所以 \(3\) 串一定来自另一个节点。设 \(3\) 串来自另一个节点 \(w\)，\(q\) 是 \(w\) 的祖先。根据定义知 \(|R(q)| > |R(w)|\)。这样，则一定存在一个位置 \(r\)，\(r\in R(q) - R(w)\)，在这个位置只出现了 \(s\) 串而没有 \(3\) 串。这样就存在一个位置使得只出现 \(1\) 串而没有 \(2\) 串。这样得到 \(1,2\) 两串 \(R\) 集合不同，矛盾，所以不合法。

有了这个结论之后问题就变得相对容易了。既然得到的结果是一样的，那么显然会贪心地选择每个节点里最长的那个串。用 \(f_x\) 代表根到这个点最多能产生多长的合法序列，\(g_x\) 表示贡献点。思考转移。对于一个点 \(x\)，假设它父亲的转移点是 \(y\)，已知 \(y\) 中节点已经是它的后缀了，那么能否转移只取决于有没有它的前缀，如果有，说明 \(y\) 中存在一个起始点不小于 \(pl-len(x)+1\) 的点，换句话说就是存在一个终止点不小于 \(pl-len(x)+len(y)\) 的点，于是只需要在 endpos 集合中查询是否存在 \([pl-len(x)+len(y),pl-1]\) 的点，动态开点的线段树维护一下即可。

#include<bits/stdc++.h>
//#define feyn
const int N=400010;
using namespace std;
// Reads one decimal integer from stdin into wh, skipping leading non-digits;
// a '-' among the skipped characters negates the result.
// If the input ends before any digit, wh stays 0 (the original never
// returned in that case).
inline void read(int &wh){
	wh=0;
	int sign=1;
	int ch=getchar();
	for(;ch!=EOF&&(ch<'0'||ch>'9');ch=getchar()){
		if(ch=='-')sign=-1;
	}
	for(;ch>='0'&&ch<='9';ch=getchar()){
		wh=wh*10+(ch-'0');
	}
	wh*=sign;
}

// m: length read in SAM::main; also the position range of the segment trees.
int m;
// Input string, stored 1-indexed (read with scanf into w+1).
char w[N];

// Dynamically allocated segment tree over positions; index 0 means "empty".
namespace Tree{
	struct node{int left,right;}t[N*30];
	int cnt;
	// Inserts position pl into the tree rooted at wh covering [l,r] and
	// returns the root (allocated on demand).
	inline int insert(int wh,int l,int r,int pl){
		if(wh==0)wh=++cnt;
		if(l==r)return wh;
		const int half=(l+r)>>1;
		if(pl<=half)t[wh].left=insert(t[wh].left,l,half,pl);
		else t[wh].right=insert(t[wh].right,half+1,r,pl);
		return wh;
	}
	// Non-destructive union of x and y; fresh nodes are allocated wherever
	// both inputs are non-empty.
	inline int merge(int x,int y,int l,int r){
		if(x==0||y==0)return x+y;
		// Both leaves exist: return one of them. The original returned x+y,
		// which is not a meaningful node index.
		if(l==r)return x;
		const int wh=++cnt;
		const int half=(l+r)>>1;
		t[wh].left=merge(t[x].left,t[y].left,l,half);
		t[wh].right=merge(t[x].right,t[y].right,half+1,r);
		return wh;
	}
	// Reports whether any stored position lies in [wl,wr].
	inline bool work(int wh,int l,int r,int wl,int wr){
		if(wh==0)return false;
		if(wl<=l&&r<=wr)return true;
		const int half=(l+r)>>1;
		if(wl<=half&&work(t[wh].left,l,half,wl,wr))return true;
		if(wr>half&&work(t[wh].right,half+1,r,wl,wr))return true;
		return false;
	}
}

namespace SAM{
	struct node{
		int nxt[26],len,fa;
	}t[N];
	int cnt=1,last=1,rt[N],onePl[N];
	inline void insert(int w){
		int np=++cnt,p=last;last=cnt;
		t[np].len=t[p].len+1;
		onePl[np]=t[np].len;rt[np]=Tree::insert(0,1,m,t[np].len);
		for(;p&&t[p].nxt[w]==0;p=t[p].fa)t[p].nxt[w]=np;
		if(p==0)return t[np].fa=1,void();
		int q=t[p].nxt[w];
		if(t[q].len==t[p].len+1)return t[np].fa=q,void();
		int nq=++cnt;t[nq]=t[q];t[nq].len=t[p].len+1;
		t[np].fa=t[q].fa=nq;
		for(;p&&t[p].nxt[w]==q;p=t[p].fa)t[p].nxt[w]=nq;
	}
	int pl[N],id[N];
	int f[N],g[N],ans=2;
	void main(){
		read(m);scanf("%s",w+1);
		for(int i=1;i<=m;i++)insert(w[i]-'a');
		for(int i=1;i<=cnt;i++)pl[t[i].len]++;
		for(int i=1;i<=cnt;i++)pl[i]+=pl[i-1];
		for(int i=cnt;i;i--)id[pl[t[i].len]--]=i;
		for(int i=cnt;i;i--){
			int wh=id[i],fa=t[id[i]].fa;
			rt[fa]=Tree::merge(rt[fa],rt[wh],1,m);
			onePl[fa]=onePl[wh];
		}
		for(int i=1;i<=cnt;i++){
			int wh=id[i],fa=t[id[i]].fa;
			if(f[fa]==0){f[wh]=1,g[wh]=wh;continue;}
			int th=g[fa];
			if(Tree::work(rt[th],1,m,onePl[wh]-t[wh].len+t[th].len,onePl[wh]-1)){
				f[wh]=f[th]+1;g[wh]=wh;
			}
			else f[wh]=f[fa],g[wh]=g[fa];
		}
		for(int i=1;i<=cnt;i++)ans=max(ans,f[i]);
		printf("%d\n",ans-1);
	}
}

// Entry point: all of the work happens in SAM::main.
signed main(){
	SAM::main();

	return 0;
}

广义 SAM

广义 SAM 是构建在 Trie 树上的后缀自动机，可以方便地处理多串的匹配问题。我采用的是离线之后在 Trie 树上 bfs 进行构造，感觉也没啥问题啊。

P6139 【模板】广义后缀自动机（广义 SAM）

按照板子把广义后缀自动机建出来之后，累加 \(len_x-len_{fa}\) 即可。

LCS2 - Longest Common Substring

相似地，用 \(f_{x,i}\) 代表节点 \(x\) 在串 \(i\) 意义下的 endpos 集合大小，要计算的时候只需要找所有 \(f\) 都不为零的点，并用 \(len\) 更新即可。

LONGCS - Longest Common Substring

上面那道题的多测版本。

P4081 Standing Out from the Herd P

还是先把广义 SAM 建出来，然后找到那些只被一个串覆盖的点，累加求答案即可。

Good Substrings

先建广义 SAM，可以套路地求出每个点在每个串意义下的出现次数，然后找到合法的即可。

Forensic Examination

题意：给你一个串 \(S\) 以及一个字符串数组 \(T_{1\ldots m}\)，\(q\) 次询问，每次问 \(S\) 的子串 \(S[p_l\ldots p_r]\) 在 \(T_{l\ldots r}\) 中的哪个串里的出现次数最多，并输出出现次数。如有多解输出最靠前的那一个。

SAM 各种常用技巧结合版。首先对 \(S\) 和 \(T_i\) 一并建出 GSAM，动态开点线段树维护每个节点对应的子串在每个 \(T_i\) 中出现的次数（以 \(T\) 为下标建树），即线段树 \(Tree_p\) 的位置 \(i\) 上记录着节点 \(p\) 所对应的所有串在 \(T_i\) 中的出现次数。由于题目还需求最小编号，所以线段树维护区间最大出现次数以及对应最小编号。

使用线段树合并，预处理 endpos 的倍增数组以快速定位子串，单次询问只需把 \(ed_{pr}\) 倍增到 \(s[pl,pr]\) 的对应状态 \(x\)，查询 \(Tree_x\) 上 \([l,r]\) 的信息即可。时空复杂度均为线性对数。

#include<bits/stdc++.h>
//#define feyn
const int N=1200010;
using namespace std;
// Reads one decimal integer from stdin into wh. Non-digit characters before
// the number are skipped, and a '-' among them negates the result.
// On end of input before any digit, wh is left as 0 instead of looping
// forever as the original did.
inline void read(int &wh){
	int ch=getchar();
	bool negative=false;
	while(ch!=EOF&&!(ch>='0'&&ch<='9')){
		if(ch=='-')negative=true;
		ch=getchar();
	}
	wh=0;
	while(ch>='0'&&ch<='9'){
		wh=wh*10+(ch-'0');
		ch=getchar();
	}
	if(negative)wh=-wh;
}

// (index, count) pair stored in the segment tree.
struct nd{int pl,data;};
// Combines two pairs: equal indices add their counts; otherwise the larger
// count wins, and on equal counts the smaller index wins.
inline nd operator +(nd s1,nd s2){
	if(s1.pl==s2.pl){
		s1.data+=s2.data;
		return s1;
	}
	if(s1.data!=s2.data){
		return s1.data>s2.data?s1:s2;
	}
	return s1.pl<s2.pl?s1:s2;
}

namespace Tree{
	#define lc t[wh].left
	#define rc t[wh].right
	#define mid ((l+r)>>1)
	struct node{int left,right;nd data;}t[N*20];int cnt;
	inline int insert(int wh,int l,int r,int pl,int data){
		if(wh==0)wh=++cnt;
		if(l==r)return t[wh].data=t[wh].data+(nd){pl,data},wh;
		if(pl<=mid)lc=insert(lc,l,mid,pl,data);
		else rc=insert(rc,mid+1,r,pl,data);
		t[wh].data=t[lc].data+t[rc].data;
		return wh;
	}
	inline int merge(int x,int y,int l,int r){
		if(x==0||y==0)return x+y;int wh=++cnt;
		if(l==r)return t[wh].data=t[x].data+t[y].data,wh;
		lc=merge(t[x].left,t[y].left,l,mid);
		rc=merge(t[x].right,t[y].right,mid+1,r);
		return t[wh].data=t[lc].data+t[rc].data,wh;
	}
	inline nd find(int wh,int l,int r,int wl,int wr){
		if(!wh)return (nd){0,0};
		if(wl<=l&&r<=wr)return t[wh].data;
		if(wl<=mid&&wr>mid)return find(lc,l,mid,wl,wr)+find(rc,mid+1,r,wl,wr);
		if(wl<=mid)return find(lc,l,mid,wl,wr);
		else return find(rc,mid+1,r,wl,wr);
	}
	#undef lc
	#undef rc
	#undef mid
}

namespace SAM{
	struct node{int len,fa,nxt[26];}t[N];
	int cnt=1;
	inline int insertChar(int wh,int w){
		if(t[wh].nxt[w])return t[wh].nxt[w];
		else return t[wh].nxt[w]=++cnt;
	}
	char w[N],ww[N];int num,rt[N],ed[N];
	inline void insertString(int id){
		scanf("%s",w+1);int wh=1;
		if(id==0)for(int i=1;w[i];i++)ww[i]=w[i];
		for(int i=1;w[i];i++){
			wh=insertChar(wh,w[i]-'a');
			if(id){
				rt[wh]=Tree::insert(rt[wh],1,num,id,1);
			}
			else ed[i]=wh;
		}
	}
	inline int insert(int last,int w){
		int np=t[last].nxt[w];
		t[np].len=t[last].len+1;
		int p=t[last].fa;
		for(;p&&t[p].nxt[w]==0;p=t[p].fa)t[p].nxt[w]=np;
		if(p==0)return t[np].fa=1,np;
		int q=t[p].nxt[w];
		if(t[q].len==t[p].len+1)return t[np].fa=q,np;
		int nq=++cnt;t[nq]=t[q];t[nq].len=t[p].len+1;
		for(int i=0;i<26;i++)t[nq].nxt[i]=t[t[q].nxt[i]].len?t[q].nxt[i]:0;
		t[np].fa=t[q].fa=nq;
		for(;p&&t[p].nxt[w]==q;p=t[p].fa)t[p].nxt[w]=nq;
		return np;
	}
	queue<pair<int,int>>q;
	void build(){
		for(int i=0;i<26;i++)
			if(t[1].nxt[i])q.push(make_pair(1,i));
		while(!q.empty()){
			auto now=q.front();q.pop();
			int wh=insert(now.first,now.second);
			for(int i=0;i<26;i++)
				if(t[wh].nxt[i])q.push(make_pair(wh,i));
		}
	}
	int nxt[N][22],pl[N],id[N];
	inline int find(int wh,int len){
		for(int i=21;i>=0;i--)
			if(t[nxt[wh][i]].len>=len)wh=nxt[wh][i];
		return wh;
	}
	void main(){
		insertString(0);
		read(num);
		for(int i=1;i<=num;i++)insertString(i);
		build();
		for(int i=1;i<=cnt;i++)pl[t[i].len]++;
		for(int i=1;i<=cnt;i++)pl[i]+=pl[i-1];
		for(int i=cnt;i;i--)id[pl[t[i].len]--]=i;
		for(int i=cnt;i;i--){
			int wh=id[i];int fa=t[wh].fa;
			rt[fa]=Tree::merge(rt[fa],rt[wh],1,num);
		}
		for(int i=1;i<=cnt;i++){
			int wh=id[i];int fa=t[wh].fa;
			nxt[wh][0]=fa;
			for(int i=1;i<=21;i++)nxt[wh][i]=nxt[nxt[wh][i-1]][i-1];
		}
		
		int q;read(q);int ql,qr,wl,wr;
		while(q--){
			read(wl);read(wr);read(ql);read(qr);
			int wh=find(ed[qr],qr-ql+1);
			nd an=Tree::find(rt[wh],1,num,wl,wr);
			if(an.data==0)printf("%d 0\n",wl);
			else printf("%d %d\n",an.pl,an.data);
		}
	}
}

// Entry point: optionally redirects stdin for local runs, then runs SAM::main.
signed main(){

	#ifdef feyn
	freopen("in.txt","r",stdin);
	#endif

	SAM::main();

	return 0;
}

posted @ 2022-12-09 10:05 Feynn 阅读(56) 评论(0) 编辑收藏举报

刷新页面返回顶部

SAM

review

板子

一些技巧

板子部分

应用

广义 SAM

公告