[学习笔记] hash & kmp & Trie树 - 字符串

Hash

Oulipo

没啥好说的，进制hash板子。

#include<bits/stdc++.h>
using namespace std;
#define ull unsigned long long
const int N = 1e6 + 1;
int n, lw, lt, ans;
ull hw, ht[N], p[N];
// Hash of the text characters at 1-based positions l+1..r, derived from the
// prefix hashes stored in ht. Unsigned arithmetic wraps around on overflow.
inline ull gethash(int l, int r){
	const ull shiftedPrefix = ht[l] * p[r - l];
	return ht[r] - shiftedPrefix;
}
// Writes k in decimal with putchar, most significant digit first.
// Digits are produced least-significant first, buffered, then emitted in reverse.
inline void wt(int k){
	char digits[12];
	int used = 0;
	do{
		digits[used++] = k % 10 + '0';
		k /= 10;
	}while(k);
	while(used) putchar(digits[--used]);
}
// Reads the number of test cases, then for each case a pattern and a text
// (both runs of 'A'..'Z'), and prints how many windows of the text have the
// same polynomial hash as the pattern.
int main(){
	// p[i] = 131^i. p holds N entries, so N-1 is the last valid index.
	p[0] = 1;
	for(int i=1; i<N; ++i) p[i] = p[i-1] * 131;
	// Keep the character in an int so that EOF stays distinguishable.
	int ch = getchar();
	while(isdigit(ch)) n = n * 10 + (ch - '0'), ch = getchar();
	while(n--){
		ans = hw = lt = lw = 0;
		ch = getchar();
		// Skip separators, then fold the pattern into hw while counting its length.
		while(ch != EOF && (ch < 'A' || ch > 'Z')) ch = getchar();
		while(ch >= 'A' && ch <= 'Z') hw = hw * p[1] + ch, ++lw, ch = getchar();
		while(ch != EOF && (ch < 'A' || ch > 'Z')) ch = getchar();
		// Extend the text prefix hashes one character at a time; once at least lw
		// characters were read, compare the last lw of them with the pattern.
		while(ch >= 'A' && ch <= 'Z'){
			++lt;
			ht[lt] = ht[lt-1] * p[1] + ch;
			ch = getchar();
			if(lt >= lw && gethash(lt-lw, lt) == hw) ++ans;
		}
		wt(ans), putchar('\n');
	}
	return 0;
}

[USACO17OPEN] Bovine Genomics S

简单来说，存在某一区间使得任意A串与B串都不相等，现在要找这一区间的最小值。进制哈希之后，可以暴力判断某一长度区间是否合法。再加上区间长度和其合法性是有单调关系的(随着区间长度越小，就越不可能合法)，所以再加上二分即可

#include<bits/stdc++.h>
using namespace std;
#define ull unsigned long long
int n, m, ans;
ull ha[501][501], hb[501][501], p[501];
string A[501], B[501];
// Fills h[1..m] with the prefix hashes of the first m characters of str
// (base p[1], letters mapped so that 'A' contributes 0). h[0] is read as the
// starting value and is not written.
inline void Hash(string str, ull *h){
	int pos = 0;
	while(pos < m){
		h[pos+1] = h[pos] * p[1] + str[pos] - 'A';
		++pos;
	}
}
// Hash of the characters at 1-based positions l+1..r of the string whose
// prefix hashes are stored in h.
inline ull gethash(int l, int r, ull *h){
	const ull shiftedPrefix = h[l] * p[r-l];
	return h[r] - shiftedPrefix;
}
inline bool check(int len){ //暴力判断 
	for(int k=len; k<=m; ++k){
		bool data = 0;
		for(int i=1; i<=n; ++i)
		for(int j=1; j<=n; ++j)
			if(gethash(k-len, k, ha[i]) == gethash(k-len, k, hb[j])) data = 1;
		if(!data) return 1;
	} return 0;
}
// Reads n and m, then n strings for group A and n strings for group B,
// hashes every row, and binary-searches the smallest length accepted by check.
int main(){
	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);
	// p[i] = 131^i for every index the 501-entry table can hold.
	p[0] = 1;
	for(int e = 1; e <= 500; ++e) p[e] = p[e-1] * 131;
	cin>>n>>m;
	for(int row = 1; row <= n; ++row){
		cin>>A[row];
		Hash(A[row], ha[row]);
	}
	for(int row = 1; row <= n; ++row){
		cin>>B[row];
		Hash(B[row], hb[row]);
	}
	// Invariant: the answer lies in [lo, hi]; shrink until one candidate is left.
	int lo = 1, hi = m;
	for(; lo < hi; ){
		const int mid = (lo + hi) >> 1;
		if(check(mid)) hi = mid;
		else lo = mid + 1;
	}
	cout<<lo;
	return 0;
}

P4591 [TJOI2018] 碱基序列

hash + 分组背包

dp[i][j] 表示第 \(i\) 组氨基酸放到蛋白质的第 \(j\) 个位置的方案数。同样是每一组只取一种物品放入背包，就可以类比于分组背包。len_z表示第 \(z\) 个氨基酸的长度，于是有转移方程(转移时记得判断是否能配对上)：

\[dp[i][j]=\sum\limits_{z=1}^{z=a}dp[i-1][j-len_z] \]

#include<bits/stdc++.h>
using namespace std;
#define ull unsigned long long
int n, num[101], ls, lp[101][11], dp[10001], mod = 1e9 + 7, ans;
string s;
ull h[101][11], hs[10001], p[10001];
// Hash of s at 1-based positions l+1..r, computed from the prefix hashes hs.
inline ull gethash(int l, int r){
	const ull shiftedPrefix = hs[l] * p[r-l];
	return hs[r] - shiftedPrefix;
}
// Reads the number of groups n and the target string s, then for each group a
// count followed by that many pieces. Counts (mod 1e9+7) the ways to pick one
// piece per group so that the concatenation occurs as a substring of s.
int main(){
	ios::sync_with_stdio(0), cin.tie(0), cout.tie(0);
	cin>>n>>s;
	// Fill the whole power table. gethash indexes p by piece length, which is
	// unrelated to n, so stopping at n would leave p[len] == 0 for longer pieces.
	const int pcap = sizeof(p) / sizeof(p[0]);
	p[0] = 1; for(int i=1; i<pcap; ++i) p[i] = p[i-1] * 131;
	ls = s.size();
	for(int i=0; i<ls; ++i) hs[i+1] = hs[i] * p[1] + s[i];
	for(int i=1; i<=n; ++i){
		cin>>num[i];
		string str;
		for(int j=1; j<=num[i]; ++j){
			cin>>str;
			lp[i][j] = str.size();
			// h[i][j] = hash of the j-th piece of group i, same base as hs.
			for(int z=0; z<lp[i][j]; ++z) h[i][j] = h[i][j] * p[1] + str[z];
		}
	}
	// Layer 0: an empty selection may end at any position, including position 0.
	for(int i=0; i<=ls; ++i) dp[i] = 1;
	for(int i=1; i<=n; ++i){
		// Descending k: dp[k - len] still holds the previous layer when it is read.
		// Position 0 must be overwritten too, otherwise the layer-0 value leaks in.
		for(int k=ls; k>=0; --k){
			dp[k] = 0;
			for(int j=1; j<=num[i]; ++j)
				if(k >= lp[i][j] && gethash(k-lp[i][j], k) == h[i][j]) dp[k] = (dp[k] + dp[k - lp[i][j]]) % mod;
			if(i == n) ans = (ans + dp[k]) % mod;
		}
	}
	cout<<ans;
	return 0;
}

[CQOI2014] 通配符匹配

#include<bits/stdc++.h>
using namespace std;
#define ull unsigned long long
#define bl (gethash(j-id[i]+id[i-1], j-1, hp) == gethash(id[i-1], id[i]-1, hs))
const int N = 1e5 + 5;
ull hs[N], p[N], hp[N];
string s, st;
int ls, id[15], n, len, cnt, dp[15][N];
// Hash of the characters at 1-based positions l+1..r of the string whose
// prefix hashes are stored in h.
inline ull gethash(int l, int r, ull *h){
	const ull shiftedPrefix = h[l] * p[r-l];
	return h[r] - shiftedPrefix;
}
// Reads a pattern that may contain the wildcards '?' and '*', then n candidate
// strings, and prints YES or NO for each candidate depending on whether the DP
// below leaves dp[cnt][len] non-zero.
int main(){
	ios::sync_with_stdio(0), cin.tie(0), cout.tie(0);
	// p[i] = 131^i with unsigned wrap-around; these are the weights gethash uses.
	p[0] = 1; for(int i=1; i<N; ++i) p[i] = p[i-1] * (ull)131;
	// A '?' is appended so the pattern always ends with a wildcard.
	cin>>s; s += '?';
	ls = s.size();
	for(int i=1; i<=ls; ++i){
		// hs[i] = prefix hash of the first i pattern characters.
		hs[i] = hs[i-1] * p[1] + s[i-1];
		// id[1..cnt] = 1-based positions of the wildcards; id[0] stays 0.
		// NOTE(review): id has 15 slots, so this assumes few wildcards - confirm against the problem limits.
		if(s[i-1] == '?' || s[i-1] == '*') id[++cnt] = i;
	}
	cin>>n;
	while(n--){
		memset(dp, 0, sizeof dp);
		// An 'a' is appended to the candidate, giving the appended '?' a character to consume.
		cin>>st; st += 'a';
		len = st.size();
		// hp[i] = prefix hash of the first i candidate characters.
		for(int i=1; i<=len; ++i) hp[i] = hp[i-1] * p[1] + st[i-1];
		// Base state: zero wildcards processed, zero characters consumed.
		dp[0][0] = 1;
		// dp[i][j] != 0 presumably means "the pattern up to and including wildcard i
		// can match the first j candidate characters"; the value 2 marks states
		// reached through a '*'. j starts at the length of segment i (literal + wildcard).
		for(int i=1; i<=cnt; ++i) for(int j=id[i]-id[i-1]; j<=len; ++j){
				// A '*' that already matched up to j-1 can absorb one more character.
				if(dp[i][j-1] == 2){ dp[i][j] = 2; continue; }
				// Need a reachable state before this segment, and (macro bl) equal hashes
				// between the literal part of the segment and the candidate text before position j.
				if(!dp[i-1][j - id[i] + id[i-1]] || !bl) continue;
				// '?' consumes exactly the character at position j.
				if(s[id[i]-1] == '?') dp[i][j] = 1;
				// '*' may consume character j or nothing at all, hence j-1 is marked as well.
				else dp[i][j] = dp[i][j-1] = 2;
		}
		if(dp[cnt][len]) cout<<"YES\n";
		else cout<<"NO\n";
	} return 0;
}

[NOI2017] 蚯蚓排队

上面的内容都只是用hash来映射字符串，但如果要统计哈希值(或数字等其他数据类型)对应的出现次数，就需要用到哈希表(可参考文章《5分钟搞定哈希表》)。哈希表要比map快得多。

对于本题来说，需要完成以下三个操作：

合并队伍：这个很好实现，直接用链表模拟即可。用两个数组分别表示某一个蚯蚓的前驱和后继。我们注意到，数据范围中 \(k\) 非常小，不妨考虑暴力统计每一次合并时新产生的字符串。可以将新产生的字符串映射成哈希，接着用哈希表统计该字符串的数量即可。
分离队伍：同样用链表实现，统计每次分开时消失的字符串，并在哈希表中将数量-1即可。
统计个数：将字符串 \(s\) 映射成哈希，并在哈希表里统计个数即可。

一般来说，在哈希表中链表越长，搜索效率越差。所以为了提高效率，哈希表开成尽量大的素数。我用的 10000019。

#include<bits/stdc++.h>
using namespace std;
#define ull unsigned long long
#define ll long long
const int N = 2e5 + 2, hmod = 10000019, hmax = 1e7 + 5;
const ll mod = 998244353;
int n, m, dt, a, b, nxt[N], lst[N], k;
ull p[N], hs[N], f[N];
string s;
// Chained hash table mapping a 64-bit hash value to a signed counter.
// Entries live in parallel arrays indexed 1..cnt; index 0 terminates a chain.
// head[b] is the first entry of bucket b, nxt[i] the next entry in that bucket.
// There is no capacity check: at most hmax-1 distinct keys can be stored.
struct Hash_table{
	int cnt, head[hmod+1], nxt[hmax]; ull val[hmax]; ll num[hmax];
	// Adds v to the counter of key h, creating the entry when it is missing.
	// A new entry starts at v, so a first-time add with a negative v is recorded
	// as such instead of being stored as +1.
	inline void add(ull h, int v){
		const int tb = h % hmod;
		for(int i=head[tb]; i; i=nxt[i])
			if(val[i] == h){ num[i] += v; return; }
		++cnt;
		val[cnt] = h, num[cnt] = v, nxt[cnt] = head[tb], head[tb] = cnt;
	}
	// Returns the counter stored for key h, or 0 when the key was never added.
	inline ll query(ull h){
		const int tb = h % hmod;
		for(int i=head[tb]; i; i=nxt[i])
			if(val[i] == h) return num[i];
		return 0ll;
	}
} Hash;
// Reads the next non-negative decimal integer from stdin, skipping any leading
// non-digit characters. The character that ends the number is consumed.
// Returns 0 when the input ends before a digit is found.
inline int rd(){
	// int (not char) keeps EOF distinguishable and is a valid isdigit argument.
	int ch = getchar();
	while(ch != EOF && !isdigit(ch)) ch = getchar();
	int x = 0;
	while(isdigit(ch)) x = x * 10 + (ch - '0'), ch = getchar();
	return x;
}
// Reads characters from stdin up to (and consuming) the next space and returns
// them as a string. End of input also terminates the token, so a missing
// trailing space cannot make the loop spin forever.
inline string rds(){
	string str;
	for(int ch = getchar(); ch != EOF && ch != ' '; ch = getchar())
		str += static_cast<char>(ch);
	return str;
}
// Writes k in decimal with putchar, most significant digit first.
// Digits are produced least-significant first, buffered, then emitted in reverse.
inline void wt(ll k){
	char digits[24];
	int used = 0;
	do{
		digits[used++] = k % 10 + '0';
		k /= 10;
	}while(k);
	while(used) putchar(digits[--used]);
}
// Hash of the characters at 1-based positions l+1..r, from the prefix hashes hs.
inline ull gethash(int l, int r){
	const ull shiftedPrefix = hs[l] * p[r-l];
	return hs[r] - shiftedPrefix;
}
inline void add(int a, int b){
	nxt[a] = b, lst[b] = a;
	ull nhash = 0, mhash;
	for(int i=a, la=1; i&&la<=50; i=lst[i], ++la){
		nhash += f[i]*p[la-1], mhash = nhash;
		for(int j=b, lb=la+1; j&&lb<=50; j=nxt[j], ++lb){
			mhash = mhash * p[1] + f[j];
			Hash.add(mhash, 1);
		}
	}
}
inline void divide(int a){
	int b = nxt[a]; ull nhash = 0, mhash;
	for(int i=a, la=1; i&&la<=50; i=lst[i], ++la){
		nhash += f[i]*p[la-1], mhash = nhash;
		for(int j=b, lb=la+1; j&&lb<=50; j=nxt[j], ++lb){
			mhash = mhash * p[1] + f[j];
			Hash.add(mhash, -1);
		}
	}
	nxt[a] = lst[b] = 0;
}
// Returns, modulo mod, the product over every length-k window of the global
// string s of the counter stored in Hash for that window's hash.
// Assumes 1 <= k < N so that p[k] is a valid power; when k exceeds the length
// of s there is no window and the result is 1.
// The window hash is maintained incrementally, so the routine needs no prefix
// array and cannot overrun a fixed-size buffer when s is long.
inline ll query(int k){
	const int len = s.size();
	ll ans = 1;
	ull window = 0;
	for(int i=1; i<=len; ++i){
		// Shift in character i ...
		window = window * p[1] + s[i-1] - '0';
		// ... and drop character i-k once the window would exceed k characters.
		if(i > k) window -= (ull)(s[i-k-1] - '0') * p[k];
		if(i >= k) ans = ans * (Hash.query(window) % mod) % mod;
	}
	return ans;
}
// Reads n and m, the n initial values (each registered as a length-1 string),
// then processes up to m operations: 1 = link a before b, 2 = cut after a,
// 3 = query string s with window length k. An operation code of 0 stops early.
int main(){
	n=rd(), m=rd();
	// p[i] = 131^i. p holds N entries, so N-1 is the last valid index.
	p[0] = 1;
	for(int i=1; i<N; ++i) p[i] = p[i-1] * 131;
	for(int i=1; i<=n; ++i) f[i] = rd(), Hash.add(f[i], 1);
	while(m-- && (dt=rd())){
		switch(dt){
			case 1: a=rd(), b=rd(), add(a, b); break;
			case 2: a=rd(), divide(a); break;
			case 3: s=rds(), k=rd(), wt(query(k)), putchar('\n'); break;
			default: break;
		}
	}
	return 0;
}

kmp

[POJ 2752] Seek the Name, Seek the Fame

这道题纯纯是考Next数组的。对于一个串来说，它的第一长公共前后缀肯定是他自己，第二就是 \(Next[s.len]\) ，第三就是 \(Next[Next[s.len]]\) 以此类推。在这道题里仅需递推求出所有Next值即可。

#include<bits/stdc++.h>
using namespace std;
const int N = 4e5 + 1; string s; int Next[N];
// Prints k and every value on the chain k, Next[k], Next[Next[k]], ... in
// increasing order, each followed by a space. Nothing is printed for k == 0.
// Iterative on purpose: the chain can be as long as the input string, which
// would mean one stack frame per element with a recursive version.
inline void getans(int k){
	vector<int> chain;
	for(int cur = k; cur != 0; cur = Next[cur]) chain.push_back(cur);
	for(int idx = (int)chain.size() - 1; idx >= 0; --idx) cout<<chain[idx]<<' ';
}
// Builds the KMP failure table for s (Next[i] = length of the longest proper
// border of the prefix of length i) and prints every border length of the
// whole string in increasing order, followed by the string length itself.
inline void getnext(){
	const int len = s.size();
	Next[0] = 0;
	Next[1] = 0;
	int j = 0;
	int i = 1;
	while(i < len){
		// Fall back along the border chain until s[i] can extend a border.
		while(j != 0 && s[i] != s[j]) j = Next[j];
		Next[i+1] = (s[i] == s[j]) ? j + 1 : 0;
		++i;
		j = Next[i];
	}
	// Here j == Next[len] (or 0 for a string shorter than two characters).
	getans(j);
	cout<<len<<'\n';
}
// Processes whitespace-separated strings from stdin until input is exhausted,
// printing one line of border lengths per string.
int main(){
	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);
	for(; cin>>s; ) getnext();
	return 0;
}

[NOI2014] 动物园

这道题是上道题的进阶版本，加深了我对kmp求next数组的思想运用。我们知道，Next[i]是一个字符串里最长前缀后缀的长度，计算num数组其实只需要判断最长前后缀的长度和原字符串的长度即可。思路简单，但问题在于复杂度。这道题的思想和kmp思想是一样的，使用双指针，固定 \(i\) ，活动 \(j\)，始终让 \(j\) 保持在 \(i\) 的一半以前，这样复杂度就为 \(O(n)\)。

#include<bits/stdc++.h>
using namespace std;
#define ll long long
const int N = 1e7 + 5, mod = 1e9 + 7;
int n, len, Next[N], j, num[N];
string s; ll ans;
inline void getnext(){
	Next[0] = Next[1] = 0; j = 0; ans = 1;
	for(int i=1; i<len; ++i, j=Next[i]){
		num[i] = num[j] + 1;
		while(j && s[i] != s[j]) j = Next[j];
		Next[i+1] = j + (s[i] == s[j]);
	}
	num[0] = 0; j = 0;
	for(int i=1; i<len; ++i){
		while(j && s[i] != s[j]) j = Next[j];
		if(s[i] == s[j]) ++j;
		while((j<<1) > i+1) j = Next[j];
		ans = (__int128)ans * (num[j] + 1) % mod;
	}
}
// Reads the number of test cases and, for each, one string; prints the value
// computed by getnext on its own line.
int main(){
	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);
	cin>>n;
	while(n--){
		cin>>s;
		len = s.size();
		ans = 1;
		getnext();
		cout<<ans<<'\n';
	}
	return 0;
}

[USACO15FEB] Censoring S

算是个kmp板子吧。在板子的基础上，用栈来模拟删除后的字符串，如果可以删除，那就退栈。再建立一个数组存还没匹配完的 \(j\) 的值。

#include<bits/stdc++.h>
using namespace std;
const int N = 1e6 + 5;
string s, t;
int ls, lt, Next[N], last[N], p[N];
// Reads text s and pattern t, repeatedly removes occurrences of t as they
// appear while scanning s from left to right, and prints what remains.
int main(){
	ios::sync_with_stdio(0), cin.tie(0), cout.tie(0);
	cin>>s>>t; ls = s.size(), lt = t.size();
	// KMP failure table of the pattern t.
	for(int i=1, j=0; i<lt; ++i, j=Next[i]){
		while(j && t[i] != t[j]) j = Next[j];
		Next[i+1] = j + (t[i] == t[j]);
	}
	// p[1..p[0]] is a stack of the indices of s that are still kept; p[0] is its size.
	// last[i] remembers the matched length right after s[i] was processed.
	for(int i=0, j=0; i<ls; ++i){
		while(j && s[i] != t[j]) j = Next[j];
		if(s[i] == t[j]) ++j;
		last[i] = j;
		++p[0];
		p[p[0]] = i;
		if(j == lt){
			// Pop the matched occurrence and resume from the state of the new top.
			// With an empty stack nothing is matched yet, so the state is 0; reading
			// last[p[0]] there would pick up the unrelated state stored for s[0].
			p[0] -= lt;
			j = p[0] ? last[p[p[0]]] : 0;
		}
	}
	for(int i=1; i<=p[0]; ++i) cout<<s[p[i]];
	return 0;
}

[POI2006] OKR-Periods of Words

读完题目后可以发现，其实求的就是字符串 \(S\) 的前缀的最短公共前后缀。所以只需要在求Next数组时添加3行代码即可求出。

#include<bits/stdc++.h>
using namespace std;
const int N = 1e6 + 5;
int k, Next[N], j, p;
long long ans;
string s;
// Reads the length k and the string s. For every prefix it walks the border
// chain down to the shortest non-empty border p and adds (prefix length - p)
// to the answer. The chain is re-walked for each prefix.
int main(){
    ios::sync_with_stdio(0);
    cin.tie(0);
    cout.tie(0);
    cin>>k>>s;
    int i = 1;
    while(i < k){
        while(j != 0 && s[i] != s[j]) j = Next[j];
        if(s[i] == s[j]) ++j;
        Next[i+1] = j;
        // Follow the chain until the next step would reach 0.
        for(p = j; Next[p] != 0; p = Next[p]);
        if(p != 0) ans += i + 1 - p;
        ++i;
        j = Next[i];
    }
    cout<<ans;
    return 0;
}

TLE73。不难发现，当字符串长的时候，会产生许多无用回溯。那么只需要记录下每一个回溯的值，在新的回溯时使用已有的结果，这样就能避免重复回溯。

#include<bits/stdc++.h>
using namespace std;
const int N = 1e6 + 5;
int k, Next[N], j, p, Last[N];
long long ans;
string s;
// Same computation as the chain-walking version, but Last[x] caches the
// smallest non-zero value on the chain x, Next[x], Next[Next[x]], ..., so each
// prefix is handled in constant time after its border length j is known.
int main(){
    ios::sync_with_stdio(0);
    cin.tie(0);
    cout.tie(0);
    cin>>k>>s;
    int i = 1;
    while(i < k){
        while(j != 0 && s[i] != s[j]) j = Next[j];
        if(s[i] == s[j]) ++j;
        Next[i+1] = j;
        // Reuse the cached end of the chain when j has a shorter border.
        Last[j] = (Next[j] != 0) ? Last[Next[j]] : j;
        if(Last[j] != 0) ans += i + 1 - Last[j];
        ++i;
        j = Next[i];
    }
    cout<<ans;
    return 0;
}

Trie 树

「POJ 3630」Phone List

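Trie树板子。对于静态Trie树，每插入一个字符最多新建一个结点，所以结点数不会超过"所有字符串的总长度 + 1(根结点)"，数组按这个上界开即可，不需要看运气。只有把它想象成一棵满的 \(|\Sigma|\) 叉树时，结点数才会达到 \(\sum_{i=0}^{L}|\Sigma|^{i}\)（\(L\) 为最长字符串长度，\(|\Sigma|\) 为字符集大小），那样就太大了。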

#include<bits/stdc++.h>
using namespace std;
const int N = 1e6 + 1;
int T, n, cnt;
bool opt;
// One trie node. The pool t is indexed by node id, with id 0 acting as the root.
struct node{
	bool repeat;   // not read or written by the code shown in this program
	bool end;      // set when an inserted string ends at this node
	int s[10];     // child id per digit 0..9, 0 when the child does not exist
	int num;       // number of children created below this node
}t[N];
string s;
// Inserts the digit string s into the trie and sets the global flag opt when
// a prefix relation with an earlier string is detected: either the walk passes
// through a node where an earlier string ended, or s ends at a node that
// already has children.
inline void insert(string s){
	const int total = s.size();
	int cur = 0;
	for(int pos = 0; pos < total; ++pos){
		const int digit = s[pos] - '0';
		int &child = t[cur].s[digit];
		if(child == 0){
			child = ++cnt;
			t[cur].num += 1;
		}
		cur = child;
		if(t[cur].end) opt = 1;
		if(pos + 1 == total){
			if(t[cur].num != 0) opt = 1;
			t[cur].end = 1;
		}
	}
}
// For each of T test cases: clears the trie, inserts n strings, and prints NO
// when insert flagged a prefix relation, YES otherwise.
int main(){
	ios::sync_with_stdio(0);
	cin.tie(0);
	cout.tie(0);
	cin>>T;
	while(T--){
		cin>>n;
		memset(t, 0, sizeof t);
		cnt = 0;
		opt = 0;
		for(int left = n; left >= 1; --left){
			cin>>s;
			insert(s);
		}
		cout<<(opt ? "NO\n" : "YES\n");
	}
	return 0;
}

The XOR Largest Pair

首先，两个数的异或值最大不超过二进制位数最长的那个数的二进制位数，其次需要让其中一个数的取反与另一个数相似。这样就可以保证异或和最大。根据贪心思想，需要从最高位开始搜索，所以在存入字典树时也要从最高位开始存。

#include<bits/stdc++.h>
using namespace std;
const int N = 1e5 + 1;
int n, mx, cnt, tot, ans, k;
bitset<32> num[N], nump[N];
// Binary trie node: s[b] is the id of the child reached by bit b, 0 when absent.
// insert walks 31 levels (bits 30..0) and may create one node per level, so the
// pool holds 31 nodes for each of up to N numbers plus the root at index 0.
struct node{ int s[2]; }t[N * 31 + 1];
// Inserts num[k] into the binary trie, walking bits 30 down to 0 from the root
// and creating missing children on the way.
inline void insert(int k){
    int cur = 0;
    int level = 30;
    while(level >= 0){
        const int b = num[k][level];
        int &slot = t[cur].s[b];
        if(slot == 0) slot = ++cnt;
        cur = slot;
        --level;
    }
}
// Greedy walk for num[k]: at each bit from 30 down to 0 prefer the child with
// the opposite bit. Bit i of nump[p] is set to 1 when the opposite child was
// taken and to 0 when the same-bit child was taken; the walk stops early if
// neither child exists. Returns nump[p] converted to int.
inline int search(int k, int p){
    int cur = 0;
    int level = 30;
    while(level >= 0){
        const int want = !(num[k][level]);
        if(t[cur].s[want]){
            cur = t[cur].s[want];
            nump[p][level] = 1;
        }else if(t[cur].s[!want]){
            cur = t[cur].s[!want];
            nump[p][level] = 0;
        }else{
            break;
        }
        --level;
    }
    return (int)nump[p].to_ulong();
}
// Reads n numbers, inserts all of them into the binary trie, then queries the
// trie once per number and prints the largest value returned by search.
int main(){
    ios::sync_with_stdio(0), cin.tie(0), cout.tie(0);
    cin>>n;
    for(int i=1; i<=n; ++i){
        cin>>k, num[i] = k;
        insert(i);
    }
    // Every number is used as a query. bitset<32>::size() is the fixed template
    // width (32), so it cannot serve as a "bit length" filter and is not used.
    for(int i=1; i<=n; ++i){
        // search overwrites bits 30..0 of nump[tot]; the copy seeds the remaining bit.
        nump[++tot] = num[i];
        ans = max(ans, search(i, tot));
    }
    cout<<ans;
    return 0;
}

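这道题非常的巧妙。设 \(dis[x]\) 为根结点 \(1\) 到 \(x\) 的路径上所有边权的异或和，先用一次 dfs 求出所有的 \(dis\) 值。记 \(l=lca(x,y)\)，\(path(u,v)\) 表示 \(u\) 到 \(v\) 路径上边权的异或和，则 \(dis[x]=dis[l]\oplus path(l,x)\)，\(dis[y]=dis[l]\oplus path(l,y)\)，于是 \(dis[x]\oplus dis[y]=dis[l]\oplus dis[l]\oplus path(l,x)\oplus path(l,y)=path(x,y)\)（同一个数异或两次会抵消）。由此，这道题就和上道题一样了：在所有 \(dis\) 值中找出异或值最大的一对。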

#include<bits/stdc++.h>
using namespace std;
const int N = 1e5 + 1;
// t is the binary trie over the dis values: t[x][b] is the child of node x for
// bit b, 0 when absent. insert may create one node per level (31 levels) for
// each of up to N vertices, so the pool holds N * 31 nodes plus the root.
int n, h[N], cnt, t[N * 31 + 1][2], tot, Xor;
bitset<N> vis;
// dis[v] = XOR of the edge weights on the path from vertex 1 to v.
bitset<31> dis[N], ans;
// Forward-star adjacency list. Every input edge is stored in both directions,
// hence twice N entries.
struct edge{ int v, nt; bitset<31> w; }e[N<<1];
// Appends the directed edge u -> v with weight w to u's adjacency list.
inline void add(int u, int v, int w){
	++cnt;
	e[cnt].v = v;
	e[cnt].w = w;
	e[cnt].nt = h[u];
	h[u] = cnt;
}
// Inserts dis[k] into the trie, walking bits 30 down to 0 from the root.
inline void insert(int k){
	for(int i=30, now=0; i>=0; --i){
		const int b = dis[k][i];
		if(!t[now][b]) t[now][b] = ++tot;
		now = t[now][b];
	}
}
// Greedy query for dis[k]: at every bit prefer the child with the opposite
// bit, building the XOR value in ans, then folds the result into Xor.
inline void search(int k){
	ans = dis[k];
	for(int i=30, now=0; i>=0; --i){
		const int b = dis[k][i];
		if(t[now][!b]) ans[i] = 1, now = t[now][!b];
		else if(t[now][b]) ans[i] = 0, now = t[now][b];
		else break;
	}
	Xor = max(Xor, (int)ans.to_ulong());
}
// Depth-first traversal from k: registers dis[k] in the trie, queries it
// against everything inserted so far, then propagates dis to unvisited neighbours.
inline void dfs(int k){
	vis[k] = 1;
	insert(k);
	search(k);
	for(int i=h[k]; i; i=e[i].nt){
		const int v = e[i].v;
		if(vis[v]) continue;
		dis[v] = dis[k] ^ e[i].w;
		dfs(v);
	}
}
// Reads the n-1 weighted edges of the tree and prints the largest XOR of
// dis[x] and dis[y] found during the traversal from vertex 1.
int main(){
	ios::sync_with_stdio(0), cin.tie(0), cout.tie(0);
	cin>>n;
	// Store both directions so the traversal reaches every vertex no matter in
	// which order the endpoints of an edge are listed; vis prevents walking back.
	for(int i=1, a, b, v; i<n; ++i) cin>>a>>b>>v, add(a, b, v), add(b, a, v);
	dfs(1);
	cout<<Xor;
	return 0;
}

posted @ 2024-05-10 22:12 XiaoLe_MC 阅读(11) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

xiaolemc