HASH算法小结
一、简述
HASH算法的本质是特征提取——将某种不太好表示的特征,通过某种压缩的方式映射成一个值。这样,就可以优雅解决一部分难以解决的特征统计问题。
同时考虑到hash算法的本质是个概率算法,因此并不能保证所有的数据都不发生冲突<冲突是指两个不同的特征计算出了同一个HASH值>,因此可以考虑使用双hash的形式,使用两个不同的HASH算法,算出来的HASH值来表示一个特征量——pair<ull,ull>就是一种实现方式。
一种常用的hash算法来自一个递推式:hash[i] = ( hash[i-1] * HASH_P + val[i] ) % HASH_MOD;
这种方式实际上可以比喻成为一个在%HASH_MOD意义下P进制的大数,且每次都添加一个新的个位数进入hash值中。
因此,实际使用中可以支持%HASH_MOD意义下的加法、减法。
另外hash算法好想好写,可以分外暴力的解决相当部分的问题。<甚至可以直接使用优雅的#define来完成模板的编写>
二、提取树的特征
Similarity of Subtrees
https://vjudge.net/problem/Aizu-2784
题意:给出一棵树,询问以1作为树根的树中,结构相同的子树有多少对。结构相同定义为:以某点为根节点的子树,其以下每层的节点个数都与以另一节点为根的子树相应层的节点个数相同。
Define the depth of a node in a rooted tree by applying the following rules recursively:
- The depth of a root node is 0.
- The depths of child nodes whose parents are with depth $d$ are $d+1$.
Let $S(T,d)$ be the number of nodes of $T$ with depth $d$. Two rooted trees $T$ and $T'$ are similar if and only if $S(T,d)$ equals $S(T',d)$ for all non-negative integer $d$.
You are given a rooted tree $T$ with $N$ nodes. The nodes of $T$ are numbered from 1 to $N$. Node 1 is the root node of $T$. Let $T_i$ be the rooted subtree of $T$ whose root is node $i$. Your task is to write a program which calculates the number of pairs $(i,j)$ such that $T_i$ and $T_j$ are similar and $i<j$.
https://cn.vjudge.net/problem/Aizu-2784
题解:可以发现,子树的结构实际上是可以通过HASH算法形式的递推得到——hash[now] = (∑(hash[child]) * HASH_P + num[now])%HASH_MOD
该递推式实际上表现了hash值的加法过程。
则,如果支持dfs且不爆栈的话,可以使用dfs一发搞定,相当的优雅。
但是反过来,如果不支持dfs,则必须用bfs的方式来搞定树的遍历和递推,实际上也很好想,因为记录了每个节点的父节点,也记录了每个节点的子节点数量,就可以很容易的计算出来某个节点的所有子节点是否已经完成了递推计算。提供两个版本的代码:dfs实现和bfs实现。
dfs:
/*
 * Aizu-2784 "Similarity of Subtrees" -- recursive (DFS) solution.
 *
 * Every node gets a double hash computed bottom-up as
 *     hash[now] = (sum(hash[child]) * HASH_P + 1) % HASH_MOD
 * under two independent (P, MOD) pairs. Nodes whose hash pairs are equal are
 * counted as roots of similar subtrees.
 *
 * NOTE: dfs_count() recurses once per tree level, so a very deep (path-like)
 * tree needs a correspondingly deep call stack.
 */
#include<math.h>
#include<algorithm>
#include<vector>
#include<stdlib.h>
#include<string.h>
#include<string>
#include<set>
#include<map>
#include<queue>
#include<stack>
#include <iostream>
using namespace std;

#define ull unsigned long long
// One polynomial-hash step: append "digit" b to hash value x (arguments parenthesised).
#define hash1(x,b) (((ull)(x) * HASH_P1 + (b))%HASH_MOD1)
#define hash2(x,b) (((ull)(x) * HASH_P2 + (b))%HASH_MOD2)
#define ll long long

const int MAXN = 200233;
const ull HASH_MOD1 = 1000000007;
const ull HASH_MOD2 = 1000000009;
const ull HASH_P1 = 100003;
const ull HASH_P2 = 100019;

#define veci vector<int>
#define pp pair<ull,ull>

veci G[MAXN];        // adjacency lists of the (undirected) input tree
int n;               // number of nodes in the current test case
map<ull,int> mapp;   // packed double hash -> number of subtrees seen with it
ll ans;              // number of similar pairs found so far

/*
 * Computes the double hash of the subtree rooted at `now` (entered from
 * `father`), adds to `ans` the number of previously finished subtrees that
 * have the same hash, and records this subtree in `mapp`.
 * Returns the (hash1, hash2) pair of the subtree.
 */
pp dfs_count(int now,int father){
    pp ret;
    ret.first = ret.second = 0;

    const int len = G[now].size();
    for(int i=0;i<len;++i){
        const int tar = G[now][i];
        if(tar == father)continue;
        const pp child = dfs_count(tar,now);
        ret.first += child.first;
        ret.second += child.second;
    }
    ret.first %= HASH_MOD1;
    ret.second %= HASH_MOD2;
    ret.first = hash1(ret.first,1);
    ret.second = hash2(ret.second,1);

    // Pack both hashes into one 64-bit key. ret.second lies in [0, HASH_MOD2),
    // so the multiplier must be HASH_MOD2 for the packing to be injective
    // (multiplying by the smaller HASH_MOD1 lets distinct pairs share a key).
    const ull key = ret.first * HASH_MOD2 + ret.second;

    // operator[] value-initialises a missing entry to 0.
    int &seen = mapp[key];
    ans += seen;
    ++seen;
    return ret;
}

/*
 * Handles one test case: resets state, reads the n-1 edges from stdin,
 * runs the DFS from node 1 and prints the number of similar pairs.
 */
void init(){
    ans = 0;
    for(int i=0;i<n+23;++i)G[i].clear();
    mapp.clear();
    for(int i=1;i<n;++i){
        int a,b;
        cin>>a>>b;
        G[a].push_back(b);
        G[b].push_back(a);
    }
    dfs_count(1,0);
    cout<<ans<<"\n";
}

int main(){
    cin.sync_with_stdio(false);
    while(cin>>n)init();
    return 0;
}
bfs:
/*
 * Aizu-2784 "Similarity of Subtrees" -- non-recursive (BFS) solution.
 *
 * bfs() roots the tree at node 1, recording each node's parent and its number
 * of children. deal() then processes nodes leaves-first: a node is handled
 * once all of its children are done, using
 *     hash[now] = (sum(hash[child]) * HASH_P + 1) % HASH_MOD
 * under two independent (P, MOD) pairs.
 */
#include<math.h>
#include<algorithm>
#include<vector>
#include<stdlib.h>
#include<string.h>
#include<string>
#include<set>
#include<map>
#include<queue>
#include<stack>
#include <iostream>
using namespace std;

#define ull unsigned long long
// One polynomial-hash step: append "digit" b to hash value x (arguments parenthesised).
#define hash1(x,b) (((ull)(x) * HASH_P1 + (b))%HASH_MOD1)
#define hash2(x,b) (((ull)(x) * HASH_P2 + (b))%HASH_MOD2)
#define ll long long

const int MAXN = 200233;
const ull HASH_MOD1 = 1000000007;
const ull HASH_MOD2 = 1000000009;
const ull HASH_P1 = 100003;
const ull HASH_P2 = 100019;

#define veci vector<int>
#define pp pair<ull,ull>

veci G[MAXN];        // adjacency lists of the (undirected) input tree
int n;               // number of nodes in the current test case
map<ull,int> mapp;   // packed double hash -> number of subtrees seen with it
ll ans;              // number of similar pairs found so far
int fa[MAXN];        // parent of each node once rooted at node 1 (0 for the root)
pp anss[MAXN];       // double hash of the subtree rooted at each node
int times[MAXN];     // number of children of each node that are still unprocessed

/*
 * Roots the tree at node 1 with a breadth-first traversal, filling fa[] and
 * setting times[v] to the number of children of v.
 */
void bfs(){
    queue<int> que;
    fa[1] = 0;
    que.push(1);
    while(!que.empty()){
        const int now = que.front();
        que.pop();
        times[now] = 0;
        for(size_t i=0;i<G[now].size();++i){
            const int tar = G[now][i];
            if(tar==fa[now])continue;
            ++times[now];
            fa[tar] = now;
            que.push(tar);
        }
    }
}

/*
 * Computes all subtree hashes bottom-up and accumulates into `ans` the number
 * of pairs of nodes whose subtrees share the same double hash.
 * Requires bfs() to have been run for the current tree.
 */
void deal(){
    queue<int> que;
    // Start from the leaves (nodes other than the root that have no children).
    for(int i=2;i<=n;++i){
        if(times[i] == 0) que.push(i);
    }
    while(!que.empty()){
        const int now = que.front();
        que.pop();

        // Tell the parent one more child is finished; the root has no parent.
        if(now != 1 && --times[fa[now]] == 0)que.push(fa[now]);

        // Sum the children's hashes locally, so anss[] needs no clearing
        // between test cases.
        ull sum1 = 0, sum2 = 0;
        const int len = G[now].size();
        for(int i=0;i<len;++i){
            const int tar = G[now][i];
            if(tar == fa[now])continue;
            sum1 += anss[tar].first;
            sum2 += anss[tar].second;
        }
        sum1 %= HASH_MOD1;
        sum2 %= HASH_MOD2;
        anss[now].first = hash1(sum1,1);
        anss[now].second = hash2(sum2,1);

        // Pack both hashes into one 64-bit key. The second component lies in
        // [0, HASH_MOD2), so the multiplier must be HASH_MOD2 for the packing
        // to be injective.
        const ull key = anss[now].first * HASH_MOD2 + anss[now].second;

        // operator[] value-initialises a missing entry to 0.
        int &seen = mapp[key];
        ans += seen;
        ++seen;
    }
}

/*
 * Handles one test case: resets state, reads the n-1 edges from stdin,
 * runs bfs() and deal(), and prints the number of similar pairs.
 */
void init(){
    memset(times,0,sizeof(times));
    ans = 0;
    for(int i=0;i<n+23;++i)G[i].clear();
    mapp.clear();
    for(int i=1;i<n;++i){
        int a,b;
        cin>>a>>b;
        G[a].push_back(b);
        G[b].push_back(a);
    }
    bfs();
    deal();
    cout<<ans<<"\n";
}

int main(){
    cin.sync_with_stdio(false);
    while(cin>>n)init();
    return 0;
}
三、提取连续子串的特征
Stammering Aliens
https://cn.vjudge.net/problem/UVALive-4513
题意:给一个长串,问至少出现m次的最长连续子串的长度,以及该子串最右一次出现的起始位置是多少。
题解:
这道题实际上是刘汝佳蓝书上的一道例题,在做的过程中用到了hash串做减法的思路。
考虑答案中的两个量:最长长度和最右起始位置。最长长度具有某种意义上的单调性:如果长度为n的子串可以符合题目条件,则n-1的也可以(n>1);因此考虑使用二分的形式来枚举子串的长度。最右起始位置可以直观的求解。
考虑递推式:hash[i] = (hash[i-1] * HASH_P + str[i]) % HASH_MOD
若简化为十进制数字则可以有如下样例:
3129741938274 求子串由2到7(下标从0开始,不含下标2,即下标3~7这一段)的hash值
hash[7] = 31297419
hash[2] = 312
hash[2-7] = 97419
观察可得:hash[2-7] = hash[7] - hash[2]*10^(7-2);
则实际上只要保证上式在%HASH_MOD意义上成立即可。
/*
 * UVALive-4513 "Stammering Aliens".
 *
 * Binary-searches the substring length; for a candidate length, check()
 * hashes every window with a double polynomial hash (prefix hashes plus
 * "subtraction"), sorts the window keys and looks for a key that occurs at
 * least m times, remembering the right-most start position.
 */
#include<bits/stdc++.h>
using namespace std;

#define ll long long
#define ull unsigned long long
#define pp pair<ull,ull>

const int MAXN = 1000233;
const ull HASH_P1 = 233;
const ull HASH_P2 = 241;
const ull HASH_MOD1 = 1000000037;
const ull HASH_MOD2 = 1000000049;

// One polynomial-hash step: append "digit" b to hash value x (arguments parenthesised).
#define hash1(x,b) (((ull)(x) * HASH_P1 + (b)) % HASH_MOD1)
#define hash2(x,b) (((ull)(x) * HASH_P2 + (b)) % HASH_MOD2)
#define get_next_hash(tmp,b) (make_pair(hash1((tmp).first,b),hash2((tmp).second,b)))

pp hashs[MAXN];      // hashs[i]: double hash of the prefix str[0..i]
pp hash_hex[MAXN];   // hash_hex[k]: (HASH_P1^k % HASH_MOD1, HASH_P2^k % HASH_MOD2)
int m;               // required number of occurrences
char str[MAXN];      // current input string
int str_len,pos;     // length of str; right-most start position found by check()
ull mapp[MAXN];      // mapp[k]: packed double hash of the window starting at k
int anss[MAXN];      // window start indices, sorted by (key, index) in check()
int mapp_num;        // number of windows for the current length

// Packs a double hash into one 64-bit key. b lies in [0, HASH_MOD2), so the
// packing is injective, and a * HASH_MOD2 stays below 2^64 for a < HASH_MOD1.
static inline ull combine_key(ull a, ull b){
    return a * HASH_MOD2 + b;
}

// Orders window indices by key, breaking ties by start position.
bool cmp(int a,int b){
    if(mapp[a] == mapp[b])return a<b;
    return mapp[a]<mapp[b];
}

/*
 * Returns true when some substring of the given length occurs at least m
 * times in str. On success `pos` holds the largest start index i such that
 * the window starting at i is (in sorted order) at least the m-th occurrence
 * of its key; otherwise `pos` is -1.
 * Lengths outside [1, str_len] yield false without touching the hash tables.
 */
bool check(int length){
    pos = -1;
    mapp_num = 0;
    if(length <= 0 || length > str_len)return false;

    // Window starting at 0 is simply the prefix hash.
    anss[mapp_num] = mapp_num;
    mapp[mapp_num++] = combine_key(hashs[length-1].first,hashs[length-1].second);

    const ull pow1 = hash_hex[length].first;
    const ull pow2 = hash_hex[length].second;
    for(int i=length;i<str_len;++i){
        // hash(i-length+1 .. i) = hash[i] - hash[i-length] * P^length  (mod MOD)
        const ull a = (hashs[i].first + HASH_MOD1
                       - hashs[i-length].first * pow1 % HASH_MOD1) % HASH_MOD1;
        const ull b = (hashs[i].second + HASH_MOD2
                       - hashs[i-length].second * pow2 % HASH_MOD2) % HASH_MOD2;
        anss[mapp_num] = mapp_num;
        mapp[mapp_num++] = combine_key(a,b);
    }

    sort(anss,anss+mapp_num,cmp);

    int cntt = 1;   // length of the current run of equal keys
    if(m == 1)pos = anss[0];
    for(int i=1;i<mapp_num;++i){
        if(mapp[anss[i]] == mapp[anss[i-1]])cntt++;
        else cntt = 1;
        if(cntt >= m)pos = max(pos,anss[i]);
    }
    return pos != -1;
}

// Binary search on (a, b): check(a) is assumed to hold and check(b) to fail;
// returns the largest length for which check() holds.
int bin_search(int a,int b){
    if(a == b-1)return a;
    const int mid = (a+b)/2;
    if(check(mid))return bin_search(mid,b);
    else return bin_search(a,mid);
}

/*
 * Handles one test case: reads the string line, builds the prefix hashes and
 * prints "<length> <position>" or "none".
 */
void init(){
    // gets() was removed from the language in C++14; fgets is bounded.
    if(fgets(str,MAXN,stdin) == NULL)str[0] = '\0';
    str_len = strlen(str);
    while(str_len > 0 && (str[str_len-1] == '\n' || str[str_len-1] == '\r')){
        str[--str_len] = '\0';
    }

    pp tmp = make_pair(0,0);
    for(int i=0;i<str_len;++i){
        tmp = get_next_hash(tmp,(unsigned char)str[i]);
        hashs[i] = tmp;
    }

    const int ans = bin_search(0,str_len+1);
    if(ans > 0){
        check(ans);   // recompute pos for the final length
        printf("%d %d\n",ans,pos);
    }else{
        puts("none");
    }
}

int main(){
    hash_hex[0] = make_pair(1,1);
    for(int i=1;i<MAXN;++i){
        hash_hex[i] = get_next_hash(hash_hex[i-1],0);
    }
    // The trailing "\n" in the format consumes the rest of the line so that
    // init() starts reading at the string. Stop on EOF, parse failure or m == 0.
    while(scanf("%d\n",&m) == 1 && m)init();
    return 0;
}
四、统计字母出现个数
Hidden Anagrams
AIZU:https://cn.vjudge.net/problem/Aizu-1370
Gym:https://cn.vjudge.net/problem/Gym-101158D
UVALive:https://cn.vjudge.net/problem/UVALive-7592
题意:给出两个字符串,求出最大的长度,使得两个字符串各包含一个该长度的子串,且这两个子串包含的字母的种类和个数完全相同。
题解:思路很简单,就是枚举长度,并检查上面的字符串中是否存在能和下面的串长度相同的,HASH值一致的串。如果有,则检查通过,没有则不通过,从高往低枚举,找到第一个通过的跳出循环<也许会有个常数优化>。
此时HASH算法应当做一个简单的变化:统计某个字母出现的个数。HASH[I] = HASH[I-1] + HASH_HEX[STR[I]-'a']
此时HASH_HEX代表了HASH_P在对HASH_MOD做膜法操作的前提下的若干次方。
这道题有4个来源可以提交,Gym和AIZU可以让N2LOGN甚至更慢的代码通过,UVALIVE允许N2的代码加入邻接链表优化通过<此时我已经开了IO挂>,HOJ。。。。。需要在进一步的取消HASH2的膜法操作。
/*
 * "Hidden Anagrams" (Aizu-1370 / Gym-101158D / UVALive-7592).
 *
 * Each letter c gets a weight hash_hex[c-'a'] (a power of HASH_P); the hash of
 * a window is the sum of the weights of its letters, so windows with the same
 * letter multiset have equal hashes. For every length, from long to short,
 * the window hashes of str1 are stored in a chained hash table and the
 * windows of str2 are looked up in it.
 */
#include<math.h>
#include<algorithm>
#include<vector>
#include<stdlib.h>
#include<string.h>
#include<string>
#include<stdio.h>
#include<set>
#include<map>
#include<queue>
#include<stack>
#include <iostream>
#include <limits.h>
using namespace std;

#define ull unsigned long long
#define pp pair<ull,ull>
const ull MAXN = 1000249;
#define vecu vector<ull>
#define vevi vector<int>
#define vecp vector<pp >
const ull HASH_P1 = 109;
const ull HASH_P2 = 4007;
const ull HASH_MOD1 = 1000249;
const ull HASH_MOD2 = 1000000037;
// First component is reduced mod HASH_MOD1 (it doubles as the bucket index);
// second component is left to wrap around in unsigned 64-bit arithmetic.
#define hash1(x,b) (((ull)x * HASH_P1 + b)%HASH_MOD1)
#define hash2(x,b) (((ull)x * HASH_P2 + b))
#define next_hash(tmp,b) (make_pair(hash1(tmp.first,b),hash2(tmp.second,b)))
#define add_hash(tmp,b) (make_pair((tmp.first + hash_hex[idx(b)].first) % HASH_MOD1,(tmp.second + hash_hex[idx(b)].second) ))
#define sub_hash(tmpa,tmpb) (make_pair((tmpa.first + HASH_MOD1 - tmpb.first) % HASH_MOD1 , (tmpa.second - tmpb.second) ) )
#define idx(x) (x-'a')

namespace fastIO{
    #define BUF_SIZE 100000
    #define OUT_SIZE 100000
    #define ll long long

    // ---- fread-based input ----
    bool IOerror=0;   // set once stdin is exhausted

    // Returns the next input byte from an internal buffer, or -1 (and sets IOerror) at end of input.
    inline char nc(){
        static char buf[BUF_SIZE],*p1=buf+BUF_SIZE,*pend=buf+BUF_SIZE;
        if (p1==pend){
            p1=buf; pend=buf+fread(buf,1,BUF_SIZE,stdin);
            if (pend==p1){IOerror=1;return -1;}
        }
        return *p1++;
    }
    inline bool blank(char ch){return ch==' '||ch=='\n'||ch=='\r'||ch=='\t';}
    inline void read(int &x){
        bool sign=0; char ch=nc(); x=0;
        for (;blank(ch);ch=nc());
        if (IOerror)return;
        if (ch=='-')sign=1,ch=nc();
        for (;ch>='0'&&ch<='9';ch=nc())x=x*10+ch-'0';
        if (sign)x=-x;
    }
    inline void read(ll &x){
        bool sign=0; char ch=nc(); x=0;
        for (;blank(ch);ch=nc());
        if (IOerror)return;
        if (ch=='-')sign=1,ch=nc();
        for (;ch>='0'&&ch<='9';ch=nc())x=x*10+ch-'0';
        if (sign)x=-x;
    }
    inline void read(double &x){
        bool sign=0; char ch=nc(); x=0;
        for (;blank(ch);ch=nc());
        if (IOerror)return;
        if (ch=='-')sign=1,ch=nc();
        for (;ch>='0'&&ch<='9';ch=nc())x=x*10+ch-'0';
        if (ch=='.'){
            double tmp=1; ch=nc();
            for (;ch>='0'&&ch<='9';ch=nc())tmp/=10.0,x+=tmp*(ch-'0');
        }
        if (sign)x=-x;
    }
    // Reads one blank-delimited token into s. Returns 0 on success, -1 at end of input.
    inline int read(char *s){
        char ch=nc();
        // Test the end-of-input flag rather than comparing a char with EOF,
        // which depends on the signedness of char.
        if (IOerror)return -1;
        for (;blank(ch);ch=nc());
        if (IOerror)return -1;
        for (;!blank(ch)&&!IOerror;ch=nc())*s++=ch;
        *s=0;
        return 0;
    }
    inline void read(char &c){
        for (c=nc();blank(c);c=nc());
        if (IOerror){c=-1;return;}
    }

    // ---- getchar-based input ----
    inline void read1(int &x){
        char ch;int bo=0;x=0;
        for (ch=getchar();ch<'0'||ch>'9';ch=getchar())if (ch=='-')bo=1;
        for (;ch>='0'&&ch<='9';x=x*10+ch-'0',ch=getchar());
        if (bo)x=-x;
    }
    inline void read1(ll &x){
        char ch;int bo=0;x=0;
        for (ch=getchar();ch<'0'||ch>'9';ch=getchar())if (ch=='-')bo=1;
        for (;ch>='0'&&ch<='9';x=x*10+ch-'0',ch=getchar());
        if (bo)x=-x;
    }
    inline void read1(double &x){
        char ch;int bo=0;x=0;
        for (ch=getchar();ch<'0'||ch>'9';ch=getchar())if (ch=='-')bo=1;
        for (;ch>='0'&&ch<='9';x=x*10+ch-'0',ch=getchar());
        if (ch=='.'){
            double tmp=1;
            for (ch=getchar();ch>='0'&&ch<='9';tmp/=10.0,x+=tmp*(ch-'0'),ch=getchar());
        }
        if (bo)x=-x;
    }
    // Reads one blank-delimited token into s; always returns 0.
    // (Flowing off the end of a non-void function is undefined behaviour,
    // so the return value is now explicit.)
    inline int read1(char *s){
        char ch=getchar();
        for (;blank(ch);ch=getchar());
        for (;!blank(ch);ch=getchar())*s++=ch;
        *s=0;
        return 0;
    }
    inline void read1(char &c){for (c=getchar();blank(c);c=getchar());}

    // ---- scanf-based input ----
    inline void read2(int &x){scanf("%d",&x);}
    inline void read2(ll &x){
        #ifdef _WIN32
            scanf("%I64d",&x);
        #else
        #ifdef __linux
            scanf("%lld",&x);
        #else
            puts("error:can't recognize the system!");
        #endif
        #endif
    }
    inline void read2(double &x){scanf("%lf",&x);}
    inline void read2(char *s){scanf("%s",s);}
    inline void read2(char &c){scanf(" %c",&c);}
    // Reads one line (without the newline) into s. gets() was removed in
    // C++14, so the line is read with getchar(). As before, no length limit
    // is applied: the caller must supply a large enough buffer.
    inline void readln2(char *s){
        int ch;
        while ((ch=getchar())!=EOF&&ch!='\n')*s++=(char)ch;
        *s=0;
    }

    // ---- fwrite-based output ----
    struct Ostream_fwrite{
        char *buf,*p1,*pend;
        Ostream_fwrite(){buf=new char[BUF_SIZE];p1=buf;pend=buf+BUF_SIZE;}
        void out(char ch){
            if (p1==pend){
                fwrite(buf,1,BUF_SIZE,stdout);p1=buf;
            }
            *p1++=ch;
        }
        void print(int x){
            static char s[15],*s1;s1=s;
            if (!x)*s1++='0';if (x<0)out('-'),x=-x;
            while(x)*s1++=x%10+'0',x/=10;
            while(s1--!=s)out(*s1);
        }
        void println(int x){
            static char s[15],*s1;s1=s;
            if (!x)*s1++='0';if (x<0)out('-'),x=-x;
            while(x)*s1++=x%10+'0',x/=10;
            while(s1--!=s)out(*s1); out('\n');
        }
        void print(ll x){
            static char s[25],*s1;s1=s;
            if (!x)*s1++='0';if (x<0)out('-'),x=-x;
            while(x)*s1++=x%10+'0',x/=10;
            while(s1--!=s)out(*s1);
        }
        void println(ll x){
            static char s[25],*s1;s1=s;
            if (!x)*s1++='0';if (x<0)out('-'),x=-x;
            while(x)*s1++=x%10+'0',x/=10;
            while(s1--!=s)out(*s1); out('\n');
        }
        void print(double x,int y){
            static ll mul[]={1,10,100,1000,10000,100000,1000000,10000000,100000000,
                1000000000,10000000000LL,100000000000LL,1000000000000LL,10000000000000LL,
                100000000000000LL,1000000000000000LL,10000000000000000LL,100000000000000000LL};
            if (x<-1e-12)out('-'),x=-x;x*=mul[y];
            ll x1=(ll)floor(x); if (x-floor(x)>=0.5)++x1;
            ll x2=x1/mul[y],x3=x1-x2*mul[y]; print(x2);
            // int index avoids the signed/unsigned comparison against y.
            if (y>0){out('.'); for (int i=1;i<y&&x3*mul[i]<mul[y];out('0'),++i); print(x3);}
        }
        void println(double x,int y){print(x,y);out('\n');}
        void print(char *s){while (*s)out(*s++);}
        void println(char *s){while (*s)out(*s++);out('\n');}
        void flush(){if (p1!=buf){fwrite(buf,1,p1-buf,stdout);p1=buf;}}
        ~Ostream_fwrite(){flush();}
    }Ostream;
    inline void print(int x){Ostream.print(x);}
    inline void println(int x){Ostream.println(x);}
    inline void print(char x){Ostream.out(x);}
    inline void println(char x){Ostream.out(x);Ostream.out('\n');}
    inline void print(ll x){Ostream.print(x);}
    inline void println(ll x){Ostream.println(x);}
    inline void print(double x,int y){Ostream.print(x,y);}
    inline void println(double x,int y){Ostream.println(x,y);}
    inline void print(char *s){Ostream.print(s);}
    inline void println(char *s){Ostream.println(s);}
    inline void println(){Ostream.out('\n');}
    inline void flush(){Ostream.flush();}

    // ---- puts-based output ----
    char Out[OUT_SIZE],*o=Out;
    inline void print1(int x){
        static char buf[15];
        char *p1=buf;if (!x)*p1++='0';if (x<0)*o++='-',x=-x;
        while(x)*p1++=x%10+'0',x/=10;
        while(p1--!=buf)*o++=*p1;
    }
    inline void println1(int x){print1(x);*o++='\n';}
    inline void print1(ll x){
        static char buf[25];
        char *p1=buf;if (!x)*p1++='0';if (x<0)*o++='-',x=-x;
        while(x)*p1++=x%10+'0',x/=10;
        while(p1--!=buf)*o++=*p1;
    }
    inline void println1(ll x){print1(x);*o++='\n';}
    inline void print1(char c){*o++=c;}
    inline void println1(char c){*o++=c;*o++='\n';}
    inline void print1(char *s){while (*s)*o++=*s++;}
    inline void println1(char *s){print1(s);*o++='\n';}
    inline void println1(){*o++='\n';}
    inline void flush1(){if (o!=Out){if (*(o-1)=='\n')*--o=0;puts(Out);}}
    struct puts_write{
        ~puts_write(){flush1();}
    }_puts;

    // ---- printf-based output ----
    inline void print2(int x){printf("%d",x);}
    inline void println2(int x){printf("%d\n",x);}
    inline void print2(char x){printf("%c",x);}
    inline void println2(char x){printf("%c\n",x);}
    inline void print2(ll x){
        #ifdef _WIN32
            printf("%I64d",x);
        #else
        #ifdef __linux
            printf("%lld",x);
        #else
            puts("error:can't recognize the system!");
        #endif
        #endif
    }
    inline void println2(ll x){print2(x);printf("\n");}
    inline void println2(){printf("\n");}
    #undef ll
    #undef OUT_SIZE
    #undef BUF_SIZE
}

char str1[MAXN];
char str2[MAXN];
int str1_len,str2_len;
pp hash_hex[MAXN];    // hash_hex[k]: (HASH_P1^k % HASH_MOD1, HASH_P2^k mod 2^64), the weight of letter 'a'+k
pp str1_hash[MAXN];   // prefix sums of letter weights of str1
pp str2_hash[MAXN];   // prefix sums of letter weights of str2

// Node of a singly linked bucket chain; `next` is an index into hash_nodes or -1.
class hash_node{
public:
    ull val;int next;
};
hash_node hash_nodes[MAXN];   // node pool, reset by check()
int hash_nodes_num;           // number of pool nodes in use
int hash_table[MAXN];         // head node index of each bucket, -1 when empty

// Takes a node from the pool, stores `key` in it and links it in front of the
// current chain of `bucket`. Returns the index of the new node.
inline int new_hash_nodes(int bucket,ull key){
    hash_nodes[hash_nodes_num].next = hash_table[bucket];
    hash_nodes[hash_nodes_num].val = key;
    return hash_nodes_num++;
}

// Returns true when `key` is stored in the chain of `bucket`.
inline bool hash_find_key(int bucket,ull key){
    for(int now = hash_table[bucket];now != -1;now = hash_nodes[now].next){
        if(hash_nodes[now].val == key)return true;
    }
    return false;
}

// Pushes `key` onto the front of the chain of `bucket`.
inline void hash_insert(int bucket,ull key){
    hash_table[bucket] = new_hash_nodes(bucket,key);
}

// Empties `bucket` (the pool nodes are reclaimed by the next check()).
inline void hash_clear(int bucket){
    hash_table[bucket] = -1;
}

/*
 * Returns true when some window of `length` letters in str1 has the same
 * double hash as some window of `length` letters in str2.
 * Expects 1 <= length <= min(str1_len, str2_len). All buckets touched are
 * emptied again before returning, on both the success and the failure path.
 */
inline bool check(int length){
    hash_nodes_num = 0;

    // Store every window hash of str1: first component selects the bucket,
    // second component is the stored key.
    hash_insert(str1_hash[length-1].first,str1_hash[length-1].second);
    for(int i=length;i<str1_len;++i){
        pp tmp = sub_hash(str1_hash[i],str1_hash[i-length]);
        hash_insert(tmp.first,tmp.second);
    }

    // Probe with every window hash of str2.
    bool found = hash_find_key(str2_hash[length-1].first,str2_hash[length-1].second);
    for(int i=length;!found && i<str2_len;++i){
        pp tmp = sub_hash(str2_hash[i],str2_hash[i-length]);
        found = hash_find_key(tmp.first,tmp.second);
    }

    // Leave the table empty for the next call.
    hash_clear(str1_hash[length-1].first);
    for(int i=length;i<str1_len;++i){
        pp tmp = sub_hash(str1_hash[i],str1_hash[i-length]);
        hash_clear(tmp.first);
    }
    return found;
}

/*
 * Handles one test case (str1 and str2 already read): builds the prefix
 * hashes, tries every length from long to short and prints the first length
 * for which check() succeeds (0 when none does).
 */
void init(){
    memset(hash_table,-1,sizeof(hash_table));
    str1_len = strlen(str1);
    str2_len = strlen(str2);
    str1_hash[0] = hash_hex[idx(str1[0])];
    str2_hash[0] = hash_hex[idx(str2[0])];
    for(int i=1;i<str1_len;++i)str1_hash[i] = add_hash(str1_hash[i-1],str1[i]);
    for(int i=1;i<str2_len;++i)str2_hash[i] = add_hash(str2_hash[i-1],str2[i]);

    const int limit = min(str1_len,str2_len);
    int ans = 0;
    for(int i=limit;i;i--){
        if(check(i)){
            ans = i;
            break;
        }
    }
    fastIO::println(ans);
}

int main(){
    hash_hex[0] = make_pair(1,1);
    for(int i=1;i<233;++i) hash_hex[i] = next_hash(hash_hex[i-1],0);
    // fastIO::read(char*) returns 0 on success and -1 at end of input.
    while(fastIO::read(str1) == 0 && fastIO::read(str2) == 0)init();
    return 0;
}