POJ3294 Life Forms —— 后缀数组 最长公共子串
题目链接:https://vjudge.net/problem/POJ-3294
Time Limit: 5000MS | Memory Limit: 65536K | |
Total Submissions: 16905 | Accepted: 4970 |
Description
You may have wondered why most extraterrestrial life forms resemble humans, differing by superficial traits such as height, colour, wrinkles, ears, eyebrows and the like. A few bear no human resemblance; these typically have geometric or amorphous shapes like cubes, oil slicks or clouds of dust.
The answer is given in the 146th episode of Star Trek - The Next Generation, titled The Chase. It turns out that the vast majority of the quadrant's life forms ended up with a large fragment of common DNA.
Given the DNA sequences of several life forms represented as strings of letters, you are to find the longest substring that is shared by more than half of them.
Input
Standard input contains several test cases. Each test case begins with 1 ≤ n ≤ 100, the number of life forms. n lines follow; each contains a string of lower case letters representing the DNA sequence of a life form. Each DNA sequence contains at least one and not more than 1000 letters. A line containing 0 follows the last test case.
Output
For each test case, output the longest string or strings shared by more than half of the life forms. If there are many, output all of them in alphabetical order. If there is no solution with at least one letter, output "?". Leave an empty line between test cases.
Sample Input
3
abcdefg
bcdefgh
cdefghi
3
xxx
yyy
zzz
0
Sample Output
bcdefg
cdefgh

?
Source
题意:
给出n个字符串,求在超过一半(即至少n/2+1个)的字符串中都出现的最长公共子串。如果存在,输出长度最大的;如果有多个答案,按字典序输出所有;如果不存在,输出"?"。
题解:
1.将n个字符串拼接在一起,并且相邻两个之间用分隔符隔开,并且分隔符应各异。因此得到新串。
2.求出新串的后缀数组,然后二分公共子串的长度mid:对于当前的mid,可以按排名顺序把新串的后缀分成若干组,使得每一组内相邻后缀的最长公共前缀(height)都大于等于mid;接着在每一组内统计这些后缀来自多少个不同的原字符串,如果个数>n/2,即表明当前mid合法,否则不合法。根据此规则二分,最终即可求得最大长度。
3.由于题目还要求按字典序输出所有答案。所以,在求得长度之后,再遍历一遍sa[]数组,并且判断每个分组是否满足要求,若满足,则输出答案。
注意点:
1.每个分隔符应该不一样。如果一样,在求后缀数组和height的时候,公共前缀就可能从当前字符串越过分隔符匹配到下一个字符串,而这是不允许的:公共子串不能跨越两个原字符串,每个后缀最多只能匹配到它所在字符串的串尾。
2.输出答案时,为了避免同一组内多次输出(每一组对应着一个子串),应该加个标记。
代码如下:
// POJ 3294 "Life Forms": print every longest substring that occurs in more
// than half of the given strings, in alphabetical order.
//
// Approach: concatenate all strings with a distinct separator after each one,
// build the suffix array and height (LCP) array of the concatenation, binary
// search the answer length, then scan the suffix array once more to print one
// representative per qualifying group.
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <vector>
#include <cmath>
#include <queue>
#include <stack>
#include <map>
#include <string>
#include <set>
using namespace std;
typedef long long LL;
const int INF = 2e9;
const LL LNF = 9e18;
const int MOD = 1e9+7;
const int MAXN = 2e5+100;

int id[MAXN];   // id[p] = index of the input string that position p belongs to
int r[MAXN], sa[MAXN], Rank[MAXN], height[MAXN];
int t1[MAXN], t2[MAXN], c[MAXN];

// True when positions a and b have equal (first key, second key) pairs, where
// the second key lives l positions further on.
bool cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// Builds the suffix array by prefix doubling with counting sort, then fills
// Rank and height.
//   str    - input symbols str[0..n-1], all in [1, m); str[n] must be 0
//   sa     - out: sa[0..n], suffix start positions in sorted order
//   Rank   - out: Rank[p] = index of suffix p in sa
//   height - out: height[i] = common prefix length of suffixes sa[i-1] and sa[i]
//   n      - number of symbols, excluding the terminating 0
//   m      - exclusive upper bound of the symbol values
void DA(int str[], int sa[], int Rank[], int height[], int n, int m)
{
    n++;   // include the terminating 0 as the smallest suffix
    int i, j, p, *x = t1, *y = t2;

    // Counting sort on the first symbol.
    for (i = 0; i < m; i++) c[i] = 0;
    for (i = 0; i < n; i++) c[x[i] = str[i]]++;
    for (i = 1; i < m; i++) c[i] += c[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--c[x[i]]] = i;

    for (j = 1; j <= n; j <<= 1)
    {
        // y = suffixes ordered by their second key (the rank at offset j).
        p = 0;
        for (i = n - j; i < n; i++) y[p++] = i;   // no second key: sort first
        for (i = 0; i < n; i++) if (sa[i] >= j) y[p++] = sa[i] - j;

        // Stable counting sort by the first key.
        for (i = 0; i < m; i++) c[i] = 0;
        for (i = 0; i < n; i++) c[x[y[i]]]++;
        for (i = 1; i < m; i++) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];

        // Re-rank: equal key pairs share a rank.
        swap(x, y);
        p = 1; x[sa[0]] = 0;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;

        if (p >= n) break;   // all ranks distinct: sorting is complete
        m = p;
    }

    // height[]: walk suffixes in text order, reusing the previous match
    // length minus one as the starting point.
    int k = 0;
    n--;
    for (i = 0; i <= n; i++) Rank[sa[i]] = i;
    for (i = 0; i < n; i++)
    {
        if (k) k--;
        j = sa[Rank[i] - 1];
        while (str[i + k] == str[j + k]) k++;
        height[Rank[i]] = k;
    }
}

bool vis[110];   // vis[s] = string s already counted in the current group

// Returns true if some run of adjacent suffixes whose pairwise height is >= k
// contains suffixes from more than n/2 distinct strings.
//   n   - number of input strings
//   len - length of the concatenated text (separators included)
//   k   - candidate substring length
bool test(int n, int len, int k)
{
    int cnt = 0;
    memset(vis, false, sizeof(vis));
    for (int i = 2; i <= len; i++)
    {
        if (height[i] < k)
        {
            // Group boundary. Only clear the marks if anything was set.
            if (cnt)
            {
                cnt = 0;
                memset(vis, false, sizeof(vis));
            }
        }
        else
        {
            if (!vis[id[sa[i - 1]]]) vis[id[sa[i - 1]]] = true, cnt++;
            if (!vis[id[sa[i]]]) vis[id[sa[i]]] = true, cnt++;
            if (cnt > n / 2) return true;
        }
    }
    return false;
}

// Prints, one per line, the length-k substring of every group that covers
// more than n/2 distinct strings. Groups are visited in suffix-array order,
// so the output is alphabetical.
void Print(int n, int len, int k)
{
    int cnt = 0;
    bool printed = false;   // the current group has already been output
    memset(vis, false, sizeof(vis));
    for (int i = 2; i <= len; i++)
    {
        if (height[i] < k)
        {
            printed = false;
            if (cnt)
            {
                cnt = 0;
                memset(vis, false, sizeof(vis));
            }
        }
        else
        {
            if (!vis[id[sa[i - 1]]]) vis[id[sa[i - 1]]] = true, cnt++;
            if (!vis[id[sa[i]]]) vis[id[sa[i]]] = true, cnt++;
            if (cnt > n / 2 && !printed)
            {
                printed = true;
                for (int j = sa[i]; j < sa[i] + k; j++)
                    putchar(r[j] + 'a' - 1);
                putchar('\n');
            }
        }
    }
}

char str[MAXN];

int main()
{
    int n;
    bool printedCase = false;   // a previous test case has produced output
    // Stop on the terminating 0 and also on EOF or malformed input: scanf
    // returns EOF (-1) there, which a plain truthiness check would accept.
    while (scanf("%d", &n) == 1 && n)
    {
        int len = 0, maxLen = 0;
        for (int i = 0; i < n; i++)
        {
            if (scanf("%s", str) != 1) return 0;
            int LEN = strlen(str);
            maxLen = max(maxLen, LEN);
            for (int j = 0; j < LEN; j++)
            {
                r[len] = str[j] - 'a' + 1;   // letters map to 1..26
                id[len++] = i;
            }
            r[len] = 30 + i;   // separators must all be different
            id[len++] = i;
        }
        r[len] = 0;

        if (printedCase) printf("\n");
        printedCase = true;

        // With one life form, "more than half" is that single string, so the
        // whole string is the answer. The group scan cannot find it because
        // it only counts pairs of adjacent suffixes.
        if (n == 1)
        {
            puts(str);
            continue;
        }

        DA(r, sa, Rank, height, len, 200);

        // Binary search the largest feasible length. No common substring can
        // be longer than the longest input string.
        int lo = 1, hi = maxLen, best = 0;
        while (lo <= hi)
        {
            int mid = (lo + hi) >> 1;
            if (test(n, len, mid))
            {
                best = mid;
                lo = mid + 1;
            }
            else
            {
                hi = mid - 1;
            }
        }

        if (best == 0) puts("?");
        else Print(n, len, best);
    }
    return 0;
}