常用字符串匹配算法(brute force, kmp, sunday)
1. 暴力解法
// 暴力求解 int Idx(string S, string T){ // 返回第一个匹配元素的位置,若没有匹配的子串,则返回-1 int S_size = S.length(); int T_size = T.length(); if(S_size == T_size && S_size == 0) return 0; if(S_size < T_size) return -1; int head = 0; int i = head; int j = 0; while(i < S_size && j < T_size){ if(S[i] == T[j]){ ++i; ++j; if(j == T_size && i <= S_size) return head; } else{ ++head; i = head;// i回溯, 在kmp算法中,i不会出现回溯,即i值不会减小 j = 0; } } return -1; }
2. KMP (包括返回第一个匹配字符串的位置和返回所有匹配字符串的位置)
void PartialMatchTable(string s, int next[]){ int len = s.length(); next[0] = -1; int i = 0; int j = -1; while(i < len){ if(j == -1 || s[i] == s[j]){ ++i; ++j; next[i] = j; } else j = next[j]; } } int kmp(string s, string p){ int s_size = s.length(); int p_size = p.length(); int next[p_size]; PartialMatchTable(p, next); int i = 0; int j = 0; while(i < s_size && j < p_size){ if(j == -1 || s[i] == p[j]){ i++; j++; } else{ j = next[j]; } } if(j == p_size) return i-j; else return -1; } // kmp_vec(string s, string p)找出所有匹配位置 vector<int> kmp_vec(string s, string p){ int s_size = s.length(); int p_size = p.length(); vector<int> pos; int next[p_size]; PartialMatchTable(p, next); int i = 0; int j = 0; while(i < s_size && j < p_size){ if(j == -1 || s[i] == p[j]){ i++; j++; if(j == p_size){ pos.push_back(i-j); j = 0; } } else{ j = next[j]; } } if(pos.size() == 0) pos.push_back(-1); return pos; }
3. Sunday
int SundaySearch(string t, string p){ int t_size = t.size(); int p_size = p.size(); if(p_size <= 0 || t_size <= 0) return -1; int i = 0, j = 0; int k; int m = p_size; while(i < t_size){ if(t[i] != p[j]) {// 不相等 for(k = p_size-1; k>=0; --k) { if(p[k] == t[m]) break; } // i = i + p_size - k; i = m - k; j = 0; m = i + p_size; } else { // 相等,比较下一个字符 i++; j++; if(j == p_size) return i-j; } } return -1; }
4. 完整代码
/* * @Author: z.c.wang * @Email: iwangzhengchao@gmail.com * @Last Modified time: 2019-01-23 14:39:58 */ #include<iostream> #include<string> using namespace std; /** * 方法1. brute force * 方法2. KMP (kmp_next, kmp_dfa) * 方法3. Sunday */ /** * brute_force description: * 暴力求解,在字符串s中匹配字符串p * @param t [text, 文本串] * @param p [pattern, 模式串] * @return [若s含有p, 则返回第一个匹配的位置,否则,返回-1] */ int brute_force(string t, string p){ int t_size = t.length(); int p_size = p.length(); if(t_size == p_size && t_size == 0) return 0; if(t_size < p_size) return -1; int head = 0; int i = head; int j = 0; while(i < t_size && j < p_size){ if(t[i] == p[j]){ i++; j++; if(j == p_size && i <= t_size) return head; } else{ head++; i = head; j = 0; } } return -1; } // 暴力求解的另一种写法 int brute_force2(string t, string p){ int t_size = t.length(); int p_size = p.length(); if(t_size == p_size && t_size == 0) return 0; if(t_size < p_size) return -1; int i, j; for(i = 0, j = 0; i < t_size && j < p_size; i++){ if(t[i] == p[j]){ j++; } else{ i -= j; j = 0; } } if(j == p_size) // 找到匹配 return i - j; else // 为找到匹配 return -1; } /** * ParticalMatchTable description: * 对字符串p生成next数组 * @param p [pattern string] * @param next [next数组] */ void ParticalMatchTable(string p, int next[]){ int i = 0; int j = -1; next[0] = -1; while(i < p.length()){ if(j == -1 || p[i] == p[j]){ i++; j++; next[i] = j; } else{ j = next[j]; } } } /** * kmp algorithm based on next * kmp_next algorithm * @param t [text string] * @param p [pattern string] * @return [若s含有p, 则返回第一个匹配的位置,否则,返回-1] */ int kmp_next(string t, string p){ int t_size = t.length(); int p_size = p.length(); int next[p_size]; ParticalMatchTable(p, next); int i = 0; int j = 0; while(i < t_size && j < p_size){ if(j == -1 || t[i] == p[j]){ i++; j++; } else{ j = next[j]; } } if(j == p_size) return i-j; else return -1; } /*kmp algorithm based on dfa */ int kmp_dfa(string t, string p){ int row = 256; int col = p.length(); // 动态分配数组并初始化 int** dfa = new int*[row]; for(int i = 0; i < row; i++) dfa[i] = new int[col]; for(int i = 0 ; i < row ; i++) for(int j = 0; j < col; j++) dfa[i][j] = 0; // 计算dfa dfa[p[0]][0] = 1; for (int j = 1, x = 0; j < col; ++j) { for (int i = 0; i < row; ++i) dfa[i][j] = dfa[i][x]; dfa[p[j]][j] = j + 1; x = dfa[p[j]][x]; } // kmp algo int i, j; int t_size = t.length(); int p_size = p.length(); for (i = 0, j = 0; i < t_size && j < p_size; i++){ j = dfa[t[i]][j]; } if(j == p_size) return i-j; else return -1; } /** * [Sunday description] * @param t [description] * @param p [description] * @return [description] */ int Sunday(string t, string p){ int t_size = t.length(); int p_size = p.length(); if(p_size == t_size && t_size == 0) return 0; if(p_size < 0 || t_size < 0) return -1; int i = 0; int j = 0; int k; int m = p_size; while(i < t_size){ if(t[i] != p[j]){ for(k = p_size-1; k >= 0; --k){ if(p[k] == t[m]) break; } i = m - k; j = 0; m = i + p_size; } else{ i++; j++; if(j == p_size) return i-j; } } return -1; } /** * [main description] * @param argc [description] * @param argv [description] * @return [description] */ int main(int argc, char const *argv[]) { string t = "bbc abcdab abcdabcdabde"; string p = "abcdabd"; // brute force cout<<"brute_force : "<<brute_force(t, p)<<endl; // kmp_next cout<<"kmp_next : "<<kmp_next(t, p)<<endl; // kmp_dfa cout<<"kmp_dfa : "<<kmp_dfa(t, p)<<endl; // Sunday cout<<"Sunday : "<<Sunday(t, p)<<endl; cout<<endl; return 0; }
5. 运行结果
brute_force : 15 kmp_next : 15 kmp_dfa : 15 Sunday : 15
6. 资料
D.M. Sunday: A Very Fast Substring Search Algorithm. Communications of the ACM
July. 从头到尾彻底理解KMP(2014年8月22日版)