[itint5]字符串匹配
用 hash 来做:把源字符串中所有长度为 1~10 的子串的 hashcode 预先存入哈希表,查询时计算 query 的 hashcode 再查表。目前为止做到最好也是 case16 超时(约 20 万的规模),即使分桶也超时。注意计算 hashcode 时,'a' 要算成 1 而不是 0;否则 "a" 和 "aa" 的 hashcode 都是 0,二者无法区分。下面是超时的代码:
// Number of buckets the substring codes are sharded into.
#define BUCKET 65535
// NOTE: despite the name, this alias is a *signed* 64-bit integer. The name is
// kept so existing uses of `ulong` in this file keep compiling.
#define ulong long long

// uset[b] holds every indexed substring code that maps to bucket b.
std::vector<std::unordered_set<ulong> > uset(BUCKET);
// pow26[k] == 26^k for k in [0, 10]; filled in by initWithString().
std::vector<ulong> pow26(11);

// Maps a code to a bucket index in [0, BUCKET). A plain `code % BUCKET` is
// negative for negative codes and would index outside `uset`.
static int bucketIndex(ulong code) {
    return static_cast<int>(((code % BUCKET) + BUCKET) % BUCKET);
}

// Encodes the first n characters of str as a base-26 number in which
// 'a' -> 1, ..., 'z' -> 26. Mapping 'a' to 1 (not 0) keeps "a" and "aa"
// distinct. For lowercase input with n <= 10 the result fits in 64 bits and
// is unique per string; other characters may collide.
ulong hashcode(char *str, int n) {
    ulong code = 0;
    for (int i = 0; i < n; i++) {
        code = code * 26 + (str[i] - 'a' + 1);
    }
    return code;
}

// Preprocessing: indexes the code of every substring of str whose length is
// between 1 and 10. Any index built by a previous call is discarded first.
void initWithString(char *str) {
    const int len = static_cast<int>(strlen(str));

    // Drop codes left over from an earlier source string; otherwise they
    // would keep matching queries against the new one.
    for (auto &bucket : uset) {
        bucket.clear();
    }

    pow26[0] = 1;
    for (int i = 1; i <= 10; i++) {
        pow26[i] = pow26[i - 1] * 26;
    }

    for (int l = 1; l <= 10 && l <= len; l++) {
        // Code of the window starting at index 0 ...
        ulong code = hashcode(str, l);
        uset[bucketIndex(code)].insert(code);

        // ... then slide the window: remove the contribution of the
        // character that leaves on the left, shift, append the new one.
        for (int i = 1; i + l <= len; i++) {
            const ulong leading = pow26[l - 1] * (str[i - 1] - 'a' + 1);
            code = (code - leading) * 26 + (str[i + l - 1] - 'a' + 1);
            uset[bucketIndex(code)].insert(code);
        }
    }
}

// Returns true if query is a substring of the string passed to
// initWithString(), false otherwise. Only queries of at most 10 characters
// are indexed, so longer queries always yield false. The empty query is
// treated as a substring of every string.
bool existSubString(char *query) {
    const int len = static_cast<int>(strlen(query));
    if (len == 0) {
        return true;
    }
    if (len > 10) {
        // Not indexed; hashing it would also overflow the signed code.
        return false;
    }
    const ulong code = hashcode(query, len);
    const std::unordered_set<ulong> &bucket = uset[bucketIndex(code)];
    return bucket.find(code) != bucket.end();
}
如果只把从每个位置开始、长度为10的子串(末尾不足10个字符时取整个后缀)去重后存到排序好的vector里,然后用二分按前缀比较来查找,是能过的:任何长度不超过10的子串都是其中某一项的前缀。注意源字符串本身的长度也可能小于10。其他的备选方法还有trie以及后缀数组。
// Sorted, duplicate-free list of the windows of the source string: for every
// start position, the next 10 characters, or the whole remaining suffix when
// fewer than 10 are left. Rebuilt by initWithString().
std::vector<std::string> vec;

// Preprocessing: rebuilds `vec` from str. Any contents from a previous call
// are replaced, so `vec` stays sorted and free of stale entries.
void initWithString(char *str) {
    std::set<std::string> windows;  // orders and de-duplicates the windows
    const int len = static_cast<int>(strlen(str));
    for (int i = 0; i < len; i++) {
        // Clamp the window to the end of the string for the last positions.
        const int end = (i + 10 < len) ? i + 10 : len;
        windows.insert(std::string(str + i, str + end));
    }
    // assign() replaces the old contents instead of appending to them.
    vec.assign(windows.begin(), windows.end());
}

// Returns true if query is a substring of the string passed to
// initWithString(), false otherwise. A query matches when it is a prefix of
// some stored window, so queries longer than 10 characters are never found.
// The empty query is treated as a substring of every string.
bool existSubString(char *query) {
    const std::string needle(query);
    if (needle.empty()) {
        return true;
    }

    int low = 0;
    int high = static_cast<int>(vec.size()) - 1;
    while (low <= high) {
        const int mid = low + (high - low) / 2;
        // Compare the window's first needle.size() characters with the
        // needle. compare() uses the same character ordering that sorted
        // `vec`, and reports a window that is a shorter prefix of the needle
        // as "less", which sends the search to the right.
        const int order = vec[mid].compare(0, needle.size(), needle);
        if (order == 0) {
            return true;
        }
        if (order < 0) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }
    return false;
}