[itint5]字符串匹配

http://www.itint5.com/oj/#15

用hash来做，目前为止做到最好也是case16超时（20w的规模），即使分桶也超时。注意计算hashcode时，'a'要算成1而不是0：如果'a'为0，那么"a"和"aa"的hashcode都是0，无法区分。下面是超时的代码：

// Number of buckets the substring codes are distributed over.
#define BUCKET 65535
// NOTE: despite its name, this expands to the signed type `long long`.
#define ulong long long

// uset[b] stores every substring code c with c % BUCKET == b.
vector<unordered_set<ulong> > uset(BUCKET);
// 11-slot table for the powers 26^0 .. 26^10 used by the rolling hash.
vector<ulong> pow26(11);

// Encodes the first n characters of str as a base-26 number in which
// 'a' counts as 1 (so 'z' is 26). Because no letter maps to 0, strings such
// as "a" and "aa" receive different codes. Returns 0 when n <= 0.
ulong hashcode(char *str, int n) {
    ulong acc = 0;
    int pos = 0;
    while (pos < n) {
        // Digit value of the current character: 'a' -> 1, 'b' -> 2, ...
        const int digit = str[pos] - 'a' + 1;
        acc = acc * 26 + digit;
        ++pos;
    }
    return acc;
}

// 预处理初始化
void initWithString(char *str) {
    int len = 0;
    while (str[len] != '\0') {
        len++;
    }
    ulong num = 1;
    pow26[0] = 1;
    for (int i = 1; i <= 10; i++) {
        num *= 26;
        pow26[i] = num;
    }
    for (int l = 1; l <= 10; l++) {
        vector<ulong> codes(len);
        for (int i = 0; i < len; i++) {
            if (i + l <= len) {
                ulong code = 0l;
                if (i == 0) {
                    code = hashcode(str+i, l);
                    codes[i] = code;
                } else {
                    ulong diff = pow26[l-1];
                    diff *= (str[i-1] - 'a' + 1);
                    code = (codes[i-1] - diff) * 26 + str[i+l-1] - 'a' + 1;
                    codes[i] = code;
                }
				
                int buck = code % BUCKET;
                uset[buck].insert(code);
            }
        }
    }
}
// 如果query是str的字串,返回true,否则返回false
bool existSubString(char *query) {
    int len = strlen(query);
    ulong code = hashcode(query, len);
    int buck = code % BUCKET;
    if (uset[buck].find(code) != uset[buck].end()) {
        return true;
    } else {
        return false;
    }
}

如果只把每个起始位置开始的长度为10的子串（末尾不足10个字符时取到字符串结尾）存到排序好的vector里，然后用二分按前缀查找，是能过的。注意有的源字符串长度本身就小于10。其他的备选方法还有trie以及后缀数组。

// Sorted, duplicate-free list of the indexed substrings (see initWithString).
vector<string> vec;

// Preprocessing: for every start position in str, takes the substring of up
// to 10 characters beginning there (shorter near the end of the text) and
// stores the distinct results in vec in ascending order.
//
// The previous content of vec is replaced, so the function can be called
// again with a different text without leaving stale or unsorted entries.
void initWithString(char *str) {
    const int len = strlen(str);

    // A set both removes duplicates and keeps the entries ordered.
    set<string> windows;
    for (int i = 0; i < len; i++) {
        const int remaining = len - i;
        const int take = remaining < 10 ? remaining : 10;
        windows.insert(string(str + i, str + i + take));
    }

    // assign() overwrites rather than appends, so vec stays sorted.
    vec.assign(windows.begin(), windows.end());
}
// Returns true if query is a substring of the text indexed by
// initWithString, false otherwise.
//
// Binary-searches the sorted entries of vec for one that starts with query.
// Entries hold at most 10 characters, so longer queries are never found.
bool existSubString(char *query) {
    const string needle(query);
    int low = 0;
    int high = static_cast<int>(vec.size()) - 1;

    while (low <= high) {
        const int mid = low + (high - low) / 2;
        // Compare the leading needle.size() characters of the entry (or the
        // whole entry if it is shorter) with the needle. string::compare uses
        // the same character ordering that sorted vec, unlike a plain `char`
        // comparison, which may be signed.
        const int order = vec[mid].compare(0, needle.size(), needle);
        if (order == 0) {
            return true;            // entry starts with the query
        }
        if (order < 0) {
            low = mid + 1;          // entry sorts before every possible match
        } else {
            high = mid - 1;         // entry sorts after every possible match
        }
    }
    return false;
}

posted @ 2014-01-20 18:35 阿牧遥阅读(305) 评论(0) 收藏举报

刷新页面返回顶部

阿牧遥

[itint5]字符串匹配

公告