25、字符串匹配 Rabin-Karp 算法

内容来自刘宇波老师算法与数据结构体系课

1、暴力搜索

image

/**
 * Naive substring search.
 */
public class Bruteforce {

    private Bruteforce() {
    }

    /**
     * Finds the first occurrence of {@code t} inside {@code s} by trying every
     * alignment in turn. Worst-case running time is O(|s| * |t|).
     *
     * @param s the text to search in
     * @param t the pattern to search for
     * @return the index in {@code s} where the first occurrence of {@code t} starts,
     *         or -1 if there is none; an empty pattern matches at index 0
     */
    public static int bruteforce(String s, String t) {
        final int textLen = s.length();
        final int patternLen = t.length();
        if (patternLen > textLen) {
            return -1;
        }

        // The pattern can only start where the whole of it still fits into the text.
        final int lastStart = textLen - patternLen;
        for (int start = 0; start <= lastStart; start++) {
            if (matchesAt(s, start, t)) {
                return start;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code pattern} occurs in {@code text} starting at {@code start}.
     */
    private static boolean matchesAt(String text, int start, String pattern) {
        for (int k = 0; k < pattern.length(); k++) {
            if (text.charAt(start + k) != pattern.charAt(k)) {
                return false;
            }
        }
        return true;
    }
}

2、改进思路

image
image

3、字符串转哈希思想

3.1、段式回文

1147 - 段式回文

解决

/**
 * Longest chunked palindrome decomposition, solved greedily with direct
 * character comparison of the candidate chunks.
 */
public class LongestDecomposition {

    /**
     * Splits {@code text} into the largest possible number of chunks such that
     * the i-th chunk from the front equals the i-th chunk from the back.
     *
     * @param text the string to decompose
     * @return the maximum number of chunks; 0 for an empty string
     */
    public int longestDecomposition(String text) {
        int lo = 0;
        int hi = text.length() - 1;
        int chunks = 0;

        // Peel the shortest matching prefix/suffix pair off both ends until
        // nothing is left or the remaining middle cannot be split any further.
        while (lo <= hi) {
            int len = shortestMatchingChunk(text, lo, hi);
            if (len == 0) {
                // The rest of the window has to be taken as one middle chunk.
                return chunks + 1;
            }
            chunks += 2;
            lo += len;
            hi -= len;
        }
        return chunks;
    }

    /**
     * Returns the smallest length {@code len >= 1} for which the first
     * {@code len} characters of {@code s[lo ... hi]} equal its last {@code len}
     * characters without the two ranges overlapping, or 0 if no such length exists.
     */
    private int shortestMatchingChunk(String s, int lo, int hi) {
        int maxLen = (hi - lo + 1) / 2;
        for (int len = 1; len <= maxLen; len++) {
            if (s.regionMatches(lo, s, hi - len + 1, len)) {
                return len;
            }
        }
        return 0;
    }
}

优化

利用 Hash 来加速字符串比较

image

/**
 * Longest chunked palindrome decomposition, using polynomial hashes of the
 * growing prefix and suffix to skip most character-by-character comparisons.
 */
public class LongestDecomposition {

    private static final long MOD = 1_000_000_007L;
    private static final int BASE = 26;

    /**
     * Splits {@code text} into the largest possible number of chunks such that
     * the i-th chunk from the front equals the i-th chunk from the back.
     *
     * @param text the string to decompose
     * @return the maximum number of chunks; 0 for an empty string
     */
    public int longestDecomposition(String text) {
        // Nothing to decompose; also avoids indexing into a zero-length power table.
        if (text.isEmpty()) {
            return 0;
        }

        // pow26[i] = (26 ^ i) % MOD
        long[] pow26 = new long[text.length()];
        pow26[0] = 1;
        for (int i = 1; i < pow26.length; i++) {
            pow26[i] = (pow26[i - 1] * BASE) % MOD;
        }

        return solve(text, 0, text.length() - 1, pow26);
    }

    /**
     * Solves the problem for the window {@code s[left ... right]}.
     */
    private int solve(String s, int left, int right, long[] pow26) {
        if (left > right) {
            return 0;
        }

        long prevHash = 0;
        long postHash = 0;
        for (int l = left, r = right; l < r; l++, r--) {
            // prevHash = hash(s[left ... l]), postHash = hash(s[r ... right]).
            // Math.floorMod keeps both values in [0, MOD): with the plain % operator a
            // character below 'a' can leave one hash negative and the other positive
            // for identical chunks, which would hide a real match.
            prevHash = Math.floorMod(prevHash * BASE + (s.charAt(l) - 'a'), MOD);
            postHash = Math.floorMod((s.charAt(r) - 'a') * pow26[right - r] + postHash, MOD);

            // Equal hashes may still be a collision, so confirm with a real comparison.
            if (prevHash == postHash && equal(s, left, l, r, right)) {
                return 2 + solve(s, l + 1, r - 1, pow26);
            }
        }

        // No matching prefix/suffix pair: the whole window is a single chunk.
        return 1;
    }

    /**
     * Tells whether {@code s[l1 ... r1]} equals {@code s[l2 ... r2]}.
     */
    private boolean equal(String s, int l1, int r1, int l2, int r2) {
        for (; l1 <= r1 && l2 <= r2; l1++, l2++) {
            if (s.charAt(l1) != s.charAt(l2)) {
                return false;
            }
        }
        return true;
    }
}

3.2、最长快乐前缀

1392 - 最长快乐前缀

解决

/**
 * Longest happy prefix, found by comparing every candidate prefix with the
 * suffix of the same length, longest first.
 */
public class LongestPrefix {

    /**
     * Returns the longest proper prefix of {@code s} that is also a suffix of {@code s}.
     *
     * @param s the input string
     * @return the longest such prefix, or the empty string if there is none
     */
    public String longestPrefix(String s) {
        final int n = s.length();
        int len = n - 1;
        while (len >= 1) {
            // Compare s[0 ... len - 1] with s[n - len ... n - 1].
            if (s.regionMatches(0, s, n - len, len)) {
                return s.substring(0, len);
            }
            len--;
        }
        return "";
    }
}

优化

利用 Hash 来加速字符串比较

image

(a + b) % M == ((a % M) + (b % M)) % M
(a * b) % M == ((a % M) * (b % M)) % M
(a / b) % M != ((a % M) / (b % M)) % M
/**
 * Longest happy prefix, using precomputed polynomial hashes of every prefix
 * and every suffix to skip most character-by-character comparisons.
 */
public class LongestPrefix {

    private static final long MOD = 1_000_000_007L;
    private static final int BASE = 26;

    /**
     * Returns the longest proper prefix of {@code s} that is also a suffix of {@code s}.
     *
     * @param s the input string
     * @return the longest such prefix, or the empty string if there is none
     */
    public String longestPrefix(String s) {
        final int n = s.length();
        // Strings shorter than two characters have no non-empty proper prefix;
        // returning early also keeps the tables below from being indexed when empty.
        if (n < 2) {
            return "";
        }

        // pow26[i] = (26 ^ i) % MOD
        long[] pow26 = new long[n];
        pow26[0] = 1;
        for (int i = 1; i < n; i++) {
            pow26[i] = (pow26[i - 1] * BASE) % MOD;
        }

        // Math.floorMod keeps every hash in [0, MOD). With the plain % operator a
        // character below 'a' can leave a prefix hash and a suffix hash of identical
        // text with opposite signs, which would hide a real match.

        // prevHash[i] = hash(s[0 ... i])
        long[] prevHash = new long[n];
        prevHash[0] = Math.floorMod((long) (s.charAt(0) - 'a'), MOD);
        for (int i = 1; i < n; i++) {
            prevHash[i] = Math.floorMod(prevHash[i - 1] * BASE + (s.charAt(i) - 'a'), MOD);
        }

        // postHash[i] = hash(s[i ... n - 1])
        long[] postHash = new long[n];
        postHash[n - 1] = Math.floorMod((long) (s.charAt(n - 1) - 'a'), MOD);
        for (int i = n - 2; i >= 0; i--) {
            postHash[i] = Math.floorMod((s.charAt(i) - 'a') * pow26[n - 1 - i] + postHash[i + 1], MOD);
        }

        for (int len = n - 1; len >= 1; len--) {
            // s[0 ... len - 1] == s[n - len ... n - 1] ?
            // Equal hashes may still be a collision, so confirm with a real comparison.
            if (prevHash[len - 1] == postHash[n - len] && equal(s, 0, len - 1, n - len, n - 1)) {
                return s.substring(0, len);
            }
        }

        return "";
    }

    /**
     * Tells whether {@code s[l1 ... r1]} equals {@code s[l2 ... r2]}.
     */
    private boolean equal(String s, int l1, int r1, int l2, int r2) {
        for (; l1 <= r1 && l2 <= r2; l1++, l2++) {
            if (s.charAt(l1) != s.charAt(l2)) {
                return false;
            }
        }
        return true;
    }
}

3.3、重复的 DNA 序列

187 - 重复的 DNA 序列

解决

/**
 * Repeated DNA sequences, found by remembering every 10-character substring.
 */
public class FindRepeatedDnaSequences {

    private static final int WINDOW = 10;

    /**
     * Collects every 10-character substring of {@code s} that occurs more than once.
     *
     * @param s the sequence to scan
     * @return each repeated 10-character substring exactly once
     */
    public List<String> findRepeatedDnaSequences(String s) {
        HashSet<String> firstSeen = new HashSet<>();
        HashSet<String> repeated = new HashSet<>();

        // A window starting at 'start' covers s[start ... start + 9].
        final int lastStart = s.length() - WINDOW;
        for (int start = 0; start <= lastStart; start++) {
            String window = s.substring(start, start + WINDOW);
            // add() returns false when the window was already recorded.
            if (!firstSeen.add(window)) {
                repeated.add(window);
            }
        }

        return new ArrayList<>(repeated);
    }
}

优化

利用滚动 Hash 来加速字符串比较

image
image

/**
 * Repeated DNA sequences, found with a rolling hash over 10-character windows.
 */
public class FindRepeatedDnaSequences {

    /**
     * Collects every 10-character substring of {@code s} that occurs more than once.
     *
     * <p>Each window is encoded as a 10-digit decimal number with A=1, C=2, G=3, T=4,
     * and windows are compared by that number alone. Any other character below 256
     * is encoded as 0, so the result is only meaningful for A/C/G/T input.
     *
     * @param s the sequence to scan
     * @return each repeated 10-character substring exactly once
     */
    public List<String> findRepeatedDnaSequences(String s) {
        if (s.length() <= 10) {
            return new ArrayList<>();
        }

        int[] digitOf = new int[256];
        digitOf['A'] = 1;
        digitOf['C'] = 2;
        digitOf['G'] = 3;
        digitOf['T'] = 4;

        HashSet<Long> seenCodes = new HashSet<>();
        HashSet<String> repeated = new HashSet<>();

        final long ten9 = (long) 1e9;
        long window = 0;
        for (int end = 0; end < s.length(); end++) {
            // Append the incoming character as the lowest decimal digit.
            window = window * 10 + digitOf[s.charAt(end)];
            if (end < 9) {
                // Still filling the first window.
                continue;
            }

            // window now encodes s[end - 9 ... end].
            if (!seenCodes.add(window)) {
                repeated.add(s.substring(end - 9, end + 1));
            }

            // Drop the leading digit so that the next append yields 10 digits again.
            window %= ten9;
        }

        return new ArrayList<>(repeated);
    }
}

4、Rabin-Karp

image

/**
 * Rabin-Karp substring search based on a rolling hash. A hash hit is always
 * confirmed by a real comparison, so the running time is linear in the text
 * length as long as hash collisions are rare.
 */
public class RabinKarp {

    private static final int BASE = 256;
    private static final long MOD = 1_000_000_007L;

    private RabinKarp() {
    }

    /**
     * Finds the first occurrence of {@code t} inside {@code s}.
     *
     * @param s the text to search in
     * @param t the pattern to search for
     * @return the index in {@code s} where the first occurrence of {@code t} starts,
     *         or -1 if there is none; an empty pattern matches at index 0
     */
    public static int rabinKarp(String s, String t) {
        final int n = s.length();
        final int m = t.length();
        if (m == 0) {
            return 0;
        }
        if (n < m) {
            return -1;
        }

        // Weight of the leading character of a window: BASE ^ (m - 1) mod MOD.
        long leadWeight = 1;
        for (int k = 1; k < m; k++) {
            leadWeight = leadWeight * BASE % MOD;
        }

        // Hash the pattern and the first window of the text side by side.
        long targetHash = 0;
        long windowHash = 0;
        for (int k = 0; k < m; k++) {
            targetHash = (targetHash * BASE + t.charAt(k)) % MOD;
            windowHash = (windowHash * BASE + s.charAt(k)) % MOD;
        }

        for (int start = 0; ; start++) {
            // windowHash == hash(s[start ... start + m - 1])
            if (windowHash == targetHash && s.regionMatches(start, t, 0, m)) {
                return start;
            }

            int incoming = start + m;
            if (incoming >= n) {
                return -1;
            }

            // Remove the outgoing character; adding MOD keeps the difference non-negative.
            windowHash = (windowHash - s.charAt(start) * leadWeight % MOD + MOD) % MOD;
            // Shift the window one place and append the incoming character.
            windowHash = (windowHash * BASE + s.charAt(incoming)) % MOD;
        }
    }
}

5、复杂度分析

image

posted @ 2023-04-17 13:11  lidongdongdong~  阅读(23)  评论(0)  编辑  收藏  举报