LeetCode 1967. 作为子字符串出现在单词中的字符串数目

给你一个字符串数组 patterns 和一个字符串 word ，统计 patterns 中有多少个字符串是 word 的子字符串。返回字符串数目。

子字符串是字符串中的一个连续字符序列。

示例 1：

输入：patterns = [“a”,“abc”,“bc”,“d”], word = “abc”
输出：3
解释：

“a” 是 “abc” 的子字符串。
“abc” 是 “abc” 的子字符串。
“bc” 是 “abc” 的子字符串。
“d” 不是 “abc” 的子字符串。
patterns 中有 3 个字符串作为子字符串出现在 word 中。

1 <= patterns.length <= 100
1 <= patterns[i].length <= 100
1 <= word.length <= 100
patterns[i] 和 word 由小写英文字母组成

解法一：直接用库函数：

class Solution {
public:
    // Returns how many entries of `patterns` occur as a contiguous
    // substring of `word`. Duplicate patterns are counted once each.
    int numOfStrings(vector<string>& patterns, string word) {
        int matched = 0;
        for (const string &pattern : patterns) {
            // string::find yields npos when the pattern is absent.
            const bool found = word.find(pattern) != string::npos;
            matched += found ? 1 : 0;
        }
        return matched;
    }
};

如果输入数组patterns的长度为n，其中元素的长度为m，word的长度为l，此算法时间复杂度为O（nml），空间复杂度为O（1）。

解法二：暴力匹配，遍历patterns中的每个字符串s，看word中是否有字符串s：

class Solution {
public:
    // Returns how many entries of `patterns` occur as a contiguous
    // substring of `word`, using a naive character-by-character scan.
    int numOfStrings(vector<string>& patterns, string word) {
        const int textLen = word.size();
        int matched = 0;
        for (const string &pattern : patterns) {
            const int patLen = pattern.size();
            // Last start offset at which the pattern still fits; negative
            // when the pattern is longer than the word, so the scan is skipped.
            const int lastStart = textLen - patLen;
            bool found = false;
            for (int start = 0; start <= lastStart && !found; ++start) {
                int k = 0;
                while (k < patLen && word[start + k] == pattern[k]) {
                    ++k;
                }
                // All patLen characters matched at this offset.
                found = (k == patLen);
            }
            if (found) {
                ++matched;
            }
        }
        return matched;
    }
};

如果输入数组patterns的长度为n，其中元素的长度为m，word的长度为l，此算法时间复杂度为O（nml），空间复杂度为O（1）。

解法三：BM算法，具体规则可查看该问题的题解：
https://leetcode.cn/problems/find-the-index-of-the-first-occurrence-in-a-string/

class Solution {
public:
    // Returns how many entries of `patterns` occur as a contiguous
    // substring of `word`, searching with the Boyer-Moore algorithm.
    int numOfStrings(vector<string>& patterns, string word) {
        int ans = 0;
        for (const string &s : patterns) {
            if (bm(s, word) >= 0) {
                ++ans;
            }
        }

        return ans;
    }

private:
    // Boyer-Moore search. Returns the index of the first occurrence of
    // `needle` in `haystack`, or -1 if there is none. An empty needle
    // matches at index 0.
    int bm(const string &needle, const string &haystack) {
        const int needleLength = needle.size();
        const int haystackLength = haystack.size();

        // Bad-character table: last index at which each byte value occurs in
        // needle, or -1 if it never occurs.
        vector<int> lastAppear(256, -1);
        for (int i = 0; i < needleLength; ++i) {
            lastAppear[static_cast<unsigned char>(needle[i])] = i;
        }

        // Good-suffix tables, indexed by suffix length:
        //   lastSuffixAppear[len] - start index of the rightmost occurrence of
        //       the length-`len` suffix that is not the suffix itself, or -1.
        //   suffixIsPrefix[len]   - non-zero when the length-`len` suffix is
        //       also a prefix of needle.
        vector<int> lastSuffixAppear(needleLength + 1, -1);
        vector<char> suffixIsPrefix(needleLength + 1, 0);
        for (int i = 0; i < needleLength - 1; ++i) {
            // Extend a match leftwards between needle[..i] and needle's tail.
            int j = i;
            int len = 0;
            while (j >= 0 && needle[j] == needle[needleLength - 1 - len]) {
                ++len;
                // Larger i overwrites smaller i, so the rightmost start wins.
                lastSuffixAppear[len] = j;
                --j;
            }
            // Ran off the front: needle[0..i] equals the suffix of length i + 1.
            if (j < 0) {
                suffixIsPrefix[len] = 1;
            }
        }

        int i = 0;
        while (i <= haystackLength - needleLength) {
            // Compare right-to-left; j ends at the mismatch position.
            int j = needleLength - 1;
            while (j >= 0 && needle[j] == haystack[i + j]) {
                --j;
            }
            // Full match.
            if (j < 0) {
                return i;
            }

            // Bad-character rule: align the mismatching haystack character
            // with its last occurrence in needle (j + 1 when it is absent).
            // May be <= 0 when that occurrence lies to the right of j.
            const int badChar = static_cast<unsigned char>(haystack[i + j]);
            const int badCharSkip = j - lastAppear[badChar];

            // Good-suffix rule; only applies when something already matched.
            int goodSuffixSkip = 0;
            const int goodSuffixLength = needleLength - 1 - j;
            if (goodSuffixLength > 0) {
                if (lastSuffixAppear[goodSuffixLength] != -1) {
                    // The good suffix (starting at j + 1) occurs again
                    // earlier: align that occurrence with the matched text.
                    goodSuffixSkip = j + 1 - lastSuffixAppear[goodSuffixLength];
                } else {
                    // Otherwise align the longest shorter suffix that is also
                    // a prefix; if none exists, shift past the whole window.
                    goodSuffixSkip = needleLength;
                    for (int len = goodSuffixLength - 1; len > 0; --len) {
                        if (suffixIsPrefix[len]) {
                            goodSuffixSkip = needleLength - len;
                            break;
                        }
                    }
                }
            }

            // Both rules are safe, so take the larger. At least one of them is
            // always positive; max(1, ...) only guards forward progress.
            i += max(1, max(badCharSkip, goodSuffixSkip));
        }

        return -1;
    }
};

对于BM算法，时间复杂度最好为O(n/m)，最坏为O(nm)，其中n为文本串的长度，m为模式串的长度。一般文本搜索算法都使用BM，如Windows记事本的Ctrl+F搜索、Unix的grep命令。

如果输入数组patterns的长度为n，其中元素的长度为m，word的长度为l，此算法的时间复杂度最好为O（nl/m）（不含预处理），最坏为O（nml）；由于每次匹配所用的辅助表在该次匹配结束后即被释放，空间复杂度为O（m+字符集大小），字符集大小来源于记录每个字符最后一次出现的位置的lastAppear，m来源于记录后缀串在needle中另一次（最靠右）出现位置的lastSuffixAppear，如果每个后缀串都在needle中出现过，则需要m大小的空间。

解法四：KMP算法，具体过程也可看算法三中的链接：

class Solution {
public:
    // Returns how many entries of `patterns` occur as a contiguous
    // substring of `word`, searching with the KMP algorithm.
    int numOfStrings(vector<string>& patterns, string word) {
        int ans = 0;
        for (const string &s : patterns) {
            if (kmp(s, word) >= 0) {
                ++ans;
            }
        }

        return ans;
    }

private:
    // KMP search. Returns the index of the first occurrence of `needle` in
    // `haystack`, or -1 if there is none. An empty needle matches at index 0.
    int kmp(const string &needle, const string &haystack) {
        const int needleLen = needle.size();
        if (needleLen == 0) {
            return 0;
        }

        vector<int> next(needleLen, -1);
        findNext(needle, next);

        const int haystackLen = haystack.size();
        // k is the index of the last needle character matched so far
        // (-1 when nothing is matched). The haystack index never moves back.
        int k = -1;
        for (int i = 0; i < haystackLen; ++i) {
            // On mismatch, fall back to the longest border of the matched
            // prefix instead of restarting the comparison from scratch.
            while (k != -1 && needle[k + 1] != haystack[i]) {
                k = next[k];
            }

            if (needle[k + 1] == haystack[i]) {
                ++k;
            }

            // The whole needle matched, ending at haystack[i].
            if (k == needleLen - 1) {
                return i - needleLen + 1;
            }
        }

        return -1;
    }

    // Fills next[i] with the index of the last character of the longest
    // proper prefix of needle[0..i] that is also a suffix of it, or -1 when
    // no such prefix exists. `next` must already have needle.size() entries.
    void findNext(const string &needle, vector<int> &next) {
        int k = -1;
        const int nextLen = next.size();
        for (int i = 1; i < nextLen; ++i) {
            // Shrink the candidate border until it can be extended by needle[i].
            while (k != -1 && needle[k + 1] != needle[i]) {
                k = next[k];
            }

            if (needle[k + 1] == needle[i]) {
                ++k;
            }

            next[i] = k;
        }
    }
};

如果输入数组patterns的长度为n，其中元素的长度为m，word的长度为l，此算法时间复杂度为O（n(m+l)），空间复杂度为O（m）。

以上算法构建next的过程中，while循环里令k = next[k]，此行代码可由一个例子解释，如当前已遍历到i，有如下包含i个字符的字符串，用.表示相同前后缀，*表示非最长前后缀的内容：
...*****...
如果我们想找[0,i+1]子串的最长前后缀，我们需要比较下标为3和i+1（下标为i+1的字符用-来表示）的两个字符是否相同，即下图中两个箭头对应的字符是否相同：
...*****...-
...↑****...↑
如果不同，我们需要看前3个字符的最长前后缀。这是因为，假如前3个字符的最长前后缀长度是2个字符，则遍历到i时的后3个字符的最长前后缀长度也是2个字符；又因为遍历到i时前3个字符和后3个字符相同，所以下标组[0,1]、[1,2]、[i-2,i-1]、[i-1,i]对应的子串都是相同的。其中最重要的信息是下标组[0,1]和[i-1,i]相同，因此我们只需要比较下标2和下标i+1的字符是否相同即可。也就是说，每次失配时都需要回溯到当前已匹配前缀的最长前后缀（此例中其末尾下标为1，即next[k]），因此需要k = next[k]。

posted @ 2023-02-15 21:39 epiphanyy 阅读(34) 评论(0) 收藏举报来源

刷新页面返回顶部

tus00000

LeetCode 1967. 作为子字符串出现在单词中的字符串数目

公告