设计思路-mysql进行分词搜索

模拟分词搜索

有时候我们的数据量很少,但又需要全文检索。如果用 ES 就太重了,还要引入新的技术栈。当然,MySQL 高版本本身已经支持全文检索。

但是我们在低版本的情况下该怎么做?

 

分词工具类

也可以使用 IK 等其他分词器,下面的示例使用 HanLP:

/**
 * Word-segmentation helper built on HanLP, used to emulate full-text search on top of
 * MySQL {@code LIKE} / {@code REGEXP} conditions.
 *
 * <p>Typical usage: escape the raw keyword with {@link #filterSpecia(String)}, then split it
 * into search terms with {@link #segment(String, String, int)} and use the joined result as
 * the pattern of the query.
 */
@Slf4j
public class HanLPUtil {
    // Part-of-speech prefixes whose terms are dropped from the segmentation result:
    // punctuation, modal particles, auxiliaries, verbs, etc.
    // https://github.com/hankcs/HanLP/blob/1.x/data/dictionary/other/TagPKU98.csv
    private static final String[] SKIP_SEGMENT_NATURE = {"w", "y", "u", "v"};

    // Characters that must be escaped before a keyword is placed into a LIKE/REGEXP pattern,
    // and their replacements. The two arrays are parallel: entry i of the first maps to entry i
    // of the second.
    private static final String[] SPECIAL_KEYWORDS = {"'", "%", "_", "\\", "$", "^", "*", "(", "+", "?", "["};
    private static final String[] REPLACE_KEYWORDS = {"\\\'", "\\%", "\\_", "\\\\\\\\", "\\$", "\\^", "\\*", "\\(", "\\+", "\\?", "\\["};

    // Generated search words shorter than this are dropped ("skip single characters").
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Segments {@code input} using mode {@code 0} (adjacent terms concatenated pairwise).
     *
     * @param input text to segment; returned unchanged when null or empty
     * @param split separator used to join the resulting words
     * @return the search words joined by {@code split}
     * @see #segment(String, String, int)
     */
    public static String segment(String input, String split) {
        return HanLPUtil.segment(input, split, 0);
    }

    /**
     * Segments {@code input} into search words and joins them with {@code split}.
     *
     * <p>Every occurrence of {@code split} is first removed from the input and the input is
     * trimmed. The cleaned input itself is always the first word of the result. Terms whose
     * part-of-speech starts with one of {@link #SKIP_SEGMENT_NATURE} are discarded, and the
     * remaining terms are turned into words according to {@code mode}:
     * <ul>
     *   <li>{@code 0} - each pair of adjacent terms is concatenated into one word;</li>
     *   <li>{@code 1} - each term is used as a word on its own;</li>
     *   <li>any other value - no additional words are produced.</li>
     * </ul>
     * Words that are blank, equal to {@code split}, or a single character long are skipped.
     *
     * @param input text to segment; returned unchanged when null or empty
     * @param split separator used to join the resulting words
     * @param mode  word generation strategy, see above
     * @return the search words joined by {@code split}
     */
    public static String segment(String input, String split, int mode) {
        if (StringUtil.isEmptyOrNullString(input)) {
            return input; // null skipped.
        }
        input = StringUtil.trim(input.replace(split, ""));
        if (StringUtil.isEmptyOrNullString(input)) {
            // Nothing left after removing separators/whitespace: no need to run the tokenizer.
            return input;
        }

        // Drop punctuation, modal particles, auxiliaries, verbs, etc.
        final List<Term> filteredTermList = IndexTokenizer.segment(input).stream()
                .filter(term -> !needSkip(term))
                .collect(Collectors.toList());

        final List<String> filteredWords = new ArrayList<>();
        // The full text is always part of the result.
        filteredWords.add(input);

        if (mode == 0) {
            // Concatenate every pair of adjacent terms; the first term has no predecessor.
            for (int i = 1; i < filteredTermList.size(); i++) {
                final Term prev = filteredTermList.get(i - 1);
                final Term current = filteredTermList.get(i);
                addIfSearchable(filteredWords, prev.word + current.word, split);
            }
        } else if (mode == 1) {
            // Use the filtered terms directly.
            for (Term term : filteredTermList) {
                addIfSearchable(filteredWords, term.word, split);
            }
        }

        // Logged at DEBUG: the original guarded on isDebugEnabled() but emitted at INFO.
        log.debug("#1103 HanLPUtil.segment() input={}, mode={}, result={}", input, mode, filteredWords);

        // The final result is the words joined by the separator.
        return StringUtil.join(filteredWords, split);
    }

    /**
     * Trims {@code candidate} and appends it to {@code words} unless it is blank, equal to the
     * separator, or shorter than {@link #MIN_WORD_LENGTH}.
     */
    private static void addIfSearchable(List<String> words, String candidate, String split) {
        final String word = StringUtil.trim(candidate);
        if (StringUtil.isBlank(word) || StringUtil.equals(split, word) || word.length() < MIN_WORD_LENGTH) {
            return;
        }
        words.add(word);
    }

    /**
     * Tells whether a term must be excluded from the segmentation result.
     *
     * @param term term produced by the tokenizer
     * @return {@code true} when the term or its word is null, or when its part-of-speech starts
     *         with one of {@link #SKIP_SEGMENT_NATURE}
     */
    private static boolean needSkip(Term term) {
        if (null == term || null == term.word) {
            return true;
        }
        if (null == term.nature) {
            // No part-of-speech information: cannot be classified as skippable, so keep it.
            return false;
        }
        for (String nature : SKIP_SEGMENT_NATURE) {
            if (term.nature.startsWith(nature)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Escapes characters that have a special meaning in SQL LIKE/REGEXP patterns so that the
     * keyword is searched literally:
     * <ul>
     *   <li>{@code '} wraps the search condition and becomes {@code \'};</li>
     *   <li>{@code %} stands for any number of characters and becomes {@code \%};</li>
     *   <li>{@code _} stands for a single character and becomes {@code \_};</li>
     *   <li>the backslash is the escape character and becomes four backslashes;</li>
     *   <li>{@code $ ^ * ( + ? [} are each prefixed with one backslash.</li>
     * </ul>
     *
     * <p>NOTE(review): other regex metacharacters (for example {@code ) ] { } | .}) are not
     * covered by the table - confirm whether that is intended. Hand-written escaping is fragile;
     * prefer passing the pattern as a bind parameter instead of concatenating it into SQL.
     *
     * @param keyword raw search keyword; returned unchanged when null or empty
     * @return the keyword with every listed character replaced by its escaped form
     */
    public static String filterSpecia(String keyword) {
        if (StringUtil.isEmptyOrNullString(keyword)) {
            return keyword;
        }
        final StringBuilder result = new StringBuilder(keyword.length() + 8);
        for (int i = 0; i < keyword.length(); i++) {
            final char keywordChar = keyword.charAt(i);
            final int specialIndex = indexOfSpecial(keywordChar);
            if (specialIndex >= 0) {
                result.append(REPLACE_KEYWORDS[specialIndex]);
            } else {
                result.append(keywordChar);
            }
        }
        return result.toString();
    }

    /**
     * Returns the index of {@code c} in {@link #SPECIAL_KEYWORDS}, or {@code -1} when the
     * character needs no escaping. Every entry of the table is a single character.
     */
    private static int indexOfSpecial(char c) {
        for (int i = 0; i < SPECIAL_KEYWORDS.length; i++) {
            if (SPECIAL_KEYWORDS[i].charAt(0) == c) {
                return i;
            }
        }
        return -1;
    }
}

1.先转义

// 特殊字符转义
searchKey = HanLPUtil.filterSpecia(searchKey)

2.分词条件

 HanLPUtil.segment(filterSpeciaWithSearchKey, "|", 1)

3.搜索条件

 a.body REGEXP '还有一些特殊字符|一些|特殊|字符' OR body like '%有一些特殊字符一些特殊字符'

针对权重排序

先分词后like搜索,然后根据各个结果的匹配字数生成数字列 进行排序

 

 





posted @ 2023-02-16 10:50  意犹未尽  阅读(132)  评论(0编辑  收藏  举报