Java 使用 DFA 算法过滤敏感词

代码示例如下:
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.*;

/**
 * Sensitive-word scanner based on a character trie (DFA).
 *
 * <p>The dictionary is a tree of nested maps: every node maps a {@link Character} to its child
 * node and additionally stores the string key {@code "isEnd"} whose value is {@code "1"} when a
 * dictionary word ends at that node and {@code "0"} otherwise. Sensitive words and white-list
 * words share one trie; whether a matched word is reported is decided afterwards by looking it
 * up in the caller-supplied {@code wordMap} / {@code wordWhiteMap}.
 */
public class SensitiveWordUtils {
    /** Minimal match mode: stop at the first (shortest) dictionary word. Name kept for compatibility. */
    public static int minMatchTYpe = 1;

    /** Maximal match mode: keep walking the trie and report the longest dictionary word. */
    public static int maxMatchType = 2;

    /**
     * Regular expression for a run of ASCII letters. The upper bound of the second range is
     * {@code Z}; the former {@code A-z} range also matched the punctuation characters that sit
     * between {@code Z} and {@code a} in ASCII. Name kept for compatibility.
     */
    public static final String englishLletter = "[a-zA-Z]+";

    /** Key under which every trie node stores its terminal marker. */
    private static final String END_KEY = "isEnd";

    /** Marker value of a node at which a dictionary word ends. */
    private static final String END_YES = "1";

    /** Marker value of a node that is only a prefix. */
    private static final String END_NO = "0";

    /** Key of the hit set inside the map returned by {@link #findAllNew}. */
    private static final String HIT_KEY = "allHitWord";

    /** Value of an Integer switch that means "enabled" (1 = yes, 0 = no). */
    private static final int ENABLED = 1;

    /**
     * Builds the trie for the given words.
     *
     * <p>Each word is trimmed before insertion. {@code null} and blank entries are skipped so a
     * single bad entry no longer discards the whole dictionary.
     *
     * @param sensitiveWords words to index (sensitive words and white-list words together)
     * @return the trie root, or {@code null} when {@code sensitiveWords} is {@code null} or empty
     */
    @SuppressWarnings("rawtypes")
    public static Map initKeyWordAndWhiteList(List<String> sensitiveWords) {
        if (sensitiveWords == null || sensitiveWords.isEmpty()) {
            return null;
        }
        Set<String> keyWordSet = new HashSet<>();
        for (String candidate : sensitiveWords) {
            if (candidate == null) {
                continue;
            }
            String trimmed = candidate.trim();
            if (!trimmed.isEmpty()) {
                keyWordSet.add(trimmed);
            }
        }
        return addSensitiveWordAndWhiteListToHashMap(keyWordSet);
    }

    /**
     * Inserts every word of {@code keyWordSet} into a fresh trie.
     *
     * @param keyWordSet trimmed, non-empty words
     * @return the trie root
     */
    @SuppressWarnings("unchecked")
    private static Map<Object, Object> addSensitiveWordAndWhiteListToHashMap(Set<String> keyWordSet) {
        Map<Object, Object> root = new HashMap<>(keyWordSet.size());
        for (String key : keyWordSet) {
            Map<Object, Object> node = root;
            for (int i = 0; i < key.length(); i++) {
                Character ch = key.charAt(i);
                Map<Object, Object> child = (Map<Object, Object>) node.get(ch);
                if (child == null) {
                    child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(ch, child);
                }
                node = child;
            }
            // Never mark the root itself: it carries no character.
            if (node != root) {
                node.put(END_KEY, END_YES);
            }
        }
        return root;
    }

    /**
     * Scans {@code text} and collects every dictionary word that is a sensitive word and is not
     * white-listed.
     *
     * <p>At each position the longest dictionary word is taken; the scan then resumes after it,
     * whether or not it was reported. When precise matching is on, a match that contains an
     * ASCII letter is dropped if the character directly before or after it is an ASCII letter.
     *
     * @param text             text to scan; {@code null} or empty yields no hits
     * @param sensitiveWordMap trie built by {@link #initKeyWordAndWhiteList}; {@code null} or
     *                         empty yields no hits
     * @param wordMap          normalized sensitive word to the value that is reported for it
     * @param wordWhiteMap     normalized white-list word to its original form; may be {@code null}
     * @param ignoreCase       1 to match case-insensitively, 0 or {@code null} otherwise
     * @param ignoreSpace      1 to skip space characters while matching, 0 or {@code null} otherwise
     * @param specialScanWay   1 to enable precise (letter-boundary) matching, 0 or {@code null} otherwise
     * @return a map with the single key {@code "allHitWord"} holding the set of reported values
     */
    @SuppressWarnings("rawtypes")
    public static Map<String, Set<String>> findAllNew(String text, Map sensitiveWordMap, Map<String, String> wordMap, Map<String, String> wordWhiteMap, Integer ignoreCase, Integer ignoreSpace, Integer specialScanWay) {
        Map<String, Set<String>> result = new HashMap<>();
        Set<String> hits = new HashSet<>();
        result.put(HIT_KEY, hits);
        if (text == null || text.isEmpty() || sensitiveWordMap == null || sensitiveWordMap.isEmpty() || wordMap == null) {
            return result;
        }

        // Unbox the switches once; a null switch means "disabled".
        boolean foldCase = isEnabled(ignoreCase);
        boolean skipSpace = isEnabled(ignoreSpace);
        boolean preciseMatch = isEnabled(specialScanWay);

        for (int i = 0; i < text.length(); i++) {
            int length = checkSensitiveWordNew(text, i, maxMatchType, sensitiveWordMap, foldCase, skipSpace);
            if (length <= 0) {
                continue;
            }
            if (preciseMatch && isEmbeddedInLongerWord(text, i, length)) {
                continue;
            }
            String lookupKey = normalize(text.substring(i, i + length), foldCase, skipSpace);
            boolean whiteListed = wordWhiteMap != null && wordWhiteMap.containsKey(lookupKey);
            if (!whiteListed && wordMap.containsKey(lookupKey)) {
                hits.add(wordMap.get(lookupKey));
            }
            // Resume after the matched span (the loop header adds the final +1).
            i += length - 1;
        }
        return result;
    }

    /**
     * Walks the trie from {@code beginIndex} and returns how many characters of {@code txt} the
     * match covers, or 0 when no dictionary word starts there.
     *
     * <p>The returned length is always the one recorded at the last terminal node. This covers
     * both a dead end in the trie and running out of text in the middle of a longer word, so
     * with the words "Xinjiang" and "Xinjiang Independence" the text "Xinjiang Ine" still yields
     * "Xinjiang", and so does a text that simply ends with "Xinjiang Inde".
     *
     * @param txt              text being scanned
     * @param beginIndex       index in {@code txt} at which matching starts
     * @param matchType        {@link #minMatchTYpe} to stop at the first word, otherwise longest match
     * @param sensitiveWordMap trie root
     * @param ignoreCase       lower-case each text character before the lookup
     * @param ignoreSpace      skip space characters; skipped characters count towards the length
     * @return number of characters covered by the match, 0 if none
     */
    @SuppressWarnings("rawtypes")
    private static int checkSensitiveWordNew(String txt, int beginIndex, int matchType, Map sensitiveWordMap, boolean ignoreCase, boolean ignoreSpace) {
        Map<?, ?> node = sensitiveWordMap;
        int consumed = 0;
        int lastWordEnd = 0;
        for (int i = beginIndex; i < txt.length(); i++) {
            char current = txt.charAt(i);
            if (ignoreSpace && Character.isSpaceChar(current)) {
                consumed++;
                continue;
            }
            if (ignoreCase) {
                current = Character.toLowerCase(current);
            }
            Object next = node.get(current);
            if (!(next instanceof Map)) {
                break;
            }
            node = (Map<?, ?>) next;
            consumed++;
            if (END_YES.equals(node.get(END_KEY))) {
                lastWordEnd = consumed;
                if (minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return lastWordEnd;
    }

    /**
     * Precise-match filter: tells whether a match that contains an ASCII letter is glued to
     * another ASCII letter on either side, i.e. is part of a longer word.
     *
     * <p>NOTE(review): the previous implementation had an extra branch for a preceding
     * {@code n} that is not itself preceded by a backslash, which suggests that a match right
     * after a literal "\n" sequence was meant to count as a hit. That branch never changed the
     * outcome because any preceding letter already rejected the match; the effective behavior
     * is kept here. Confirm the intent before special-casing it.
     */
    private static boolean isEmbeddedInLongerWord(String text, int start, int length) {
        int end = start + length;
        boolean containsLetter = false;
        for (int k = start; k < end && !containsLetter; k++) {
            containsLetter = isEnglishLetter(text.charAt(k));
        }
        if (!containsLetter) {
            // Matches without ASCII letters (e.g. pure CJK) have no word boundaries to check.
            return false;
        }
        boolean letterAfter = end < text.length() && isEnglishLetter(text.charAt(end));
        boolean letterBefore = start > 0 && isEnglishLetter(text.charAt(start - 1));
        return letterAfter || letterBefore;
    }

    /** Converts matched text into the key form used by the lookup maps: lower-case first, then strip blanks. */
    private static String normalize(String matched, boolean foldCase, boolean skipSpace) {
        // Locale.ROOT keeps this in line with the locale-independent Character.toLowerCase used while matching.
        String key = foldCase ? matched.toLowerCase(Locale.ROOT) : matched;
        return skipSpace ? removeBlank(key) : key;
    }

    /**
     * Removes every blank character. The character set is intended to match Hutool's
     * {@code StrUtil.cleanBlank}, which this method replaces.
     */
    private static String removeBlank(String value) {
        StringBuilder kept = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (!isBlankChar(c)) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /** Whitespace, space separators and a few invisible characters. */
    private static boolean isBlankChar(char c) {
        return Character.isWhitespace(c)
                || Character.isSpaceChar(c)
                || c == '\ufeff'
                || c == '\u202a'
                || c == '\u0000'
                || c == '\u3164'
                || c == '\u2800'
                || c == '\u180e';
    }

    /** ASCII letter test, equivalent to one character matching {@link #englishLletter}. */
    private static boolean isEnglishLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** An Integer switch is on only when it is non-null and equals 1. */
    private static boolean isEnabled(Integer flag) {
        return flag != null && flag == ENABLED;
    }

    /**
     * Registers every word of {@code source} in {@code index} (normalized form to original form)
     * and appends the normalized form to {@code dictionary}.
     */
    private static void indexWords(List<String> source, boolean lowerCase, Map<String, String> index, List<String> dictionary) {
        source.forEach(item -> {
            String word = lowerCase ? item.toLowerCase(Locale.ROOT) : item;
            index.put(word, item);
            dictionary.add(word);
        });
    }

    /**
     * Demo: builds a dictionary from two small word lists, scans a sample sentence with precise
     * matching and case-insensitivity enabled, and prints the hits.
     */
    public static void main(String[] args) {
        // Precise matching on.
        int specialScanWay = 1;
        // Ignore case.
        int ignoreCase = 1;
        // Raw sensitive words.
        List<String> wordList = new ArrayList<>();
        wordList.add("台独");
        wordList.add("Xinjiang");
        wordList.add("Xinjiang production and construction Corps");
        // Raw white-list words.
        List<String> allWhiteWordList = new ArrayList<>();
        allWhiteWordList.add("一台独立");
        // Normalized sensitive word -> original.
        Map<String, String> wordMap = new HashMap<>();
        // Normalized white-list word -> original.
        Map<String, String> wordWhiteMap = new HashMap<>();
        // Combined dictionary (sensitive + white-list words, case-normalized) that feeds the trie.
        List<String> newWordList = new ArrayList<>();
        boolean lowerCase = 1 == ignoreCase;
        indexWords(wordList, lowerCase, wordMap, newWordList);
        indexWords(allWhiteWordList, lowerCase, wordWhiteMap, newWordList);

        String text = "这是一段测试文本,xiNJiang production,大胆台独分子,这是一台独立的计算机";
        Map sensitiveWordMap = SensitiveWordUtils.initKeyWordAndWhiteList(newWordList);
        Map<String, Set<String>> resultMap = SensitiveWordUtils.findAllNew(text, sensitiveWordMap, wordMap, wordWhiteMap, ignoreCase, 0, specialScanWay);
        System.out.println("resultMap = " + resultMap.toString());
    }
}

 

posted @ 2024-03-09 18:53  bug毁灭者  阅读(138)  评论(0)  编辑  收藏  举报