Java 使用 DFA 算法过滤敏感词
代码示例如下:
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Sensitive-word filter built on a DFA, stored as a trie of nested maps.
 *
 * <p>Each trie node is a {@code Map} whose {@code Character} keys lead to child nodes and whose
 * {@code "isEnd"} key holds {@code "1"} when a dictionary word ends at that node and {@code "0"}
 * otherwise. Sensitive words and white-list words live in the same trie; the caller-supplied
 * {@code wordMap} / {@code wordWhiteMap} decide whether a matched span is reported.
 *
 * <p>The class only uses the standard library. Flags are passed as {@code Integer} values where
 * {@code 1} means "enabled"; any other value, including {@code null}, means "disabled".
 */
public class SensitiveWordUtils {

    /** Minimum-match mode: stop at the first (shortest) dictionary word found. */
    public static int minMatchTYpe = 1;

    /** Maximum-match mode: keep walking the trie and report the longest dictionary word. */
    public static int maxMatchType = 2;

    /**
     * Regular expression for a run of ASCII letters. The upper bound is {@code Z}; the previous
     * {@code A-z} range also matched the punctuation that sits between the two letter ranges.
     */
    public static final String englishLletter = "[a-zA-Z]+";

    /** Compiled once; used for every "is this an English letter" decision in exact mode. */
    private static final Pattern ENGLISH_LETTER_PATTERN = Pattern.compile(englishLletter);

    /** Trie-node key holding the end-of-word marker. */
    private static final String END_KEY = "isEnd";

    /** Marker value: a dictionary word ends at this node. */
    private static final String END_YES = "1";

    /** Marker value: no dictionary word ends at this node. */
    private static final String END_NO = "0";

    /** Key under which {@link #findAllNew} stores the set of hit words. */
    private static final String RESULT_KEY = "allHitWord";

    /** Flag value meaning "enabled" (1 = yes, 0 = no). */
    private static final int FLAG_ON = 1;

    /**
     * Builds the trie from the combined list of sensitive words and white-list words.
     *
     * <p>Every entry is trimmed before insertion. {@code null} and blank entries are skipped so
     * that one bad entry does not discard the whole dictionary.
     *
     * @param sensitiveWords words to index (already lower-cased by the caller when matching is
     *                       case-insensitive)
     * @return the trie root, or {@code null} when {@code sensitiveWords} is {@code null} or empty
     */
    public static Map initKeyWordAndWhiteList(List<String> sensitiveWords) {
        if (sensitiveWords == null || sensitiveWords.isEmpty()) {
            return null;
        }
        Set<String> keyWordSet = new HashSet<>();
        for (String word : sensitiveWords) {
            if (word == null) {
                continue;
            }
            String trimmed = word.trim();
            if (!trimmed.isEmpty()) {
                keyWordSet.add(trimmed);
            }
        }
        return addSensitiveWordAndWhiteListToHashMap(keyWordSet);
    }

    /**
     * Inserts every word of {@code keyWordSet} into a fresh trie.
     *
     * @param keyWordSet distinct words to insert
     * @return the trie root
     */
    @SuppressWarnings("unchecked") // child nodes are only ever created by this method
    private static HashMap addSensitiveWordAndWhiteListToHashMap(Set<String> keyWordSet) {
        HashMap<Object, Object> root = new HashMap<>(keyWordSet.size());
        for (String word : keyWordSet) {
            if (word == null || word.isEmpty()) {
                // An empty word must not mark the root itself as an end node.
                continue;
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                Character ch = word.charAt(i);
                Map<Object, Object> child = (Map<Object, Object>) node.get(ch);
                if (child == null) {
                    child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(ch, child);
                }
                node = child;
            }
            node.put(END_KEY, END_YES);
        }
        return root;
    }

    /**
     * Scans {@code text} and collects every sensitive word it contains.
     *
     * <p>At each position the longest dictionary entry is matched. The matched span is normalised
     * (lower-cased and/or stripped of blanks, according to the flags) and reported when it is a
     * key of {@code wordMap} and not a key of {@code wordWhiteMap}. Scanning resumes after the
     * matched span whether or not it was reported, which is how a white-list entry such as a
     * longer phrase shields a sensitive word embedded in it.
     *
     * <p>In exact mode a span that contains an English letter is rejected when the character
     * right after it, or right before it, is an English letter (the hit is part of a longer
     * word). A preceding {@code n} that belongs to a literal backslash-n sequence does not count
     * as a letter.
     *
     * @param text             text to scan; {@code null} or empty yields an empty result
     * @param sensitiveWordMap trie returned by {@link #initKeyWordAndWhiteList}; may be {@code null}
     * @param wordMap          normalised sensitive word to original word
     * @param wordWhiteMap     normalised white-list word to original word; {@code null} means no
     *                         white list
     * @param ignoreCase       1 to match case-insensitively, 0 otherwise
     * @param ignoreSpace      1 to skip space characters inside the text, 0 otherwise
     * @param specialScanWay   1 for exact (word-boundary) matching, 0 otherwise
     * @return a map with the single key {@code "allHitWord"} holding the original forms of all
     *         hit words; never {@code null}
     */
    public static Map<String, Set<String>> findAllNew(String text, Map sensitiveWordMap,
            Map<String, String> wordMap, Map<String, String> wordWhiteMap,
            Integer ignoreCase, Integer ignoreSpace, Integer specialScanWay) {
        Map<String, Set<String>> result = new HashMap<>();
        Set<String> hits = new HashSet<>();
        result.put(RESULT_KEY, hits);
        if (text == null || text.isEmpty()
                || sensitiveWordMap == null || sensitiveWordMap.isEmpty()
                || wordMap == null || wordMap.isEmpty()) {
            return result;
        }

        boolean foldCase = isEnabled(ignoreCase);
        boolean stripBlank = isEnabled(ignoreSpace);
        boolean exactMatch = isEnabled(specialScanWay);

        for (int i = 0; i < text.length(); i++) {
            int length = checkSensitiveWordNew(text, i, maxMatchType, sensitiveWordMap,
                    ignoreCase, ignoreSpace);
            if (length <= 0) {
                continue;
            }
            int end = i + length;
            if (exactMatch && isPartOfLongerEnglishWord(text, i, end)) {
                // Rejected spans are not skipped: the next position is tried as usual.
                continue;
            }
            String normalized = normalize(text.substring(i, end), foldCase, stripBlank);
            boolean whiteListed = wordWhiteMap != null && wordWhiteMap.containsKey(normalized);
            if (wordMap.containsKey(normalized) && !whiteListed) {
                hits.add(wordMap.get(normalized));
            }
            // Continue after the matched span (the loop increment adds the final +1).
            i = end - 1;
        }
        return result;
    }

    /**
     * Walks the trie from {@code beginIndex} and returns the length of the matched span.
     *
     * <p>The returned length always ends on a complete dictionary word. When the walk runs past a
     * complete word into a longer entry that is never completed, because of a mismatch or because
     * the text ends, the length of the last complete word is returned.
     *
     * @param txt              text to scan
     * @param beginIndex       index in {@code txt} at which matching starts
     * @param matchType        {@link #minMatchTYpe} to stop at the first complete word, anything
     *                         else to look for the longest one
     * @param sensitiveWordMap trie root
     * @param ignoreCase       1 to lower-case each character before the lookup
     * @param ignoreSpace      1 to skip space characters (they still count towards the length)
     * @return number of characters of {@code txt} covered by the match, or 0 when nothing matched
     */
    @SuppressWarnings("rawtypes")
    private static int checkSensitiveWordNew(String txt, int beginIndex, int matchType,
            Map sensitiveWordMap, Integer ignoreCase, Integer ignoreSpace) {
        boolean foldCase = isEnabled(ignoreCase);
        boolean skipSpace = isEnabled(ignoreSpace);
        int consumed = 0;
        int lastCompleteLength = 0;
        Map node = sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            char ch = txt.charAt(i);
            if (skipSpace && Character.isSpaceChar(ch)) {
                consumed++;
                continue;
            }
            if (foldCase) {
                ch = Character.toLowerCase(ch);
            }
            Object next = node.get(ch);
            if (!(next instanceof Map)) {
                break;
            }
            node = (Map) next;
            consumed++;
            if (END_YES.equals(node.get(END_KEY))) {
                lastCompleteLength = consumed;
                if (minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return lastCompleteLength;
    }

    /** Returns true when the flag is non-null and equal to 1. */
    private static boolean isEnabled(Integer flag) {
        return flag != null && flag == FLAG_ON;
    }

    /** Returns true when {@code text[from, to)}, clamped to the text bounds, has an English letter. */
    private static boolean hasEnglishLetter(String text, int from, int to) {
        int lo = Math.max(from, 0);
        int hi = Math.min(to, text.length());
        return lo < hi && ENGLISH_LETTER_PATTERN.matcher(text).region(lo, hi).find();
    }

    /** Exact-mode check: true when the span {@code [start, end)} is glued to adjacent English letters. */
    private static boolean isPartOfLongerEnglishWord(String text, int start, int end) {
        if (!hasEnglishLetter(text, start, end)) {
            // Spans without English letters have no word-boundary notion.
            return false;
        }
        if (hasEnglishLetter(text, end, end + 1)) {
            return true;
        }
        if (!hasEnglishLetter(text, start - 1, start)) {
            return false;
        }
        // The preceding letter is the "n" of a literal backslash-n sequence: treat it as a boundary.
        boolean escapedNewline = start >= 2
                && text.charAt(start - 1) == 'n'
                && text.charAt(start - 2) == '\\';
        return !escapedNewline;
    }

    /** Lower-cases (locale-independently) and/or strips blanks from a matched span. */
    private static String normalize(String keyWord, boolean foldCase, boolean stripBlank) {
        String normalized = foldCase ? keyWord.toLowerCase(Locale.ROOT) : keyWord;
        return stripBlank ? removeBlank(normalized) : normalized;
    }

    /** Removes whitespace, space separators and a few invisible characters from {@code value}. */
    private static String removeBlank(String value) {
        StringBuilder kept = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char ch = value.charAt(i);
            boolean blank = Character.isWhitespace(ch) || Character.isSpaceChar(ch)
                    || ch == '\ufeff' || ch == '\u202a' || ch == '\u0000';
            if (!blank) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /** Small demo: builds a dictionary with one white-list entry and prints the hits for a sample text. */
    public static void main(String[] args) {
        // Exact (word-boundary) matching.
        int specialScanWay = 1;
        // Case-insensitive matching.
        int ignoreCase = 1;

        // Raw sensitive-word list.
        List<String> wordList = new ArrayList<>();
        wordList.add("台独");
        wordList.add("Xinjiang");
        wordList.add("Xinjiang production and construction Corps");

        // Raw white list.
        List<String> allWhiteWordList = new ArrayList<>();
        allWhiteWordList.add("一台独立");

        // Normalised word -> original word, for sensitive words and for the white list.
        Map<String, String> wordMap = new HashMap<>();
        Map<String, String> wordWhiteMap = new HashMap<>();
        // Combined, normalised list that feeds the trie.
        List<String> newWordList = new ArrayList<>();

        for (String item : wordList) {
            String word = ignoreCase == 1 ? item.toLowerCase(Locale.ROOT) : item;
            wordMap.put(word, item);
            newWordList.add(word);
        }
        for (String item : allWhiteWordList) {
            String word = ignoreCase == 1 ? item.toLowerCase(Locale.ROOT) : item;
            wordWhiteMap.put(word, item);
            newWordList.add(word);
        }

        String text = "这是一段测试文本,xiNJiang production,大胆台独分子,这是一台独立的计算机";
        Map sensitiveWordMap = SensitiveWordUtils.initKeyWordAndWhiteList(newWordList);
        Map<String, Set<String>> resultMap = SensitiveWordUtils.findAllNew(text, sensitiveWordMap,
                wordMap, wordWhiteMap, ignoreCase, 0, specialScanWay);
        System.out.println("resultMap = " + resultMap.toString());
    }
}