Implementing a Chinese Sensitive-Word Filter with a Prefix Tree (Trie)
Preface
This article implements a Chinese sensitive-word filter. A prepared list of sensitive words is loaded into a prefix tree (trie) in advance, which makes lookups fast while keeping memory usage low. A filter like this is typically used to check whether registered usernames or user comments contain offensive language.
The filter can determine whether a text contains sensitive words, find the sensitive words contained in a text, and replace the sensitive words in a text with a configured character.
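As a quick preview, the minimal sketch below exercises all three operations against the API defined later in this article; the demo class name and the dictionary words 敏感 / 敏感词 are illustrative, not part of the original code.
import java.util.Arrays;
import java.util.Set;

public class SensitiveWordFilterDemo {
    public static void main(String[] args) {
        // Build the dictionary from an illustrative word list
        SensitiveWordFilters.refactoringBy(Arrays.asList("敏感", "敏感词"));
        // 1. Does the text contain a sensitive word? (true)
        boolean hit = SensitiveWordFilters.foundIn("这是一段敏感词文本");
        // 2. Which sensitive words does the text contain? (longest match: [敏感词])
        Set<String> found = SensitiveWordFilters.findOutFrom("这是一段敏感词文本");
        // 3. Mask every character of each sensitive word: 这是一段***文本
        String masked = SensitiveWordFilters.replace("这是一段敏感词文本", "*");
        System.out.println(hit + " " + found + " " + masked);
    }
}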
Runtime Environment
The code uses JDK 8 syntax and the JUnit Jupiter (JUnit 5) test framework. The Maven configuration is as follows:
<properties>
<java.version>1.8</java.version>
<maven.compiler.source>${java.version}</maven.compiler.source>
<maven.compiler.target>${java.version}</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.8.2</version>
<scope>test</scope>
</dependency>
</dependencies>
Filter Source Code
import java.util.*;
import java.util.function.Predicate;
/**
* Sensitive-word filter, for Chinese text only
*/
public class SensitiveWordFilters {
/**
* For example, suppose the dictionary contains the sensitive words [敏感, 敏感词]:
* true
* ├── matching stops as soon as [敏感] is found
* └── faster; suitable for a simple yes/no check
* false
* ├── matching only stops after the longer [敏感词] is found
* └── slower, but when replacing sensitive words it masks the longer match
*/
private static final boolean SIMPLE_MATCH = false;
/**
* Characters to ignore during matching
*/
private static final List<Character> IGNORE_CHAR_LIST = ignoreCharListInit();
/**
* Ignores certain characters.
* For example, if the dictionary contains the sensitive word [敏感词], the text [敏 感 词] is still
* flagged as sensitive, because space characters are ignored.
* The same predicate is also applied when rebuilding the dictionary and when adding words to it.
*/
private static final Predicate<Character> CHAR_IGNORE =
character -> Character.isSpaceChar(character) || IGNORE_CHAR_LIST.contains(character);
/**
* Rebuilds the dictionary from the given word list
*/
public static void refactoringBy(List<String> sensitiveWordList) {
refactor(sensitiveWordList);
}
/**
* Adds sensitive words to the dictionary
*/
public static void add(List<String> sensitiveWordList) {
sensitiveWordList.forEach(word -> recordToThe(SensitiveWordCache.dictionary, word));
}
/**
* Adds a single sensitive word to the dictionary
*/
public static void add(String sensitiveWord) {
recordToThe(SensitiveWordCache.dictionary, sensitiveWord);
}
/**
* Returns true if the text contains at least one sensitive word
*/
public static boolean foundIn(String text) {
if (isEmpty(text)) {
return false;
}
for (int i = 0; i < text.length(); i++) {
if (checkSensitiveWord(text, i) >= 0) {
return true;
}
}
return false;
}
/**
* Finds the sensitive words contained in the text
*/
public static Set<String> findOutFrom(String text) {
if (isEmpty(text)) {
return Collections.emptySet();
}
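// Sort results longest-first (then lexicographically) so replace() masks longer matches before shorter ones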
Set<String> resultSet = new TreeSet<>((o1, o2) -> o1.length() == o2.length() ? o1.compareTo(o2) : o2.length() - o1.length());
for (int i = 0; i < text.length(); i++) {
int endIndex = checkSensitiveWord(text, i);
if (endIndex >= 0) {
resultSet.add(text.substring(i, endIndex + 1));
}
}
return resultSet;
}
/**
* Replaces the sensitive words in the text; every character of a match is replaced with one replacement character.
*
* @param text the text to filter
* @param replaceChar the replacement character
* @return the filtered text
*/
public static String replace(String text, String replaceChar) {
Set<String> sensitiveWordSet = findOutFrom(text);
if (sensitiveWordSet.isEmpty()) {
return text;
}
for (String sensitiveWord : sensitiveWordSet) {
text = text.replace(sensitiveWord, replacementOf(replaceChar, sensitiveWord.length()));
}
return text;
}
/**
* Dictionary cache
*/
private static class SensitiveWordCache {
/**
* The dictionary, i.e. the root node of the trie
*/
static Node dictionary;
static {
dictionary = new Node();
dictionary.children = new HashMap<>(16);
}
private SensitiveWordCache() {
}
}
/**
* Rebuilds the dictionary.
*
* @param sensitiveWordList the list of sensitive words
*/
private static void refactor(List<String> sensitiveWordList) {
Node newDictionary = new Node();
newDictionary.children = new HashMap<>(16);
synchronized (SensitiveWordCache.class) {
for (String word : sensitiveWordList) {
recordToThe(newDictionary, word);
}
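// Swap the cached dictionary reference to the newly built trie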
SensitiveWordCache.dictionary = newDictionary;
}
}
/**
* Records a sensitive word in the trie under the given node.
*
* @param node the node to start from (normally the dictionary root)
* @param word the sensitive word
*/
private static void recordToThe(Node node, String word) {
Objects.requireNonNull(node);
synchronized (SensitiveWordCache.class) {
for (int i = 0, lastIndex = word.length() - 1; i < word.length(); i++) {
Character key = word.charAt(i);
if (!CHAR_IGNORE.test(key)) {
// create the child node if it does not exist yet
Node next = node.get(key);
if (Objects.isNull(next)) {
next = new Node();
node.putChild(key, next);
}
node = next;
}
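// once the last character of the word has been processed, mark the current node as a word ending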
if (i == lastIndex) {
node.isEnd = true;
}
}
}
}
/**
* Matches a sensitive word in the text, starting from startIndex.
*
* @param text the text
* @param startIndex the index in the text to start matching from
* @return -1 if no sensitive word is found, otherwise the index of the last character of the match
*/
private static int checkSensitiveWord(String text, int startIndex) {
int endIndex = -1;
Node node = SensitiveWordCache.dictionary;
for (int i = startIndex; i < text.length(); i++) {
Character key = text.charAt(i);
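// skip ignored characters (space, '|', '-') so padded text such as [敏 感 词] still matches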
if (CHAR_IGNORE.test(key)) {
continue;
}
node = node.get(key);
if (Objects.isNull(node)) {
break;
}
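// a sensitive word ends at this character; with SIMPLE_MATCH disabled, keep scanning for a longer match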
if (node.isEnd) {
endIndex = i;
if (SIMPLE_MATCH) {
break;
}
}
}
return endIndex;
}
private static boolean isEmpty(String str) {
return str == null || "".equals(str);
}
/**
* Builds the full replacement string.
*
* @param replaceChar single-character replacement string
* @param num number of characters to replace
* @return the full replacement string
*/
private static String replacementOf(String replaceChar, int num) {
int minJointLength = 2;
if (num < minJointLength) {
return replaceChar;
}
StringBuilder replacement = new StringBuilder();
for (int i = 0; i < num; i++) {
replacement.append(replaceChar);
}
return replacement.toString();
}
/**
* Trie node of the dictionary
*/
private static class Node {
/**
* true: this node is the last character of a sensitive word
*/
boolean isEnd;
/**
* Child nodes, keyed by character
*/
Map<Character, Node> children;
Node get(Character key) {
return Objects.nonNull(children) ? children.get(key) : null;
}
void putChild(Character key, Node node) {
if (Objects.isNull(children)) {
children = new HashMap<>(16);
}
children.put(key, node);
}
}
/**
* Initializes the list of ignored characters
*/
private static List<Character> ignoreCharListInit() {
List<Character> ignoreCharList = new ArrayList<>(10);
ignoreCharList.add('|');
ignoreCharList.add('-');
return Collections.unmodifiableList(ignoreCharList);
}
private SensitiveWordFilters() {
}
}
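To make the structure concrete: after adding the two words 敏感 and 敏感词, the dictionary (the root Node) conceptually looks like the sketch below. Each edge is a key in a node's children map, and isEnd marks where a word terminates.
root
└── 敏 (isEnd = false)
    └── 感 (isEnd = true)    ← end of 敏感
        └── 词 (isEnd = true) ← end of 敏感词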
Filter Test Class
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
class SensitiveWordFiltersTest {
/**
* Rebuild the dictionary
*/
@Test
void refactoringBy() {
SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
}
/**
* Add sensitive words to the dictionary
*/
@Test
void add() {
SensitiveWordFilters.add(Arrays.asList("敏感词"));
}
/**
* Check whether the content contains sensitive words
*/
@Test
void foundIn() {
SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
Assertions.assertTrue(SensitiveWordFilters.foundIn("白银混蛋"));
}
/**
* Find the sensitive words in the content
*/
@Test
void findOutFrom() {
SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
System.out.println(SensitiveWordFilters.findOutFrom("白银混蛋"));
}
/**
* Replace the sensitive words in the content
*/
@Test
void replace() {
SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
String string = "就算是一个 顶-级 高 手,也会被那个白银 混蛋坑得很惨";
System.out.println(SensitiveWordFilters.replace(string, "*"));
}
private static String[] getSensitiveWords() {
return sensitiveWords.split("\\|");
}
static final String sensitiveWords = "顶级|白银|混蛋";
}
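For reference, with the dictionary 顶级|白银|混蛋 above, the replace test should print something close to the line below: the hyphen inside 顶-级 is ignored during matching, so 顶级 still matches, and the matched span (hyphen included) is masked character for character.
就算是一个 *** 高 手,也会被那个** **坑得很惨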