Java实现敏感词过滤 - DFA算法
Java实现敏感词过滤 - DFA算法
| Id | Title | DateAdded | SourceUrl | PostType | Body | BlogId | Description | DateUpdated | IsMarkdown | EntryName | CreatedTime | IsActive | AutoDesc | AccessPermission |
| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------| -------------|
| 14170504| Java实现敏感词过滤 - DFA算法| 2020-12-21T22:33:00| | BlogPost|
Java实现DFA算法进行敏感词过滤
封装工具类如下:
使用前需对敏感词库进行初始化: SensitiveWordUtil.init(sensitiveWordSet);
package cn.swfilter.util;
import java.util.*;
/**
 * Sensitive-word filter implemented with a character trie (the "DFA" model).
 *
 * <p>Call {@link #init(Set)} once with the word list before using any of the
 * query methods. The trie is built off to the side and published with a single
 * assignment, so a reader never observes a half-built model.
 *
 * <p>Trie layout: every node is a map whose {@link Character} keys lead to child
 * nodes and whose {@code "isEnd"} key holds {@code "1"} when a word ends at that
 * node and {@code "0"} otherwise. The root node has no {@code "isEnd"} entry.
 *
 * @author sam
 * @since 2017/9/4
 */
public class SensitiveWordUtil {

    /**
     * Minimal matching rule: stop at the shortest word found.
     * Example: words ["中国","中国人"], text "我是中国人" matches "中国".
     * (The spelling of this constant is kept for source compatibility.)
     */
    public static final int MinMatchTYpe = 1;

    /**
     * Maximal matching rule: keep going and report the longest word found.
     * Example: words ["中国","中国人"], text "我是中国人" matches "中国人".
     */
    public static final int MaxMatchType = 2;

    /** Key under which a trie node stores its end-of-word marker. */
    private static final String END_KEY = "isEnd";
    /** Marker value: a word ends at this node. */
    private static final String END_TRUE = "1";
    /** Marker value: no word ends at this node. */
    private static final String END_FALSE = "0";

    /**
     * Root of the trie; {@code null} until {@link #init(Set)} has run.
     * Declared volatile so the fully built trie assigned by {@code init}
     * is what readers on other threads see.
     */
    public static volatile HashMap<Object, Object> sensitiveWordMap;

    /**
     * Initializes the word list and builds the trie, replacing any previous one.
     *
     * @param sensitiveWordSet the sensitive words; null or empty entries are skipped
     * @throws NullPointerException if {@code sensitiveWordSet} is null
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        initSensitiveWordMap(sensitiveWordSet);
    }

    /**
     * Builds the trie from the given words and publishes it to
     * {@link #sensitiveWordMap}.
     *
     * @param sensitiveWordSet the sensitive words
     */
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        Objects.requireNonNull(sensitiveWordSet, "sensitiveWordSet");
        // Pre-size the root to cut down on rehashing while loading.
        HashMap<Object, Object> root = new HashMap<>(sensitiveWordSet.size());
        for (String word : sensitiveWordSet) {
            if (word == null || word.isEmpty()) {
                continue; // nothing to index
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                Character ch = word.charAt(i);
                @SuppressWarnings("unchecked") // children are only ever inserted by this method
                Map<Object, Object> child = (Map<Object, Object>) node.get(ch);
                if (child == null) {
                    child = new HashMap<>();
                    // A fresh node is not a word end until proven otherwise.
                    child.put(END_KEY, END_FALSE);
                    node.put(ch, child);
                }
                node = child;
            }
            // The node reached by the last character terminates a word.
            node.put(END_KEY, END_TRUE);
        }
        // Publish only after the trie is complete.
        sensitiveWordMap = root;
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt       the text to scan; null is treated as containing nothing
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return {@code true} if a sensitive word occurs in {@code txt}
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static boolean contains(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true; // first hit is enough
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains at least one sensitive word, using the
     * maximal matching rule.
     *
     * @param txt the text to scan
     * @return {@code true} if a sensitive word occurs in {@code txt}
     */
    public static boolean contains(String txt) {
        return contains(txt, MaxMatchType);
    }

    /**
     * Collects the distinct sensitive words found in the text. After a match
     * the scan resumes right behind it, so matches never overlap.
     *
     * @param txt       the text to scan; null yields an empty set
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the set of matched words, possibly empty, never null
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> sensitiveWordList = new HashSet<>();
        if (txt == null) {
            return sensitiveWordList;
        }
        for (int i = 0; i < txt.length(); i++) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                sensitiveWordList.add(txt.substring(i, i + length));
                i = i + length - 1; // minus one because the for loop increments i
            }
        }
        return sensitiveWordList;
    }

    /**
     * Collects the distinct sensitive words found in the text, using the
     * maximal matching rule.
     *
     * @param txt the text to scan
     * @return the set of matched words, possibly empty, never null
     */
    public static Set<String> getSensitiveWord(String txt) {
        return getSensitiveWord(txt, MaxMatchType);
    }

    /**
     * Masks every matched sensitive word character by character.
     * Example: text "我爱中国人", word "中国人", replaceChar '*' gives "我爱***".
     *
     * <p>The replacement is positional: only the spans the scanner matched are
     * masked, and words are treated as literal text, never as regular expressions.
     *
     * @param txt         the text to filter; null or empty is returned unchanged
     * @param replaceChar the masking character
     * @param matchType   {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the filtered text
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static String replaceSensitiveWord(String txt, char replaceChar, int matchType) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        StringBuilder result = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                result.append(getReplaceChars(replaceChar, length));
                i += length;
            } else {
                result.append(txt.charAt(i));
                i++;
            }
        }
        return result.toString();
    }

    /**
     * Masks every matched sensitive word character by character, using the
     * maximal matching rule.
     *
     * @param txt         the text to filter
     * @param replaceChar the masking character
     * @return the filtered text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) {
        return replaceSensitiveWord(txt, replaceChar, MaxMatchType);
    }

    /**
     * Replaces every matched sensitive word with a fixed string.
     * Example: text "我爱中国人", word "中国人", replaceStr "[屏蔽]" gives "我爱[屏蔽]".
     *
     * <p>The replacement is positional and literal: {@code replaceStr} is
     * inserted verbatim ({@code $} and {@code \} have no special meaning) and
     * is not itself rescanned.
     *
     * @param txt        the text to filter; null or empty is returned unchanged
     * @param replaceStr the replacement for each matched word
     * @param matchType  {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the filtered text
     * @throws NullPointerException  if a word is matched and {@code replaceStr} is null
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    public static String replaceSensitiveWord(String txt, String replaceStr, int matchType) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        StringBuilder result = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                // Checked lazily so that clean text with a null replacement still passes through.
                result.append(Objects.requireNonNull(replaceStr, "replaceStr"));
                i += length;
            } else {
                result.append(txt.charAt(i));
                i++;
            }
        }
        return result.toString();
    }

    /**
     * Replaces every matched sensitive word with a fixed string, using the
     * maximal matching rule.
     *
     * @param txt        the text to filter
     * @param replaceStr the replacement for each matched word
     * @return the filtered text
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) {
        return replaceSensitiveWord(txt, replaceStr, MaxMatchType);
    }

    /**
     * Builds a string consisting of {@code length} copies of {@code replaceChar}.
     *
     * @param replaceChar the character to repeat
     * @param length      the number of repetitions; values below 1 give an empty string
     * @return the padding string
     */
    private static String getReplaceChars(char replaceChar, int length) {
        char[] chars = new char[Math.max(length, 0)];
        Arrays.fill(chars, replaceChar);
        return new String(chars);
    }

    /**
     * Checks whether a sensitive word starts at {@code beginIndex}.
     *
     * <p>Walks the trie along the text and remembers the length at the most
     * recent end-of-word node. Under the minimal rule the walk stops at the
     * first such node; under any other rule it continues to find the longest
     * word. Characters walked past the last end-of-word node are not counted,
     * so the returned span is always a complete dictionary word.
     *
     * @param txt        the text being scanned
     * @param beginIndex the index at which a word must start
     * @param matchType  {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the length of the matched word, or 0 if none starts here
     * @throws IllegalStateException if {@link #init(Set)} has not been called
     */
    private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map<Object, Object> node = sensitiveWordMap; // read the shared root once
        if (node == null) {
            throw new IllegalStateException(
                    "SensitiveWordUtil.init(Set) must be called before matching");
        }
        int matchedLength = 0;
        for (int i = beginIndex; i < txt.length(); i++) {
            Object next = node.get(txt.charAt(i));
            if (!(next instanceof Map)) {
                break; // no transition for this character
            }
            @SuppressWarnings("unchecked") // Character keys only ever map to child nodes
            Map<Object, Object> child = (Map<Object, Object>) next;
            node = child;
            if (END_TRUE.equals(node.get(END_KEY))) {
                matchedLength = i - beginIndex + 1;
                if (MinMatchTYpe == matchType) {
                    break; // shortest word is enough
                }
            }
        }
        return matchedLength;
    }

    /**
     * Small demonstration: loads a few words and prints the results of the
     * contains / get / replace operations on a sample sentence.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Set<String> sensitiveWordSet = new HashSet<>();
        sensitiveWordSet.add("太多");
        sensitiveWordSet.add("爱恋");
        sensitiveWordSet.add("静静");
        sensitiveWordSet.add("哈哈");
        sensitiveWordSet.add("啦啦");
        sensitiveWordSet.add("感动");
        sensitiveWordSet.add("发呆");
        // Initialize the word list.
        SensitiveWordUtil.init(sensitiveWordSet);
        System.out.println("敏感词的数量:" + SensitiveWordUtil.sensitiveWordMap.size());
        String string = "太多的伤感情怀也许只局限于饲养基地 荧幕中的情节。"
                + "然后我们的扮演的角色就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把自己的情感也附加于银幕情节中,然后感动就流泪,"
                + "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个贱人一杯红酒一部电影在夜 深人静的晚上,关上电话静静的发呆着。";
        System.out.println("待检测语句字数:" + string.length());
        // Does the text contain any sensitive word?
        boolean result = SensitiveWordUtil.contains(string);
        System.out.println(result);
        result = SensitiveWordUtil.contains(string, SensitiveWordUtil.MinMatchTYpe);
        System.out.println(result);
        // Collect the sensitive words in the text.
        Set<String> set = SensitiveWordUtil.getSensitiveWord(string);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
        set = SensitiveWordUtil.getSensitiveWord(string, SensitiveWordUtil.MinMatchTYpe);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
        // Replace the sensitive words in the text.
        String filterStr = SensitiveWordUtil.replaceSensitiveWord(string, '*');
        System.out.println(filterStr);
        filterStr = SensitiveWordUtil.replaceSensitiveWord(string, '*', SensitiveWordUtil.MinMatchTYpe);
        System.out.println(filterStr);
        String filterStr2 = SensitiveWordUtil.replaceSensitiveWord(string, "[*敏感词*]");
        System.out.println(filterStr2);
        filterStr2 = SensitiveWordUtil.replaceSensitiveWord(string, "[*敏感词*]", SensitiveWordUtil.MinMatchTYpe);
        System.out.println(filterStr2);
    }
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 25岁的心里话
· 按钮权限的设计及实现