使用DFA算法对敏感词进行过滤
项目目录结构如下:
其中resources资源目录中:
stopwd.txt:停顿词库,匹配敏感词时遇到停顿词会直接跳过(忽略)。
wd.txt:敏感词库。
1、WordFilter敏感词过滤类:
1 package com.skyer.sensitivewdfilter; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.InputStreamReader; 6 import java.util.ArrayList; 7 import java.util.HashMap; 8 import java.util.HashSet; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Set; 12 13 /** 14 * 思路: 创建一个FilterSet,枚举了0~65535的所有char是否是某个敏感词开头的状态 15 * 16 * 判断是否是 敏感词开头 | | 是 不是 获取头节点 OK--下一个字 然后逐级遍历,DFA算法 17 */ 18 public class WordFilter { 19 20 private static final FilterSet set = new FilterSet(); // 存储首字 21 private static final Map<Integer, WordNode> nodes = new HashMap<Integer, WordNode>(1024, 1); // 存储节点 22 private static final Set<Integer> stopwdSet = new HashSet<Integer>(); // 停顿词 23 private static final char SIGN = '*'; // 敏感词过滤替换 24 25 static { 26 try { 27 long a = System.nanoTime(); 28 init(); 29 a = System.nanoTime() - a; 30 System.out.println("加载时间 : " + a + "ns"); 31 System.out.println("加载时间 : " + a / 1000000 + "ms"); 32 } catch (Exception e) { 33 throw new RuntimeException("初始化过滤器失败"); 34 } 35 } 36 37 private static void init() { 38 // 获取敏感词 39 addSensitiveWord(readWordFromFile("wd.txt")); 40 addStopWord(readWordFromFile("stopwd.txt")); 41 } 42 43 /** 44 * 增加敏感词 45 */ 46 private static List<String> readWordFromFile(String path) { 47 List<String> words; 48 BufferedReader br = null; 49 try { 50 br = new BufferedReader(new InputStreamReader(WordFilter.class.getClassLoader().getResourceAsStream(path))); 51 words = new ArrayList<String>(1200); 52 for (String buf = ""; (buf = br.readLine()) != null;) { 53 if (buf == null || buf.trim().equals("")) 54 continue; 55 words.add(buf); 56 } 57 } catch (Exception e) { 58 throw new RuntimeException(e); 59 } finally { 60 try { 61 if (br != null) 62 br.close(); 63 } catch (IOException e) { 64 } 65 } 66 return words; 67 } 68 69 /** 70 * 增加停顿词 71 */ 72 private static void addStopWord(final List<String> words) { 73 if (words != null && words.size() > 0) { 74 char[] chs; 75 for (String curr : words) { 76 chs 
= curr.toCharArray(); 77 for (char c : chs) { 78 stopwdSet.add(charConvert(c)); 79 } 80 } 81 } 82 } 83 84 /** 85 * 添加DFA节点 86 */ 87 private static void addSensitiveWord(final List<String> words) { 88 if (words != null && words.size() > 0) { 89 char[] chs; 90 int fchar; 91 int lastIndex; 92 WordNode fnode; // 首字母节点 93 for (String curr : words) { 94 chs = curr.toCharArray(); 95 fchar = charConvert(chs[0]); 96 if (!set.contains(fchar)) {// 没有首字定义 97 set.add(fchar);// 首字标志位 可重复add 98 fnode = new WordNode(fchar, chs.length == 1); 99 nodes.put(fchar, fnode); 100 } else { 101 fnode = nodes.get(fchar); 102 if (!fnode.isLast() && chs.length == 1) 103 fnode.setLast(true); 104 } 105 lastIndex = chs.length - 1; 106 for (int i = 1; i < chs.length; i++) { 107 fnode = fnode.addIfNoExist(charConvert(chs[i]), i == lastIndex); 108 } 109 } 110 } 111 } 112 113 /** 114 * 过滤判断 将敏感词转化为成屏蔽词 115 */ 116 public static final String doFilter(final String src) { 117 char[] chs = src.toCharArray(); 118 int length = chs.length; 119 int currc; 120 int k; 121 WordNode node; 122 for (int i = 0; i < length; i++) { 123 currc = charConvert(chs[i]); 124 if (!set.contains(currc)) { 125 continue; 126 } 127 node = nodes.get(currc); 128 if (node == null) 129 continue; 130 boolean couldMark = false; 131 int markNum = -1; 132 if (node.isLast()) { 133 couldMark = true; 134 markNum = 0; 135 } 136 k = i; 137 for (; ++k < length;) { 138 int temp = charConvert(chs[k]); 139 if (stopwdSet.contains(temp)) 140 continue; 141 node = node.querySub(temp); 142 if (node == null) 143 break; 144 if (node.isLast()) { 145 couldMark = true; 146 markNum = k - i; 147 } 148 } 149 if (couldMark) { 150 for (k = 0; k <= markNum; k++) { 151 chs[k + i] = SIGN; 152 } 153 i = i + markNum; 154 } 155 } 156 157 return new String(chs); 158 } 159 160 /** 161 * 是否包含敏感词 162 */ 163 public static final boolean isContains(final String src) { 164 char[] chs = src.toCharArray(); 165 int length = chs.length; 166 int currc; 167 int k; 168 WordNode 
node; 169 for (int i = 0; i < length; i++) { 170 currc = charConvert(chs[i]); 171 if (!set.contains(currc)) { 172 continue; 173 } 174 node = nodes.get(currc); 175 if (node == null) 176 continue; 177 boolean couldMark = false; 178 if (node.isLast()) { 179 couldMark = true; 180 } 181 k = i; 182 for (; ++k < length;) { 183 int temp = charConvert(chs[k]); 184 if (stopwdSet.contains(temp)) 185 continue; 186 node = node.querySub(temp); 187 if (node == null) 188 break; 189 if (node.isLast()) { 190 couldMark = true; 191 } 192 } 193 if (couldMark) { 194 return true; 195 } 196 } 197 198 return false; 199 } 200 201 /** 202 * 大写转化为小写 全角转化为半角 203 */ 204 private static int charConvert(char src) { 205 int r = BCConvert.qj2bj(src); 206 return (r >= 'A' && r <= 'Z') ? r + 32 : r; 207 } 208 209 }
其中:
isContains:判断文本中是否包含敏感词,返回 true / false。
doFilter:过滤敏感词,将文本中命中的敏感词逐字符替换为 *。
2、WordNode敏感词节点:
1 package com.skyer.sensitivewdfilter; 2 3 import java.util.LinkedList; 4 import java.util.List; 5 6 public class WordNode { 7 8 private int value; // 节点名称 9 10 private List<WordNode> subNodes; // 子节点 11 12 private boolean isLast; // 默认false 13 14 public WordNode(int value) { 15 this.value = value; 16 } 17 18 public WordNode(int value, boolean isLast) { 19 this.value = value; 20 this.isLast = isLast; 21 } 22 23 /** 24 * @return 就是传入的subNode 25 */ 26 private WordNode addSubNode(final WordNode subNode) { 27 if (subNodes == null) 28 subNodes = new LinkedList<WordNode>(); 29 subNodes.add(subNode); 30 return subNode; 31 } 32 33 /** 34 * 有就直接返回该子节点, 没有就创建添加并返回该子节点 35 */ 36 public WordNode addIfNoExist(final int value, final boolean isLast) { 37 if (subNodes == null) { 38 return addSubNode(new WordNode(value, isLast)); 39 } 40 for (WordNode subNode : subNodes) { 41 if (subNode.value == value) { 42 if (!subNode.isLast && isLast) 43 subNode.isLast = true; 44 return subNode; 45 } 46 } 47 return addSubNode(new WordNode(value, isLast)); 48 } 49 50 public WordNode querySub(final int value) { 51 if (subNodes == null) { 52 return null; 53 } 54 for (WordNode subNode : subNodes) { 55 if (subNode.value == value) 56 return subNode; 57 } 58 return null; 59 } 60 61 public boolean isLast() { 62 return isLast; 63 } 64 65 public void setLast(boolean isLast) { 66 this.isLast = isLast; 67 } 68 69 @Override 70 public int hashCode() { 71 return value; 72 } 73 74 }
3、测试类:
1 package com.skyer.test; 2 3 import org.junit.Test; 4 5 import com.skyer.sensitivewdfilter.WordFilter; 6 7 public class TestSensitivewd { 8 9 @Test 10 public void TestFilter() { 11 String s = ""; // 这里写你要过滤的句子(我这里不能写,否则会给博客园屏蔽掉) 12 System.out.println("解析问题: " + s); 13 System.out.println("解析字数 : " + s.length()); 14 String re; 15 long nano = System.nanoTime(); 16 re = WordFilter.doFilter(s); 17 nano = (System.nanoTime() - nano); 18 System.out.println("解析时间 : " + nano + "ns"); 19 System.out.println("解析时间 : " + nano / 1000000 + "ms"); 20 System.out.println(re); 21 System.out.println(); 22 23 nano = System.nanoTime(); 24 System.out.println("是否包含敏感词: " + WordFilter.isContains(s)); 25 nano = (System.nanoTime() - nano); 26 System.out.println("解析时间 : " + nano + "ns"); 27 System.out.println("解析时间 : " + nano / 1000000 + "ms"); 28 } 29 30 }
4、测试结果:
原文参考:http://blog.csdn.net/fengshizty/article/details/52373005
DFA知识:http://www.cnblogs.com/naaoveGIS/archive/2016/10/14/5960352.html