使用DFA算法对敏感词进行过滤

项目目录结构如下:

其中resources资源目录中:

stopwd.txt:停顿词,匹配时直接跳过(不参与敏感词匹配)。

wd.txt:敏感词库。

1、WordFilter敏感词过滤类:

  1 package com.skyer.sensitivewdfilter;
  2 
  3 import java.io.BufferedReader;
  4 import java.io.IOException;
  5 import java.io.InputStreamReader;
  6 import java.util.ArrayList;
  7 import java.util.HashMap;
  8 import java.util.HashSet;
  9 import java.util.List;
 10 import java.util.Map;
 11 import java.util.Set;
 12 
 13 /**
 14  * 思路: 创建一个FilterSet,枚举了0~65535的所有char是否是某个敏感词开头的状态
 15  * 
 16  * 判断是否是 敏感词开头 | | 是 不是 获取头节点 OK--下一个字 然后逐级遍历,DFA算法
 17  */
 18 public class WordFilter {
 19 
 20     private static final FilterSet set = new FilterSet(); // 存储首字
 21     private static final Map<Integer, WordNode> nodes = new HashMap<Integer, WordNode>(1024, 1); // 存储节点
 22     private static final Set<Integer> stopwdSet = new HashSet<Integer>(); // 停顿词
 23     private static final char SIGN = '*'; // 敏感词过滤替换
 24 
 25     static {
 26         try {
 27             long a = System.nanoTime();
 28             init();
 29             a = System.nanoTime() - a;
 30             System.out.println("加载时间 : " + a + "ns");
 31             System.out.println("加载时间 : " + a / 1000000 + "ms");
 32         } catch (Exception e) {
 33             throw new RuntimeException("初始化过滤器失败");
 34         }
 35     }
 36 
 37     private static void init() {
 38         // 获取敏感词
 39         addSensitiveWord(readWordFromFile("wd.txt"));
 40         addStopWord(readWordFromFile("stopwd.txt"));
 41     }
 42 
 43     /**
 44      * 增加敏感词
 45      */
 46     private static List<String> readWordFromFile(String path) {
 47         List<String> words;
 48         BufferedReader br = null;
 49         try {
 50             br = new BufferedReader(new InputStreamReader(WordFilter.class.getClassLoader().getResourceAsStream(path)));
 51             words = new ArrayList<String>(1200);
 52             for (String buf = ""; (buf = br.readLine()) != null;) {
 53                 if (buf == null || buf.trim().equals(""))
 54                     continue;
 55                 words.add(buf);
 56             }
 57         } catch (Exception e) {
 58             throw new RuntimeException(e);
 59         } finally {
 60             try {
 61                 if (br != null)
 62                     br.close();
 63             } catch (IOException e) {
 64             }
 65         }
 66         return words;
 67     }
 68 
 69     /**
 70      * 增加停顿词
 71      */
 72     private static void addStopWord(final List<String> words) {
 73         if (words != null && words.size() > 0) {
 74             char[] chs;
 75             for (String curr : words) {
 76                 chs = curr.toCharArray();
 77                 for (char c : chs) {
 78                     stopwdSet.add(charConvert(c));
 79                 }
 80             }
 81         }
 82     }
 83 
 84     /**
 85      * 添加DFA节点
 86      */
 87     private static void addSensitiveWord(final List<String> words) {
 88         if (words != null && words.size() > 0) {
 89             char[] chs;
 90             int fchar;
 91             int lastIndex;
 92             WordNode fnode; // 首字母节点
 93             for (String curr : words) {
 94                 chs = curr.toCharArray();
 95                 fchar = charConvert(chs[0]);
 96                 if (!set.contains(fchar)) {// 没有首字定义
 97                     set.add(fchar);// 首字标志位 可重复add
 98                     fnode = new WordNode(fchar, chs.length == 1);
 99                     nodes.put(fchar, fnode);
100                 } else {
101                     fnode = nodes.get(fchar);
102                     if (!fnode.isLast() && chs.length == 1)
103                         fnode.setLast(true);
104                 }
105                 lastIndex = chs.length - 1;
106                 for (int i = 1; i < chs.length; i++) {
107                     fnode = fnode.addIfNoExist(charConvert(chs[i]), i == lastIndex);
108                 }
109             }
110         }
111     }
112 
113     /**
114      * 过滤判断 将敏感词转化为成屏蔽词
115      */
116     public static final String doFilter(final String src) {
117         char[] chs = src.toCharArray();
118         int length = chs.length;
119         int currc;
120         int k;
121         WordNode node;
122         for (int i = 0; i < length; i++) {
123             currc = charConvert(chs[i]);
124             if (!set.contains(currc)) {
125                 continue;
126             }
127             node = nodes.get(currc);
128             if (node == null)
129                 continue;
130             boolean couldMark = false;
131             int markNum = -1;
132             if (node.isLast()) {
133                 couldMark = true;
134                 markNum = 0;
135             }
136             k = i;
137             for (; ++k < length;) {
138                 int temp = charConvert(chs[k]);
139                 if (stopwdSet.contains(temp))
140                     continue;
141                 node = node.querySub(temp);
142                 if (node == null)
143                     break;
144                 if (node.isLast()) {
145                     couldMark = true;
146                     markNum = k - i;
147                 }
148             }
149             if (couldMark) {
150                 for (k = 0; k <= markNum; k++) {
151                     chs[k + i] = SIGN;
152                 }
153                 i = i + markNum;
154             }
155         }
156 
157         return new String(chs);
158     }
159 
160     /**
161      * 是否包含敏感词
162      */
163     public static final boolean isContains(final String src) {
164         char[] chs = src.toCharArray();
165         int length = chs.length;
166         int currc;
167         int k;
168         WordNode node;
169         for (int i = 0; i < length; i++) {
170             currc = charConvert(chs[i]);
171             if (!set.contains(currc)) {
172                 continue;
173             }
174             node = nodes.get(currc);
175             if (node == null)
176                 continue;
177             boolean couldMark = false;
178             if (node.isLast()) {
179                 couldMark = true;
180             }
181             k = i;
182             for (; ++k < length;) {
183                 int temp = charConvert(chs[k]);
184                 if (stopwdSet.contains(temp))
185                     continue;
186                 node = node.querySub(temp);
187                 if (node == null)
188                     break;
189                 if (node.isLast()) {
190                     couldMark = true;
191                 }
192             }
193             if (couldMark) {
194                 return true;
195             }
196         }
197 
198         return false;
199     }
200 
201     /**
202      * 大写转化为小写 全角转化为半角
203      */
204     private static int charConvert(char src) {
205         int r = BCConvert.qj2bj(src);
206         return (r >= 'A' && r <= 'Z') ? r + 32 : r;
207     }
208 
209 }
WordFilter.java

其中:

      isContains :是否包含敏感词

     doFilter:过滤敏感词

2、WordNode敏感词节点:

 1 package com.skyer.sensitivewdfilter;
 2 
 3 import java.util.LinkedList;
 4 import java.util.List;
 5 
 6 public class WordNode {
 7 
 8     private int value; // 节点名称
 9 
10     private List<WordNode> subNodes; // 子节点
11 
12     private boolean isLast; // 默认false
13 
14     public WordNode(int value) {
15         this.value = value;
16     }
17 
18     public WordNode(int value, boolean isLast) {
19         this.value = value;
20         this.isLast = isLast;
21     }
22 
23     /**
24      * @return 就是传入的subNode
25      */
26     private WordNode addSubNode(final WordNode subNode) {
27         if (subNodes == null)
28             subNodes = new LinkedList<WordNode>();
29         subNodes.add(subNode);
30         return subNode;
31     }
32 
33     /**
34      * 有就直接返回该子节点, 没有就创建添加并返回该子节点
35      */
36     public WordNode addIfNoExist(final int value, final boolean isLast) {
37         if (subNodes == null) {
38             return addSubNode(new WordNode(value, isLast));
39         }
40         for (WordNode subNode : subNodes) {
41             if (subNode.value == value) {
42                 if (!subNode.isLast && isLast)
43                     subNode.isLast = true;
44                 return subNode;
45             }
46         }
47         return addSubNode(new WordNode(value, isLast));
48     }
49 
50     public WordNode querySub(final int value) {
51         if (subNodes == null) {
52             return null;
53         }
54         for (WordNode subNode : subNodes) {
55             if (subNode.value == value)
56                 return subNode;
57         }
58         return null;
59     }
60 
61     public boolean isLast() {
62         return isLast;
63     }
64 
65     public void setLast(boolean isLast) {
66         this.isLast = isLast;
67     }
68 
69     @Override
70     public int hashCode() {
71         return value;
72     }
73 
74 }
WordNode.java

3、测试类:

 1 package com.skyer.test;
 2 
 3 import org.junit.Test;
 4 
 5 import com.skyer.sensitivewdfilter.WordFilter;
 6 
 7 public class TestSensitivewd {
 8 
 9     @Test
10     public void TestFilter() {
11         String s = ""; // 这里写你要过滤的句子(我这里不能写,否则会给博客园屏蔽掉)
12         System.out.println("解析问题: " + s);
13         System.out.println("解析字数 : " + s.length());
14         String re;
15         long nano = System.nanoTime();
16         re = WordFilter.doFilter(s);
17         nano = (System.nanoTime() - nano);
18         System.out.println("解析时间 : " + nano + "ns");
19         System.out.println("解析时间 : " + nano / 1000000 + "ms");
20         System.out.println(re);
21         System.out.println();
22 
23         nano = System.nanoTime();
24         System.out.println("是否包含敏感词: " + WordFilter.isContains(s));
25         nano = (System.nanoTime() - nano);
26         System.out.println("解析时间 : " + nano + "ns");
27         System.out.println("解析时间 : " + nano / 1000000 + "ms");
28     }
29 
30 }
TestSensitivewd.java

4、测试结果:

原文参考:http://blog.csdn.net/fengshizty/article/details/52373005

DFA知识:http://www.cnblogs.com/naaoveGIS/archive/2016/10/14/5960352.html

posted @ 2017-05-23 16:30  我滴个小张张  阅读(493)  评论(0)  编辑  收藏  举报