ikanalyzer分词，计算信息熵排序分词结果

因需求，现需分词接口，故记录之。

1、需要依赖：

<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>

maven依赖

2、完整代码如下：

  1 public JSONArray entropy(String content, Integer quantity) throws Exception {
  2         List<String> words = extract(DelHtmlTagUtil.delHTMLTag(content), quantity);
  3         JSONArray array = calculateWordEntropy(words);
  4         return array;
  5     }
  6 
  7 /**
  8      * 传入String类型的文章，智能提取单词放入list
  9      *
 10      * @param content  传入分词的内容
 11      * @param quantity 截取关键字在几个单词以上的数量，默认为1
 12      * @return
 13      */
 14     private List<String> extract(String content, Integer quantity) throws IOException {
 15         List<String> list = Lists.newArrayList();
 16         StringReader reader = new StringReader(content);
 17         IKSegmenter ik = new IKSegmenter(reader, true);
 18         Lexeme lex = null;
 19         while ((lex = ik.next()) != null) {
 20             //String typeString = lex.getLexemeTypeString();  词语类型
 21             String word = lex.getLexemeText();
 22             if (word.length() > quantity) {//判断截取关键字在几个单词以上的数量
 23                 list.add(word);
 24             }
 25         }
 26         return list;
 27     }
 28 
 29     private JSONArray calculateWordEntropy(List<String> words) throws Exception{
 30 
 31         int length = words.size();
 32         ArrayList<String[]> wordList = new ArrayList<String[]>();
 33         // 将分好的词每3个一组存到数组中
 34         for (int i = 0; i < length; i++) {
 35 
 36             String[] wordSeg = new String[3];
 37             if (i == 0) {
 38                 wordSeg[0] = "null";
 39                 wordSeg[1] = words.get(i);
 40                 wordSeg[2] = words.get(i + 1);
 41             } else if (i == length - 1) {
 42                 wordSeg[0] = words.get(i - 1);
 43                 wordSeg[1] = words.get(i);
 44                 wordSeg[2] = "null";
 45             } else {
 46                 wordSeg[0] = words.get(i - 1);
 47                 wordSeg[1] = words.get(i);
 48                 wordSeg[2] = words.get(i + 1);
 49             }
 50 
 51             wordList.add(wordSeg);
 52 
 53         }
 54         // 去除重复的词
 55         List<String> lists = Lists.newArrayList();
 56         for (int l = 0; l < length; l++) {
 57             lists.add(words.get(l));
 58         }
 59         List<String> tempList = Lists.newArrayList();
 60         for (String str : lists) {
 61             if (!(tempList.contains(str))) {
 62                 tempList.add(str);
 63             }
 64         }
 65         String[] wordClean = new String[tempList.size()];
 66         for (int m = 0; m < tempList.size(); m++) {
 67             wordClean[m] = tempList.get(m);
 68         }
 69         // 统计每个词的词频
 70         int[] frequent = new int[wordClean.length];
 71         for (int j = 0; j < wordClean.length; j++) {
 72             int count = 0;
 73             for (int k = 0; k < words.size(); k++) {
 74                 if (wordClean[j].equals(words.get(k))) {
 75                     count++;
 76                 }
 77             }
 78             frequent[j] = count;
 79         }
 80         // 将三元组中中间的那个词相同的存到一个list中，然后计算该词的信息熵
 81         double[] allEntropy = new double[wordClean.length];
 82         for (int n = 0; n < wordClean.length; n++) {
 83             ArrayList<String[]> wordSegList = new ArrayList<String[]>();
 84             int count = 1;
 85             for (int p = 0; p < wordList.size(); p++) {
 86                 String[] wordSegStr = wordList.get(p);
 87                 if (wordSegStr[1].equals(wordClean[n])) {
 88                     count++;
 89                     wordSegList.add(wordSegStr);
 90                 }
 91             }
 92             String[] leftword = new String[wordSegList.size()];
 93             String[] rightword = new String[wordSegList.size()];
 94             // 计算左信息熵
 95             for (int i = 0; i < wordSegList.size(); i++) {
 96                 String[] left = wordSegList.get(i);
 97                 leftword[i] = left[0];
 98             }
 99             // 去除左边重复的词
100             List<String> listsLeft = new ArrayList<String>();
101             for (int l = 0; l < leftword.length; l++) {
102                 listsLeft.add(leftword[l]);
103             }
104             List<String> tempListLeft = new ArrayList<String>();
105             for (String str : listsLeft) {
106                 if (!(tempListLeft.contains(str))) {
107                     tempListLeft.add(str);
108                 }
109             }
110             String[] leftWordClean = new String[tempListLeft.size()];
111             for (int m = 0; m < tempListLeft.size(); m++) {
112                 leftWordClean[m] = tempListLeft.get(m);
113             }
114             // 统计左边每个词的词频
115             int[] leftFrequent = new int[leftWordClean.length];
116             for (int j = 0; j < leftWordClean.length; j++) {
117                 int leftcount = 0;
118                 for (int k = 0; k < leftword.length; k++) {
119                     if (leftWordClean[j].equals(leftword[k])) {
120                         leftcount++;
121                     }
122                 }
123                 leftFrequent[j] = leftcount;
124             }
125             // 计算左熵值
126             double leftEntropy = 0;
127             for (int i = 0; i < leftFrequent.length; i++) {
128                 double a = (double) leftFrequent[i] / count;
129                 double b = Math.log((double) leftFrequent[i] / count);
130                 leftEntropy += -a * b;
131                 // leftEntropy +=
132                 // (-(double)(leftFrequent[i]/count))*Math.log((double)(leftFrequent[i]/count));
133             }
134             // 计算右信息熵
135             for (int i = 0; i < wordSegList.size(); i++) {
136                 String[] right = wordSegList.get(i);
137                 rightword[i] = right[2];
138             }
139             // 去除右边重复的词
140             List<String> listsRight = new ArrayList<String>();
141             for (int l = 0; l < rightword.length; l++) {
142                 listsRight.add(rightword[l]);
143             }
144             List<String> tempListRight = new ArrayList<String>();
145             for (String str : listsRight) {
146                 if (!(tempListRight.contains(str))) {
147                     tempListRight.add(str);
148                 }
149             }
150             String[] rightWordClean = new String[tempListRight.size()];
151             for (int m = 0; m < tempListRight.size(); m++) {
152                 rightWordClean[m] = tempListRight.get(m);
153             }
154             // 统计右边每个词的词频
155             int[] rightFrequent = new int[rightWordClean.length];
156             for (int j = 0; j < rightWordClean.length; j++) {
157                 int rightcount = 0;
158                 for (int k = 0; k < rightword.length; k++) {
159                     if (rightWordClean[j].equals(rightword[k])) {
160                         rightcount++;
161                     }
162                 }
163                 rightFrequent[j] = rightcount;
164             }
165             // 计算右熵值
166             double rightEntropy = 0.0;
167             for (int i = 0; i < rightFrequent.length; i++) {
168                 double a = (double) rightFrequent[i] / count;
169                 double b = Math.log((double) rightFrequent[i] / count);
170                 rightEntropy += -a * b;
171                 // rightEntropy +=
172                 // (-(double)(rightFrequent[i]/count))*Math.log((double)(rightFrequent[i]/count));
173             }
174             // 计算词的总信息熵
175             double wordEntropy = leftEntropy + rightEntropy;
176             allEntropy[n] = wordEntropy;
177 
178         }
179         JSONArray list = new JSONArray();
180         for (int i = 0; i < allEntropy.length; i++) {
181             JSONObject obj = new JSONObject();
182             obj.put("name", wordClean[i]);
183             obj.put("entropy", allEntropy[i]);
184             list.add(obj);
185         }
186         Collections.sort(list, (o1, o2) -> {
187             Double d1 = ((JSONObject) o1).getDouble("entropy");
188             Double d2 = ((JSONObject) o2).getDouble("entropy");
189             return d2.compareTo(d1);
190         });
191 
192         return list;
193     }

处理代理

posted @ 2018-07-24 15:30 羽哲 阅读(432) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

羽哲

ikanalyzer分词，计算信息熵排序分词结果

公告