ikanalyzer分词,计算信息熵排序分词结果
因需求,现需分词接口,故记录之。
1、需要依赖:
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
2、完整代码如下:
1 public JSONArray entropy(String content, Integer quantity) throws Exception { 2 List<String> words = extract(DelHtmlTagUtil.delHTMLTag(content), quantity); 3 JSONArray array = calculateWordEntropy(words); 4 return array; 5 } 6 7 /** 8 * 传入String类型的文章,智能提取单词放入list 9 * 10 * @param content 传入分词的内容 11 * @param quantity 截取关键字在几个单词以上的数量,默认为1 12 * @return 13 */ 14 private List<String> extract(String content, Integer quantity) throws IOException { 15 List<String> list = Lists.newArrayList(); 16 StringReader reader = new StringReader(content); 17 IKSegmenter ik = new IKSegmenter(reader, true); 18 Lexeme lex = null; 19 while ((lex = ik.next()) != null) { 20 //String typeString = lex.getLexemeTypeString(); 词语类型 21 String word = lex.getLexemeText(); 22 if (word.length() > quantity) {//判断截取关键字在几个单词以上的数量 23 list.add(word); 24 } 25 } 26 return list; 27 } 28 29 private JSONArray calculateWordEntropy(List<String> words) throws Exception{ 30 31 int length = words.size(); 32 ArrayList<String[]> wordList = new ArrayList<String[]>(); 33 // 将分好的词每3个一组存到数组中 34 for (int i = 0; i < length; i++) { 35 36 String[] wordSeg = new String[3]; 37 if (i == 0) { 38 wordSeg[0] = "null"; 39 wordSeg[1] = words.get(i); 40 wordSeg[2] = words.get(i + 1); 41 } else if (i == length - 1) { 42 wordSeg[0] = words.get(i - 1); 43 wordSeg[1] = words.get(i); 44 wordSeg[2] = "null"; 45 } else { 46 wordSeg[0] = words.get(i - 1); 47 wordSeg[1] = words.get(i); 48 wordSeg[2] = words.get(i + 1); 49 } 50 51 wordList.add(wordSeg); 52 53 } 54 // 去除重复的词 55 List<String> lists = Lists.newArrayList(); 56 for (int l = 0; l < length; l++) { 57 lists.add(words.get(l)); 58 } 59 List<String> tempList = Lists.newArrayList(); 60 for (String str : lists) { 61 if (!(tempList.contains(str))) { 62 tempList.add(str); 63 } 64 } 65 String[] wordClean = new String[tempList.size()]; 66 for (int m = 0; m < tempList.size(); m++) { 67 wordClean[m] = tempList.get(m); 68 } 69 // 统计每个词的词频 70 int[] frequent = new int[wordClean.length]; 71 for (int 
j = 0; j < wordClean.length; j++) { 72 int count = 0; 73 for (int k = 0; k < words.size(); k++) { 74 if (wordClean[j].equals(words.get(k))) { 75 count++; 76 } 77 } 78 frequent[j] = count; 79 } 80 // 将三元组中中间的那个词相同的存到一个list中,然后计算该词的信息熵 81 double[] allEntropy = new double[wordClean.length]; 82 for (int n = 0; n < wordClean.length; n++) { 83 ArrayList<String[]> wordSegList = new ArrayList<String[]>(); 84 int count = 1; 85 for (int p = 0; p < wordList.size(); p++) { 86 String[] wordSegStr = wordList.get(p); 87 if (wordSegStr[1].equals(wordClean[n])) { 88 count++; 89 wordSegList.add(wordSegStr); 90 } 91 } 92 String[] leftword = new String[wordSegList.size()]; 93 String[] rightword = new String[wordSegList.size()]; 94 // 计算左信息熵 95 for (int i = 0; i < wordSegList.size(); i++) { 96 String[] left = wordSegList.get(i); 97 leftword[i] = left[0]; 98 } 99 // 去除左边重复的词 100 List<String> listsLeft = new ArrayList<String>(); 101 for (int l = 0; l < leftword.length; l++) { 102 listsLeft.add(leftword[l]); 103 } 104 List<String> tempListLeft = new ArrayList<String>(); 105 for (String str : listsLeft) { 106 if (!(tempListLeft.contains(str))) { 107 tempListLeft.add(str); 108 } 109 } 110 String[] leftWordClean = new String[tempListLeft.size()]; 111 for (int m = 0; m < tempListLeft.size(); m++) { 112 leftWordClean[m] = tempListLeft.get(m); 113 } 114 // 统计左边每个词的词频 115 int[] leftFrequent = new int[leftWordClean.length]; 116 for (int j = 0; j < leftWordClean.length; j++) { 117 int leftcount = 0; 118 for (int k = 0; k < leftword.length; k++) { 119 if (leftWordClean[j].equals(leftword[k])) { 120 leftcount++; 121 } 122 } 123 leftFrequent[j] = leftcount; 124 } 125 // 计算左熵值 126 double leftEntropy = 0; 127 for (int i = 0; i < leftFrequent.length; i++) { 128 double a = (double) leftFrequent[i] / count; 129 double b = Math.log((double) leftFrequent[i] / count); 130 leftEntropy += -a * b; 131 // leftEntropy += 132 // (-(double)(leftFrequent[i]/count))*Math.log((double)(leftFrequent[i]/count)); 133 } 
134 // 计算右信息熵 135 for (int i = 0; i < wordSegList.size(); i++) { 136 String[] right = wordSegList.get(i); 137 rightword[i] = right[2]; 138 } 139 // 去除右边重复的词 140 List<String> listsRight = new ArrayList<String>(); 141 for (int l = 0; l < rightword.length; l++) { 142 listsRight.add(rightword[l]); 143 } 144 List<String> tempListRight = new ArrayList<String>(); 145 for (String str : listsRight) { 146 if (!(tempListRight.contains(str))) { 147 tempListRight.add(str); 148 } 149 } 150 String[] rightWordClean = new String[tempListRight.size()]; 151 for (int m = 0; m < tempListRight.size(); m++) { 152 rightWordClean[m] = tempListRight.get(m); 153 } 154 // 统计右边每个词的词频 155 int[] rightFrequent = new int[rightWordClean.length]; 156 for (int j = 0; j < rightWordClean.length; j++) { 157 int rightcount = 0; 158 for (int k = 0; k < rightword.length; k++) { 159 if (rightWordClean[j].equals(rightword[k])) { 160 rightcount++; 161 } 162 } 163 rightFrequent[j] = rightcount; 164 } 165 // 计算右熵值 166 double rightEntropy = 0.0; 167 for (int i = 0; i < rightFrequent.length; i++) { 168 double a = (double) rightFrequent[i] / count; 169 double b = Math.log((double) rightFrequent[i] / count); 170 rightEntropy += -a * b; 171 // rightEntropy += 172 // (-(double)(rightFrequent[i]/count))*Math.log((double)(rightFrequent[i]/count)); 173 } 174 // 计算词的总信息熵 175 double wordEntropy = leftEntropy + rightEntropy; 176 allEntropy[n] = wordEntropy; 177 178 } 179 JSONArray list = new JSONArray(); 180 for (int i = 0; i < allEntropy.length; i++) { 181 JSONObject obj = new JSONObject(); 182 obj.put("name", wordClean[i]); 183 obj.put("entropy", allEntropy[i]); 184 list.add(obj); 185 } 186 Collections.sort(list, (o1, o2) -> { 187 Double d1 = ((JSONObject) o1).getDouble("entropy"); 188 Double d2 = ((JSONObject) o2).getDouble("entropy"); 189 return d2.compareTo(d1); 190 }); 191 192 return list; 193 }