Java word segmentation statistics
- Dependencies (IKAnalyzer, an open-source Chinese word segmenter built on Lucene)
<dependency>
    <groupId>org.wltea</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>5.0.2</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>8.2.0</version>
</dependency>
- Code
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
public class WordSegmentationStatistics {
    /**
     * Segment the text with IKAnalyzer and count how often each word occurs.
     * @param content the raw text
     * @return a map whose keys are words and whose values are occurrence counts
     */
    public static Map<String, Integer> textSegmentation(String content) {
        Map<String, Integer> map = new HashMap<>();
        try (Reader reader = new StringReader(content);
             IKAnalyzer analyzer = new IKAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("", reader)) {
            // The term attribute instance is reused for every token, so fetch it once before iterating.
            CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String word = termAttribute.toString();
                // Ignore single-character tokens.
                if (word.length() > 1) {
                    map.merge(word, 1, Integer::sum);
                }
            }
            tokenStream.end();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return map;
    }
    /**
     * Filter the map, removing words that occur fewer than n times.
     * @param map a map whose keys are words and whose values are occurrence counts
     * @param n   the minimum number of occurrences to keep
     */
    public static void wordFrequencyFilter(Map<String, Integer> map, int n) {
        Iterator<Map.Entry<String, Integer>> iterator = map.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry<String, Integer> entry = iterator.next();
            if (entry.getValue() < n) {
                iterator.remove();
            }
        }
    }
    /**
     * Sort the word cloud by frequency and return the n most frequent words.
     * @param map a map whose keys are words and whose values are occurrence counts
     * @param n   how many of the most frequent words to return
     * @return the sorted List<Entry>, highest frequency first
     */
    public static List<Map.Entry<String, Integer>> wordFrequencyRank(Map<String, Integer> map, int n) {
        // Max-heap ordered by count; compareTo avoids the overflow risk of subtracting int counts.
        PriorityQueue<Map.Entry<String, Integer>> queue =
                new PriorityQueue<>((a, b) -> b.getValue().compareTo(a.getValue()));
        queue.addAll(map.entrySet());
        List<Map.Entry<String, Integer>> list = new ArrayList<>(n);
        int count = 0;
        while (!queue.isEmpty() && count < n) {
            count++;
            list.add(queue.poll());
        }
        return list;
    }
    public static void main(String[] args) {
        // Demo with synthetic data: 50 "words" with random counts between 0 and 9.
        Map<String, Integer> map = new HashMap<>(100);
        for (int i = 1; i <= 50; i++) {
            map.put(String.valueOf(i), (int) (Math.random() * 10));
        }
        wordFrequencyFilter(map, 3);
        System.out.println(map.size() + ":" + map);
        List<Map.Entry<String, Integer>> list = wordFrequencyRank(map, 50);
        System.out.println(list.size() + ":" + list);
    }
}
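To tie the three methods together, here is a minimal end-to-end sketch. The demo class name, sample sentence, filter threshold, and top-n cut-off are arbitrary illustrations rather than part of the original code, and it assumes WordSegmentationStatistics plus the dependencies above are on the classpath.
import java.util.List;
import java.util.Map;
public class WordSegmentationDemo {
    public static void main(String[] args) {
        // Arbitrary sample text; replace it with your own content.
        String content = "我爱北京天安门,天安门上太阳升,北京是中国的首都";
        // 1. Segment the text and count each word.
        Map<String, Integer> counts = WordSegmentationStatistics.textSegmentation(content);
        // 2. Drop words that appear fewer than 2 times (example threshold).
        WordSegmentationStatistics.wordFrequencyFilter(counts, 2);
        // 3. Keep at most the 10 most frequent words (example cut-off).
        List<Map.Entry<String, Integer>> top = WordSegmentationStatistics.wordFrequencyRank(counts, 10);
        top.forEach(entry -> System.out.println(entry.getKey() + " -> " + entry.getValue()));
    }
}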
Without accumulating small steps, you will never reach a thousand li.