Java word segmentation statistics

  1. Dependencies (IKAnalyzer, an open-source Lucene-based Chinese word segmenter, originally hosted on Google Code; a dictionary configuration sketch follows the POM snippets below)
<dependency>
  <groupId>org.wltea</groupId>
  <artifactId>ikanalyzer</artifactId>
  <version>5.0.2</version>
</dependency>

<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-core</artifactId>
  <version>8.2.0</version>
</dependency>
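
Optionally, IK Analyzer can load extension dictionaries and stopword lists from an IKAnalyzer.cfg.xml file on the classpath. A minimal sketch, assuming the stock IK Analyzer configuration format; ext.dic and stopword.dic are placeholder names for your own dictionary files:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
  <comment>IK Analyzer extension configuration</comment>
  <!-- one word per line in each .dic file, UTF-8 encoded -->
  <entry key="ext_dict">ext.dic</entry>
  <entry key="ext_stopwords">stopword.dic</entry>
</properties>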
  2. Code
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;

public class WordSegmentationStatistics {

    /**
     * Segments text with IKAnalyzer and returns each word with its occurrence count.
     * @param content the raw text
     * @return map (key is the word, value is its occurrence count)
     */
    public static Map<String, Integer> textSegmentation(String content) {
        Map<String, Integer> map = new HashMap<>();
        try(Reader reader = new StringReader(content);
            IKAnalyzer ss = new IKAnalyzer();
            TokenStream tokenStream = ss.tokenStream("", reader)
        ) {
            // The attribute instance is reused across tokens; fetch it once before iterating.
            CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String word = termAttribute.toString();
                // Skip single-character tokens, which are mostly noise in Chinese text.
                if (word.length() > 1) {
                    map.merge(word, 1, Integer::sum);
                }
            }
            // end() is part of the TokenStream contract and should run after the last token.
            tokenStream.end();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return map;
    }
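
    // Note (an assumption, not stated in the original post): IK Analyzer also offers an
    // IKAnalyzer(boolean useSmart) constructor. new IKAnalyzer() above runs in fine-grained
    // mode and may emit overlapping tokens; new IKAnalyzer(true) enables "smart" mode,
    // which merges them and usually produces cleaner frequency counts.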

    /**
     * Filters the map, removing words that occur fewer than n times.
     * @param map map (key is the word, value is its occurrence count)
     * @param n the minimum occurrence count to keep
     */
    public static void wordFrequencyFilter(Map<String, Integer> map, int n) {
        // removeIf on the values view removes the matching entries from the map itself.
        map.values().removeIf(v -> v < n);
    }

    /**
     * Sorts the word frequencies and extracts the n most frequent words.
     * @param map map (key is the word, value is its occurrence count)
     * @param n the number of top words to return
     * @return the sorted List of entries, most frequent first
     */
    public static List<Map.Entry<String,Integer>> wordFrequencyRank(Map<String, Integer> map, int n) {
        // Max-heap on the count; comparingByValue avoids the int-overflow risk of
        // subtracting values directly in the comparator.
        Comparator<Map.Entry<String, Integer>> byCountDesc =
                Map.Entry.comparingByValue(Comparator.reverseOrder());
        PriorityQueue<Map.Entry<String, Integer>> queue = new PriorityQueue<>(byCountDesc);
        queue.addAll(map.entrySet());
        List<Map.Entry<String,Integer>> list = new ArrayList<>(n);
        int count = 0;
        while (!queue.isEmpty() && count < n) {
            count++;
            list.add(queue.poll());
        }
        return list;
    }

    public static void main(String[] args) {
        // Self-test with synthetic data: 50 "words" with random counts in [0, 9].
        Map<String, Integer> map = new HashMap<>(100);
        for (int i = 1; i <= 50; i++) {
            map.put(String.valueOf(i), (int) (Math.random() * 10));
        }
        wordFrequencyFilter(map, 3);
        System.out.println(map.size() + ":" + map);
        List<Map.Entry<String, Integer>> list = wordFrequencyRank(map, 50);
        System.out.println(list.size() + ":" + list);
    }

}
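
For real input, the three methods chain naturally: segment, filter, then rank. A minimal usage sketch (the sample sentence and thresholds are illustrative, not from the original post):

import java.util.List;
import java.util.Map;

public class WordSegmentationDemo {
    public static void main(String[] args) {
        String content = "今天天气很好，今天适合出门，天气好的日子适合散步。";
        // 1. Segment the text and count each word.
        Map<String, Integer> counts = WordSegmentationStatistics.textSegmentation(content);
        // 2. Drop words that occur fewer than 2 times.
        WordSegmentationStatistics.wordFrequencyFilter(counts, 2);
        // 3. Print up to the 5 most frequent words.
        List<Map.Entry<String, Integer>> top = WordSegmentationStatistics.wordFrequencyRank(counts, 5);
        top.forEach(e -> System.out.println(e.getKey() + " -> " + e.getValue()));
    }
}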