IKAnalyzerUtil Chinese Word Segmentation

1. Create the word-segmentation utility class IKAnalyzerUtil:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import xxx.xxx.xxx.utils.PinYin4j;

/**
 * Word segmentation and sensitive-word filtering utility
 * 
 * @author light-zhang
 *
 */
public class IKAnalyzerUtil {

    private static final Logger logger = LoggerFactory.getLogger(IKAnalyzerUtil.class);
    /**
     * Map of sensitive words (word -> word) for constant-time lookup
     */
    public static Map<String, String> sensitiveWordMap;

    /**
     * Initialize the sensitive-word dictionary.
     * 
     * @param sensitiveWordSet
     *            the set of sensitive words
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        // size the map up front to reduce rehashing
        sensitiveWordMap = new HashMap<String, String>(sensitiveWordSet.size());
        for (String sensitiveWord : sensitiveWordSet) {
            sensitiveWordMap.put(sensitiveWord, sensitiveWord);
        }
    }

    /**
     * Check whether the text contains any sensitive word.
     *
     * @param txt
     *            the text to check
     * @return true if a sensitive word is found, false otherwise
     */
    public static boolean contains(String txt) {
        for (String word : segment(txt)) {
            if (sensitiveWordMap.get(word) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extract the sensitive words contained in the text.
     *
     * @param txt
     *            the text to check
     * @return the set of sensitive words found in the text
     */
    public static Set<String> getSensitiveWord(String txt) {
        Set<String> sensitiveWords = new HashSet<>();
        for (String word : segment(txt)) {
            if (sensitiveWordMap.get(word) != null) {
                sensitiveWords.add(word);
            }
        }
        return sensitiveWords;
    }

    /**
     * Replace sensitive words character by character.
     *
     * @param txt
     *            the text
     * @param replaceChar
     *            the replacement character; every character of a matched word is
     *            replaced, e.g. text "我爱中国人", sensitive word "中国人",
     *            replacement '*' yields "我爱***"
     * @return the text with sensitive words masked
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) {
        String resultTxt = txt;
        // collect all sensitive words occurring in the text
        Set<String> sensitiveWords = getSensitiveWord(txt);
        for (String sensitiveWord : sensitiveWords) {
            String replaceString = getReplaceChars(replaceChar, sensitiveWord.length());
            // use replace() rather than replaceAll(): the word must be matched
            // literally, not interpreted as a regular expression
            resultTxt = resultTxt.replace(sensitiveWord, replaceString);
        }
        return resultTxt;
    }

    /**
     * Replace sensitive words with a fixed string.
     *
     * @param txt
     *            the text
     * @param replaceStr
     *            the replacement string; each matched word is replaced as a whole,
     *            e.g. text "我爱中国人", sensitive word "中国人",
     *            replacement "[屏蔽]" yields "我爱[屏蔽]"
     * @return the text with sensitive words masked
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) {
        String resultTxt = txt;
        // collect all sensitive words occurring in the text
        Set<String> sensitiveWords = getSensitiveWord(txt);
        for (String sensitiveWord : sensitiveWords) {
            // replace() performs a literal match, avoiding regex surprises
            resultTxt = resultTxt.replace(sensitiveWord, replaceStr);
        }
        return resultTxt;
    }

    /**
     * Build a replacement string of the given length.
     *
     * @param replaceChar
     *            the character to repeat
     * @param length
     *            the number of repetitions
     * @return the replacement string
     */
    private static String getReplaceChars(char replaceChar, int length) {
        StringBuilder resultReplace = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            resultReplace.append(replaceChar);
        }
        return resultReplace.toString();
    }

    /**
     * Segment a sentence into words.
     *
     * @param text
     *            the sentence
     * @return the set of segmented tokens (empty on error, never null)
     */
    public static Set<String> segment(String text) {
        if (sensitiveWordMap == null) {
            sensitiveWordSet();// lazily seed the demo sensitive words
        }
        Set<String> words = new HashSet<String>();
        try {
            StringReader re = new StringReader(text);
            IKSegmenter ik = new IKSegmenter(re, true);// true = smart segmentation mode
            Lexeme lex;
            while ((lex = ik.next()) != null) {
                words.add(lex.getLexemeText());
            }
        } catch (IOException e) {
            logger.error("Word segmentation failed", e);
        }
        return words;
    }

    /**
     * Seed a demo sensitive-word set
     */
    public static void sensitiveWordSet() {
        Set<String> sensitiveWordSet = new HashSet<>();
        sensitiveWordSet.add("中国");
        sensitiveWordSet.add("政府");
        IKAnalyzerUtil.init(sensitiveWordSet);// initialize the sensitive-word dictionary
    }

    public static void main(String[] args) throws IOException {
        String text = "小欧隔着屏幕都能感受到你的喜悦呢,为顾客提供优质的服务是OPPO一贯的追求,感谢您的点赞!OPPO也一直致力于给用户带来惊喜的用户体验,FindX拥有一体无瑕机身,屏占比高达93.8%,全隐藏式3D摄像头再搭配全新渐变色,实现四周渐变显得格外惊艳,相信这次又会给您带来些不一样的使用体验哦!";
        System.out.println(StringUtils.join(IKAnalyzerUtil.segment(text), ","));
        System.out.println(PinYin4j.getPinYin(StringUtils.join(IKAnalyzerUtil.segment(text), ",")));
    }

}
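
For reference, a minimal usage sketch of the sensitive-word filter above (the class name SensitiveFilterDemo is made up for illustration, the demo words "中国" and "政府" are seeded lazily by sensitiveWordSet(), and the printed results depend on how IK actually tokenizes the input):

public class SensitiveFilterDemo {
    public static void main(String[] args) {
        String text = "中国政府发布了新公告";
        // true if IK emits "中国" or "政府" as a token
        System.out.println(IKAnalyzerUtil.contains(text));
        // e.g. [政府, 中国]
        System.out.println(IKAnalyzerUtil.getSensitiveWord(text));
        // e.g. "****发布了新公告" (character-by-character masking)
        System.out.println(IKAnalyzerUtil.replaceSensitiveWord(text, '*'));
        // e.g. "[屏蔽][屏蔽]发布了新公告" (whole-word replacement)
        System.out.println(IKAnalyzerUtil.replaceSensitiveWord(text, "[屏蔽]"));
    }
}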

2. Create the file IKAnalyzer.cfg.xml under the src directory (so it ends up at the root of the classpath):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extended configuration</comment>
    <!-- Configure your own extension dictionaries here; do not use absolute paths -->
    <entry key="ext_dict">
        cc/test/common/word/product.dic;
        cc/test/common/word/search.dic
    </entry>
    <!-- Configure your own stop-word dictionary here; do not use absolute paths -->
    <entry key="ext_stopwords">
        cc/test/common/word/stopWords.dic
    </entry>
</properties>
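
IK normally loads IKAnalyzer.cfg.xml from the root of the classpath, and the dictionary entries above are resolved as classpath resources as well, which is why relative paths are required. A quick way to check that an extension dictionary was actually picked up is to segment a term that only exists in it, for example 香辣鸡丁 from the dictionary below (a hedged sketch; if the dictionary is loaded, the term should come back as a single token instead of being split apart):

import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class DictCheck {
    public static void main(String[] args) throws Exception {
        // smart mode; "香辣鸡丁" should appear as one token if product.dic is loaded
        IKSegmenter ik = new IKSegmenter(new StringReader("这份香辣鸡丁很下饭"), true);
        Lexeme lex;
        while ((lex = ik.next()) != null) {
            System.out.println(lex.getLexemeText());
        }
    }
}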

3. Create the extension dictionaries under cc/test/common/word/ (plain text, one term per line):

阿根鲜
鸡丁
香辣鸡丁
妙香
精装品
皮蛋
土豆片
干土豆片
傲娇
土鸭皮蛋
椒香
豆笋
香辣
豆皮
吮指
牛肉
牛肉酱
鲜香味
蒲家
禧盈门
红汤
川味
火锅
底料
酥骨
鸡尖
清门蟹
螃蟹
脆李
青脆李
单袋
山椒味
包邮
脱骨
李子
新鲜
直达
顺丰
中通
套餐装
脆香
下酒菜
藕丁
藕霸
藕霸啊
香菇酱
鱿鱼须
卤毛肚
毛肚
下饭
下饭酱
就酱
香辣口味
现摘
现摘现发
特级果
一级果
三门清蟹
脆
李
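
The .dic files must be saved as UTF-8; a byte-order mark at the start of the file is a common reason the first entry silently fails to load. A small sanity check (the path is illustrative and depends on where the dictionaries live in your project):

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class DicEncodingCheck {
    public static void main(String[] args) throws Exception {
        // print each entry in brackets so stray BOM/whitespace characters are visible
        Files.readAllLines(Paths.get("src/cc/test/common/word/product.dic"), StandardCharsets.UTF_8)
             .forEach(word -> System.out.println("[" + word + "]"));
    }
}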