jieba分词/jieba-analysis(java版)

简介

支持分词模式
Search模式,用于对用户查询词分词
Index模式,用于对索引文档分词
特性
支持多种分词模式
全角统一转成半角
用户词典功能
conf 目录有整理的搜狗细胞词库
因为性能原因,最新的快照版本去除词性标注,也希望有更好的 Pull Request 可以提供该功能。

简单使用

获取jieba-analysis

<dependency>
  <groupId>com.huaban</groupId>
  <artifactId>jieba-analysis</artifactId>
  <version>1.0.2</version>
</dependency>

案例

复制代码
/**
 * Runs the segmenter in INDEX mode over a few sample sentences and prints
 * the string form of each segmentation result on its own line.
 */
@Test
public void testDemo() {
    final JiebaSegmenter jieba = new JiebaSegmenter();
    final String[] samples = {
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
        "我不喜欢日本和服。",
        "雷猴回归人间。",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "结果婚的和尚未结过婚的"
    };

    // Walk the samples by index, in declaration order.
    int index = 0;
    while (index < samples.length) {
        Object tokens = jieba.process(samples[index], SegMode.INDEX);
        System.out.println(tokens.toString());
        index++;
    }
}
复制代码

原文链接:https://github.com/huaban/jieba-analysis

我的应用

复制代码
package com.analysis;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.List;
import java.util.UUID;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class jiebaTest {

</span><span style="color: #0000ff;">private</span> Connection con = <span style="color: #0000ff;">null</span><span style="color: #000000;">;
</span><span style="color: #0000ff;">private</span> PreparedStatement pstmt = <span style="color: #0000ff;">null</span><span style="color: #000000;">;

</span><span style="color: #008000;">/**</span><span style="color: #008000;">
 * 连接
 </span><span style="color: #008000;">*/</span><span style="color: #000000;">
@Before
</span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> beforeDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception {
    Class.forName(</span>"com.mysql.jdbc.Driver"<span style="color: #000000;">);
    String url </span>= "jdbc:mysql://localhost:3306/test?user=root&amp;password=root"<span style="color: #000000;">;
    con </span>=<span style="color: #000000;"> DriverManager.getConnection(url);
}

</span><span style="color: #008000;">/**</span><span style="color: #008000;">
 * 分词查询测试
 </span><span style="color: #008000;">*/</span><span style="color: #000000;">
@Test
</span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> getDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception {
    BufferedReader br </span>= <span style="color: #0000ff;">new</span> BufferedReader(<span style="color: #0000ff;">new</span><span style="color: #000000;"> InputStreamReader(System.in));
    String str </span>=<span style="color: #000000;"> br.readLine();

    String sql </span>= "select * from t_jieba where name = ?"<span style="color: #000000;">;
    pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql);

    pstmt.setString(</span>1<span style="color: #000000;">, str);
    ResultSet rs </span>=<span style="color: #000000;"> pstmt.executeQuery();

    </span><span style="color: #0000ff;">while</span><span style="color: #000000;"> (rs.next()) {
        System.out.println(rs.getInt(</span>1)+"--"+rs.getString(2)+"--"+rs.getString(3)+"--"+rs.getString(4)+"--"+rs.getString(5<span style="color: #000000;">));
        pstmt.clearParameters();
        String sql1 </span>= "update t_jieba set times = ? where id = ?"<span style="color: #000000;">;
        pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql1);
        pstmt.setInt(</span>1, 1+ <span style="color: #0000ff;">new</span> Integer(rs.getString(5<span style="color: #000000;">)));
        pstmt.setInt(</span>2, rs.getInt(1<span style="color: #000000;">));
        pstmt.executeUpdate();
    }
    
    rs.close();
    pstmt.close();
}

</span><span style="color: #008000;">/**</span><span style="color: #008000;">
 * 分词插入测试
 </span><span style="color: #008000;">*/</span><span style="color: #000000;">
@Test
</span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> addDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception {
    String sql </span>= "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? from DUAL where not EXISTS(select name from t_jieba where name=?)"<span style="color: #000000;">;
    pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql);
    JiebaSegmenter segmenter </span>= <span style="color: #0000ff;">new</span><span style="color: #000000;"> JiebaSegmenter();
    String[] sentences </span>= <span style="color: #0000ff;">new</span> String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃"<span style="color: #000000;"> };
    </span><span style="color: #0000ff;">for</span><span style="color: #000000;"> (String sentence : sentences) {
        </span><span style="color: #008000;">//</span><span style="color: #008000;">System.out.println(segmenter.process(sentence, SegMode.INDEX).toString());</span>
        String uuid =<span style="color: #000000;"> UUID.randomUUID().toString();
        uuid </span>= uuid.replace("-", ""<span style="color: #000000;">);
        List</span>&lt;SegToken&gt; list =<span style="color: #000000;"> segmenter.process(sentence, SegMode.INDEX);
        </span><span style="color: #0000ff;">for</span><span style="color: #000000;"> (SegToken segToken : list) {
            String name </span>=<span style="color: #000000;"> segToken.word.trim();
            </span><span style="color: #0000ff;">if</span> (name != <span style="color: #0000ff;">null</span> &amp;&amp; !""<span style="color: #000000;">.equals(name)) {
                pstmt.setString(</span>1<span style="color: #000000;">, segToken.word);
                pstmt.setString(</span>2<span style="color: #000000;">, uuid);
                pstmt.setString(</span>3<span style="color: #000000;">, sentence);
                pstmt.setString(</span>4, "0"<span style="color: #000000;">);
                pstmt.setString(</span>5<span style="color: #000000;">, segToken.word);
                pstmt.executeUpdate();
                pstmt.clearParameters();
            }
        }
    }
    pstmt.close();
    System.out.println(</span>"插入成功!"<span style="color: #000000;">);
}

}

复制代码
原文地址:https://www.cnblogs.com/bky-lzw/p/7799238.html
posted @ 2019-08-26 17:28  星朝  阅读(2833)  评论(0)  编辑  收藏  举报