jieba分词/jieba-analysis(java版)
简介
支持分词模式
Search模式,用于对用户查询词分词
Index模式,用于对索引文档分词
特性
支持多种分词模式
全角统一转成半角
用户词典功能
conf 目录有整理的搜狗细胞词库
因为性能原因,最新的快照版本去除词性标注,也希望有更好的 Pull Request 可以提供该功能。
简单使用
获取jieba-analysis
<dependency> <groupId>com.huaban</groupId> <artifactId>jieba-analysis</artifactId> <version>1.0.2</version> </dependency>
案例
/**
 * Runs the segmenter over a few sample sentences in INDEX mode and prints
 * the resulting token list for each sentence to standard output.
 */
@Test
public void testDemo() {
    String[] samples = {
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
        "我不喜欢日本和服。",
        "雷猴回归人间。",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "结果婚的和尚未结过婚的"
    };
    JiebaSegmenter segmenter = new JiebaSegmenter();

    for (int i = 0; i < samples.length; i++) {
        // One line of output per input sentence.
        String rendered = segmenter.process(samples[i], SegMode.INDEX).toString();
        System.out.println(rendered);
    }
}
原文链接:https://github.com/huaban/jieba-analysis
我的应用
package com.analysis;import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.List;
import java.util.UUID;import org.junit.Before;
import org.junit.Test;import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;public class jiebaTest {
</span><span style="color: #0000ff;">private</span> Connection con = <span style="color: #0000ff;">null</span><span style="color: #000000;">; </span><span style="color: #0000ff;">private</span> PreparedStatement pstmt = <span style="color: #0000ff;">null</span><span style="color: #000000;">; </span><span style="color: #008000;">/**</span><span style="color: #008000;"> * 连接 </span><span style="color: #008000;">*/</span><span style="color: #000000;"> @Before </span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> beforeDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception { Class.forName(</span>"com.mysql.jdbc.Driver"<span style="color: #000000;">); String url </span>= "jdbc:mysql://localhost:3306/test?user=root&password=root"<span style="color: #000000;">; con </span>=<span style="color: #000000;"> DriverManager.getConnection(url); } </span><span style="color: #008000;">/**</span><span style="color: #008000;"> * 分词查询测试 </span><span style="color: #008000;">*/</span><span style="color: #000000;"> @Test </span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> getDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception { BufferedReader br </span>= <span style="color: #0000ff;">new</span> BufferedReader(<span style="color: #0000ff;">new</span><span style="color: #000000;"> InputStreamReader(System.in)); String str </span>=<span style="color: #000000;"> br.readLine(); String sql </span>= "select * from t_jieba where name = ?"<span style="color: #000000;">; pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql); pstmt.setString(</span>1<span style="color: #000000;">, str); ResultSet rs </span>=<span style="color: #000000;"> pstmt.executeQuery(); </span><span style="color: #0000ff;">while</span><span style="color: #000000;"> (rs.next()) { 
System.out.println(rs.getInt(</span>1)+"--"+rs.getString(2)+"--"+rs.getString(3)+"--"+rs.getString(4)+"--"+rs.getString(5<span style="color: #000000;">)); pstmt.clearParameters(); String sql1 </span>= "update t_jieba set times = ? where id = ?"<span style="color: #000000;">; pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql1); pstmt.setInt(</span>1, 1+ <span style="color: #0000ff;">new</span> Integer(rs.getString(5<span style="color: #000000;">))); pstmt.setInt(</span>2, rs.getInt(1<span style="color: #000000;">)); pstmt.executeUpdate(); } rs.close(); pstmt.close(); } </span><span style="color: #008000;">/**</span><span style="color: #008000;"> * 分词插入测试 </span><span style="color: #008000;">*/</span><span style="color: #000000;"> @Test </span><span style="color: #0000ff;">public</span> <span style="color: #0000ff;">void</span> addDemo() <span style="color: #0000ff;">throws</span><span style="color: #000000;"> Exception { String sql </span>= "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? 
from DUAL where not EXISTS(select name from t_jieba where name=?)"<span style="color: #000000;">; pstmt </span>=<span style="color: #000000;"> con.prepareStatement(sql); JiebaSegmenter segmenter </span>= <span style="color: #0000ff;">new</span><span style="color: #000000;"> JiebaSegmenter(); String[] sentences </span>= <span style="color: #0000ff;">new</span> String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃"<span style="color: #000000;"> }; </span><span style="color: #0000ff;">for</span><span style="color: #000000;"> (String sentence : sentences) { </span><span style="color: #008000;">//</span><span style="color: #008000;">System.out.println(segmenter.process(sentence, SegMode.INDEX).toString());</span> String uuid =<span style="color: #000000;"> UUID.randomUUID().toString(); uuid </span>= uuid.replace("-", ""<span style="color: #000000;">); List</span><SegToken> list =<span style="color: #000000;"> segmenter.process(sentence, SegMode.INDEX); </span><span style="color: #0000ff;">for</span><span style="color: #000000;"> (SegToken segToken : list) { String name </span>=<span style="color: #000000;"> segToken.word.trim(); </span><span style="color: #0000ff;">if</span> (name != <span style="color: #0000ff;">null</span> && !""<span style="color: #000000;">.equals(name)) { pstmt.setString(</span>1<span style="color: #000000;">, segToken.word); pstmt.setString(</span>2<span style="color: #000000;">, uuid); pstmt.setString(</span>3<span style="color: #000000;">, sentence); pstmt.setString(</span>4, "0"<span style="color: #000000;">); pstmt.setString(</span>5<span style="color: #000000;">, segToken.word); pstmt.executeUpdate(); pstmt.clearParameters(); } } } pstmt.close(); System.out.println(</span>"插入成功!"<span style="color: #000000;">); }
}