Elasticsearch in practice: modifying the IK plugin source to update the dictionary dynamically
Download the IK source
https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0
Pick the tag matching your IK version (the plugin version should match your ES version exactly).
Check your ES version at http://localhost:9200/?pretty; mine is 6.5.1.
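For reference, the root endpoint returns something like this (abridged; your values will differ):

{
  "name" : "node-1",
  "cluster_name" : "elasticsearch",
  "version" : {
    "number" : "6.5.1"
  },
  "tagline" : "You Know, for Search"
}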
Modify the source
1. Create an ext package and add three class files
DBHelper
package org.wltea.analyzer.ext;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

public class DBHelper {

    Logger logger = Loggers.getLogger(DBHelper.class);

    public static String url = null;
    public static String dbUser = null;
    public static String dbPwd = null;
    public static String dbTable = null;

    private Connection conn;
    // Remembers the last import time per field, so each poll only fetches new rows
    public static Map<String, Date> lastImportTimeMap = new HashMap<String, Date>();

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL JDBC driver
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private Connection getConn() throws Exception {
        try {
            conn = DriverManager.getConnection(url, dbUser, dbPwd); // open the connection
        } catch (Exception e) {
            logger.warn("failed to open database connection");
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Fetch dictionary words from the database.
     *
     * @param key       name of the column holding the word
     * @param type      word type: 0 = extension word, 1 = stop word
     * @param delete    true = fetch deleted (disabled) words, false = fetch active words
     * @param flag      true = only load rows updated since the last import
     * @param synonyStr separator appended after each word
     * @return the matching words joined into one string
     * @throws Exception
     */
    public String getKey(String key, Integer type, boolean delete, boolean flag, String synonyStr) throws Exception {
        conn = getConn();
        StringBuilder data = new StringBuilder();
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            StringBuilder sql = new StringBuilder("select * from " + dbTable + " where 1=1");
            // lastImportTime: timestamp of the previous import
            Date lastImportTime = DBHelper.lastImportTimeMap.get(key);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            if (lastImportTime != null && flag) {
                sql.append(" and update_time > '" + sdf.format(lastImportTime) + "'");
            }
            sql.append(" and " + key + " !=''");
            if (type != null) {
                // note the leading space, missing in the original, which glued
                // this clause onto the previous one
                sql.append(" and word_type=" + type);
            }
            if (delete) {
                sql.append(" and delete_type=" + 1);
            } else {
                sql.append(" and delete_type=" + 0);
            }
            lastImportTime = new Date();
            lastImportTimeMap.put(key, lastImportTime);
            // If the logged time differs from local time, check whether the JVM
            // time zone matches the server's system time zone
            logger.warn("sql==={}", sql.toString());
            ps = conn.prepareStatement(sql.toString());
            rs = ps.executeQuery();
            while (rs.next()) {
                String value = rs.getString(key);
                if (StringUtils.isNotBlank(value)) {
                    if (StringUtils.isNotBlank(synonyStr)) {
                        data.append(value + synonyStr);
                    } else {
                        data.append(value + ",");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (rs != null) {
                    rs.close();
                }
                if (ps != null) {
                    ps.close();
                }
                conn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return data.toString();
    }
}
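One design note: getKey assembles the WHERE clause by concatenating values (including the formatted timestamp) into the SQL string. The values come from our own configuration and clock, so this works, but binding them as parameters is safer. A minimal sketch of the same query-building logic as a hypothetical helper inside DBHelper (not part of the original; assumes the usual java.sql and java.util imports):

// Hypothetical refactor: bind values as parameters instead of concatenating them.
// Table and column names cannot be bound; they must come from trusted config only.
private PreparedStatement buildQuery(Connection conn, String key, Integer type,
                                     boolean delete, Date lastImportTime) throws SQLException {
    StringBuilder sql = new StringBuilder("select * from " + dbTable + " where " + key + " != ''");
    List<Object> params = new ArrayList<>();
    if (lastImportTime != null) {
        sql.append(" and update_time > ?");
        params.add(new java.sql.Timestamp(lastImportTime.getTime()));
    }
    if (type != null) {
        sql.append(" and word_type = ?");
        params.add(type);
    }
    sql.append(" and delete_type = ?");
    params.add(delete ? 1 : 0);
    PreparedStatement ps = conn.prepareStatement(sql.toString());
    for (int i = 0; i < params.size(); i++) {
        ps.setObject(i + 1, params.get(i)); // JDBC parameter indexes are 1-based
    }
    return ps;
}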
DBRunnable
package org.wltea.analyzer.ext;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.dic.Dictionary;

import java.util.Arrays;
import java.util.List;

public class DBRunnable implements Runnable {

    Logger logger = Loggers.getLogger(DBRunnable.class);

    private String wordField;

    public DBRunnable(String wordField) {
        super();
        this.wordField = wordField;
    }

    @Override
    public void run() {
        logger.warn("start loading dictionary from database========");
        // get the dictionary singleton
        Dictionary dic = Dictionary.getSingleton();
        DBHelper dbHelper = new DBHelper();
        try {
            String extWords = dbHelper.getKey(wordField, 0, false, true, ",");
            String stopWords = dbHelper.getKey(wordField, 1, false, true, ",");
            String extDelWords = dbHelper.getKey(wordField, 0, true, true, ",");
            String extStopWords = dbHelper.getKey(wordField, 1, true, true, ",");
            if (StringUtils.isNotBlank(extWords)) {
                List<String> extList = Arrays.asList(extWords.split(","));
                // load the extension words into the main dictionary
                dic.addWords(extList);
                logger.warn("loaded extension words========");
                logger.warn("extWords==={}", extWords);
            }
            if (StringUtils.isNotBlank(stopWords)) {
                List<String> stopList = Arrays.asList(stopWords.split(","));
                // load the stop words into the stop-word dictionary
                dic.addStopWords(stopList);
                logger.warn("loaded stop words========");
                logger.warn("stopWords==={}", stopWords);
            }
            // remove words marked as deleted
            if (StringUtils.isNotBlank(extDelWords)) {
                List<String> delList = Arrays.asList(extDelWords.split(","));
                // disable extension words in the main dictionary
                dic.disableWords(delList);
                logger.warn("removed extension words========");
                logger.warn("extDelWords==={}", extDelWords);
            }
            if (StringUtils.isNotBlank(extStopWords)) {
                List<String> delStopList = Arrays.asList(extStopWords.split(","));
                // disable stop words in the stop-word dictionary
                dic.disableStopWords(delStopList);
                logger.warn("removed stop words========");
                logger.warn("extStopWords==={}", extStopWords);
            }
        } catch (Exception e) {
            // catch everything: an uncaught exception would cancel all further
            // runs scheduled via scheduleAtFixedRate
            logger.warn("failed to load words from database========{}", e);
        }
    }
}
StringUtils
package org.wltea.analyzer.ext;

public class StringUtils {

    /**
     * Returns true if the string is null, empty, or whitespace-only.
     */
    public static boolean isBlank(String str) {
        int strLen;
        if (str == null || (strLen = str.length()) == 0) {
            return true;
        }
        for (int i = 0; i < strLen; i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the string is not blank.
     */
    public static boolean isNotBlank(String str) {
        return !StringUtils.isBlank(str);
    }
}
2. Add a few methods to Dictionary (addWords and disableWords, which DBRunnable also calls, already exist in the stock source; we add their stop-word counterparts plus the job initializer)
/**
 * Batch-load new stop words.
 *
 * @param words Collection<String> of words
 */
public void addStopWords(Collection<String> words) {
    if (words != null) {
        for (String word : words) {
            if (word != null) {
                // load the word into the in-memory stop-word dictionary
                _StopWords.fillSegment(word.trim().toCharArray());
            }
        }
    }
}

/**
 * Batch-remove stop words.
 *
 * @param words Collection<String> of words
 */
public void disableStopWords(Collection<String> words) {
    if (words != null) {
        for (String word : words) {
            if (word != null) {
                // disable the word in the in-memory stop-word dictionary
                _StopWords.disableSegment(word.trim().toCharArray());
            }
        }
    }
}

/**
 * Read the JDBC config and schedule the job that refreshes
 * the dictionary from the database.
 *
 * @throws IOException
 */
public void initReloadMysqlWordJob() throws IOException {
    logger.warn("============IKAnalyzer==============");
    Path file = PathUtils.get(getDictRoot(), "jdbc.properties");
    Properties prop = new Properties();
    prop.load(new FileInputStream(file.toFile()));
    logger.info("===========load jdbc.properties========");
    for (Object key : prop.keySet()) {
        logger.info("==========>>" + key + "=" + prop.getProperty(String.valueOf(key)));
    }
    boolean autoReloadDic = Boolean.valueOf(prop.getProperty("autoReloadDic"));
    if (autoReloadDic) {
        String dbUser = prop.getProperty("dbUser");
        String dbPwd = prop.getProperty("dbPwd");
        // polling interval in seconds (no fallback here: flushTime must be set)
        Integer flushTime = Integer.valueOf(prop.getProperty("flushTime"));
        String dbTable = prop.getProperty("dbTable", "t_es_ik_dic");
        DBHelper.dbTable = dbTable;
        DBHelper.dbUser = dbUser;
        DBHelper.dbPwd = dbPwd;
        DBHelper.url = prop.getProperty("dbUrl");
        String wordFieldName = prop.getProperty("wordFieldName");
        ScheduledExecutorService scheduledExecutorService = Executors.newSingleThreadScheduledExecutor();
        scheduledExecutorService.scheduleAtFixedRate(new DBRunnable(wordFieldName), 0, flushTime, TimeUnit.SECONDS);
    }
}
3. Start the job in the initial method
public static synchronized Dictionary initial(Configuration cfg) {
    if (singleton == null) {
        synchronized (Dictionary.class) {
            if (singleton == null) {
                singleton = new Dictionary(cfg);
                singleton.loadMainDict();
                singleton.loadSurnameDict();
                singleton.loadQuantifierDict();
                singleton.loadSuffixDict();
                singleton.loadPrepDict();
                singleton.loadStopWordDict();
                try {
                    singleton.initReloadMysqlWordJob();
                } catch (IOException e) {
                    logger.error("failed to start the MySQL word-reload job....");
                    e.printStackTrace();
                }
                if (cfg.isEnableRemoteDict()) {
                    // start monitor threads for the remote dictionaries
                    for (String location : singleton.getRemoteExtDictionarys()) {
                        // 10 is the initial delay, 60 the interval, both in seconds
                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                    }
                    for (String location : singleton.getRemoteExtStopWordDictionarys()) {
                        pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
                    }
                }
                return singleton;
            }
        }
    }
    return singleton;
}
Install the modified plugin into Elasticsearch
1. Package the plugin (run mvn clean package; the zip should appear under target/releases)
2. Move the zip into the Elasticsearch plugins folder, unzip it, and rename the resulting directory to ik, as sketched below.
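For example (paths and the zip name depend on your build and layout; unzipping with -d ik skips the separate rename step):

cd /path/to/elasticsearch-6.5.1/plugins
unzip /path/to/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-6.5.1.zip -d ik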
3. Create a jdbc.properties file in the ik directory's config folder:
# database connection URL (host, port, and database are placeholders)
dbUrl=jdbc:mysql://ip:port/dbname
# database user
dbUser=user
# database password
dbPwd=password
# dictionary table
dbTable=md_es_ik_dic
# word column name
wordFieldName=word
# refresh interval in seconds
flushTime=5
# enable dynamic reloading
autoReloadDic=true

Keep the comments on their own lines: java.util.Properties treats # as a comment only at the start of a line, so a trailing # comment would become part of the value.
4. Create the database table
DROP TABLE IF EXISTS `md_es_ik_dic`;
CREATE TABLE `md_es_ik_dic` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'auto-increment id',
  `word` varchar(100) DEFAULT '' COMMENT 'dictionary word',
  `word_type` varchar(100) DEFAULT '' COMMENT '0: extension word, 1: stop word',
  `delete_type` tinyint(4) DEFAULT '0' COMMENT '0: active, 1: deleted',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8 COMMENT='dictionary maintenance table';

Note that word_type is declared varchar(100) even though getKey compares it numerically (word_type=0); MySQL casts implicitly, but a tinyint column would be a cleaner fit.
5. Add the MySQL driver jar to the Elasticsearch lib directory
6. Start Elasticsearch and test
Send a GET request to http://127.0.0.1:9200/_analyze with this JSON body:
{ "analyzer":"ik_max_word", "text":"我是一名小正太" }
Tokenization result
{ "tokens": [ { "token": "我", "start_offset": 0, "end_offset": 1, "type": "CN_CHAR", "position": 0 }, { "token": "是", "start_offset": 1, "end_offset": 2, "type": "CN_CHAR", "position": 1 }, { "token": "一名", "start_offset": 2, "end_offset": 4, "type": "CN_WORD", "position": 2 }, { "token": "一", "start_offset": 2, "end_offset": 3, "type": "TYPE_CNUM", "position": 3 }, { "token": "名", "start_offset": 3, "end_offset": 4, "type": "COUNT", "position": 4 }, { "token": "小", "start_offset": 4, "end_offset": 5, "type": "CN_CHAR", "position": 5 }, { "token": "正", "start_offset": 5, "end_offset": 6, "type": "CN_CHAR", "position": 6 }, { "token": "太", "start_offset": 6, "end_offset": 7, "type": "CN_CHAR", "position": 7 } ] }
If we also want 小正太 to be tokenized as a single word, add a row to the database.
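Assuming the table above, a row like this marks 小正太 as an extension word (word_type '0'):

INSERT INTO md_es_ik_dic (word, word_type, delete_type) VALUES ('小正太', '0', 0);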
The Elasticsearch log then prints the reload SQL and the newly loaded word.
Test the tokenization again:
{ "tokens": [ { "token": "我", "start_offset": 0, "end_offset": 1, "type": "CN_CHAR", "position": 0 }, { "token": "是", "start_offset": 1, "end_offset": 2, "type": "CN_CHAR", "position": 1 }, { "token": "一名", "start_offset": 2, "end_offset": 4, "type": "CN_WORD", "position": 2 }, { "token": "一", "start_offset": 2, "end_offset": 3, "type": "TYPE_CNUM", "position": 3 }, { "token": "名", "start_offset": 3, "end_offset": 4, "type": "COUNT", "position": 4 }, { "token": "小正太", "start_offset": 4, "end_offset": 7, "type": "CN_WORD", "position": 5 } ] }
You can see that 小正太 is now tokenized as a single word.
Possible issues
Startup error: Plugin [analysis-ik] was built for Elasticsearch version 6.5.0 but version 6.5.1 is running
The ES version and the plugin's declared version must match exactly. Edit plugin-descriptor.properties in the ik directory and change the declared version to the running ES version.
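The relevant line in plugin-descriptor.properties is the declared ES version; in my case:

elasticsearch.version=6.5.1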
Database driver not found
Add the MySQL driver dependency to the IK pom.xml, and put the driver jar into the ES lib directory.
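A typical dependency block for the pom (the version is an example; pick one compatible with your MySQL server):

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.46</version>
</dependency>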
Connection error: The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.
Inside the Elasticsearch plugin sandbox this usually means the Java security manager is blocking the socket connection to MySQL. Edit java.policy under the JRE's lib/security directory (mine is at /Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/jre/lib/security) and add:
permission java.net.SocketPermission "ip:port", "listen,accept,connect,resolve";
The file may be read-only; switch to root to edit it.