IK 分词器支持特殊符号、标点符号、驼峰分词
从 GitHub 下载对应版本的 IK 分词器源码,修改 CharacterUtil.identifyCharType 方法,将特殊符号、标点符号以及大写英文字母当作中文字符处理即可。
添加:
// Treat every uppercase ASCII letter as a split point (camelCase segmentation).
if (input >= 'A' && input <= 'Z') return CHAR_CHINESE;
// The branches below extend the Unicode-block else-if chain of identifyCharType.
else if (ub == Character.UnicodeBlock.GREEK // Greek
        // Greek Extended
        || ub == Character.UnicodeBlock.GREEK_EXTENDED
        // Basic Latin
        || ub == Character.UnicodeBlock.BASIC_LATIN
        // Latin-1 Supplement
        || ub == Character.UnicodeBlock.LATIN_1_SUPPLEMENT
        // Latin Extended-A
        || ub == Character.UnicodeBlock.LATIN_EXTENDED_A
        // Latin Extended-B
        || ub == Character.UnicodeBlock.LATIN_EXTENDED_B) {
    return CHAR_CHINESE;
} else if (ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
        || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
        // NOTE(review): in the complete method this block is already matched by the
        // earlier CHAR_OTHER_CJK branch, so this check has no effect there.
        || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
        || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
        || ub == Character.UnicodeBlock.VERTICAL_FORMS) {
    // Punctuation
    return CHAR_CHINESE;
}
完整的 CharacterUtil.identifyCharType 方法:
/**
 * Classifies a single character into one of the CHAR_* categories.
 *
 * <p>ASCII digits map to CHAR_ARABIC and lowercase ASCII letters to CHAR_ENGLISH.
 * Uppercase ASCII letters are deliberately reported as CHAR_CHINESE so that each one
 * acts as a split point (camelCase segmentation). Greek/Latin symbol blocks and the
 * listed punctuation blocks are also reported as CHAR_CHINESE instead of being dropped.
 *
 * @param input the character to classify
 * @return CHAR_ARABIC, CHAR_ENGLISH, CHAR_CHINESE, CHAR_OTHER_CJK, or CHAR_USELESS
 *         when the character belongs to none of the handled groups
 */
static int identifyCharType(char input) {
    if (input >= '0' && input <= '9') {
        return CHAR_ARABIC;
    }
    if (input >= 'a' && input <= 'z') {
        return CHAR_ENGLISH;
    }
    if (input >= 'A' && input <= 'Z') {
        // Uppercase letters are split points for camelCase words.
        return CHAR_CHINESE;
    }

    Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);

    // Known Chinese ideograph blocks.
    if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
        return CHAR_CHINESE;
    }

    // Full-width digits/letters plus Korean and Japanese scripts.
    if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
            // Korean
            || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
            || ub == Character.UnicodeBlock.HANGUL_JAMO
            || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
            // Japanese
            || ub == Character.UnicodeBlock.HIRAGANA // hiragana
            || ub == Character.UnicodeBlock.KATAKANA // katakana
            || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
        return CHAR_OTHER_CJK;
    }

    // Greek and Latin blocks (this includes the ASCII symbols that are not letters or
    // digits, since those were handled above) are treated like Chinese characters.
    if (ub == Character.UnicodeBlock.GREEK
            || ub == Character.UnicodeBlock.GREEK_EXTENDED
            || ub == Character.UnicodeBlock.BASIC_LATIN
            || ub == Character.UnicodeBlock.LATIN_1_SUPPLEMENT
            || ub == Character.UnicodeBlock.LATIN_EXTENDED_A
            || ub == Character.UnicodeBlock.LATIN_EXTENDED_B) {
        return CHAR_CHINESE;
    }

    // Punctuation blocks are treated like Chinese characters as well.
    // HALFWIDTH_AND_FULLWIDTH_FORMS is intentionally not repeated here: it is already
    // matched by the CHAR_OTHER_CJK branch above, so a second check could never be reached.
    if (ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
            || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
            || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
            || ub == Character.UnicodeBlock.VERTICAL_FORMS) {
        return CHAR_CHINESE;
    }

    // Everything else is ignored.
    return CHAR_USELESS;
}
之后通过 maven 打包为 jar,替换掉原来使用的 ik 包中的 jar 即可。
另外,因为需要驼峰分词,而 IK 分词器默认会将英文大写字母转换为小写,所以还需要调整大小写转换的相关方法( CharacterUtil.regularize 方法)。下面的实现只有在 lowercase 参数为 true 时才会把大写字母转换为小写:
/**
 * Normalizes a character before it is classified.
 *
 * <p>The ideographic space (U+3000) becomes an ASCII space, code points strictly
 * between U+FF00 and U+FF5F are shifted down by 0xFEE0, and ASCII uppercase letters
 * are lowercased only when {@code lowercase} is true. Exactly one of these rules is
 * applied per call, so a shifted character is not lowercased afterwards.
 *
 * @param input     the character to normalize
 * @param lowercase whether ASCII uppercase letters should be converted to lowercase
 * @return the normalized character, or {@code input} unchanged if no rule applies
 */
static char regularize(char input, boolean lowercase) {
    if (input == 0x3000) {
        return ' ';
    }
    if (input > 0xFF00 && input < 0xFF5F) {
        return (char) (input - 0xFEE0);
    }
    if (lowercase && input >= 'A' && input <= 'Z') {
        return (char) (input + ('a' - 'A'));
    }
    return input;
}
创建索引时再配合词根过滤器使用更佳:
{ "settings": { "index.number_of_shards": 3, "index.max_result_window": 20000000, "number_of_replicas": 1, "index.refresh_interval": "60s", "index.highlight.max_analyzed_offset": "10000", "analysis": { "analyzer": { "ik_local_analyzer": { "type": "custom", "tokenizer": "ik_max_word", "filter": [ "stemmer" ] } } } }, "mappings": { "properties": { "id": { "type": "keyword" }, "businessNumber": { "type": "keyword" }, "remoteIp": { "type": "text" }, "remoteSystemId": { "type": "keyword" }, "remoteSystemName": { "type": "keyword" }, "businessType": { "type": "keyword" }, "logType": { "type": "keyword" }, "logContent": { "type": "text", "analyzer": "ik_local_analyzer" }, "createTime": { "type": "long" }, "elasticAck": { "type": "keyword", "index": false } } } }
当你看清人们的真相,于是你知道了,你可以忍受孤独