elasticsearch 拼音检索能力研究

gitchennan/elasticsearch-analysis-lc-pinyin

配置参数少，功能满足需求。

对应版本

elasticsearch2.3.2 对应 elasticsearch-analysis-lc-pinyin 分支 2.4.2.1 或者 tag 2.2.2.1

创建一个类型

elasticsearch-analysis-lc-pinyin 的 README 是根据 elasticsearch5.0 编写的，给出的创建一个类型的语法如下

curl -XPOST http://localhost:9200/index/_mapping/brand -d'
{
  "brand": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "lc_index",
        "search_analyzer": "lc_search",
        "term_vector": "with_positions_offsets"
      }
    }
  }
}'

type=text 是 elasticsearch5.0 之后的类型，所以无法创建成功，稍作修改 type=text，使用如下语法创建一个类型

curl -XPOST http://localhost:9200/index/_mapping/brand -d'
{
  "brand": {
    "properties": {
      "name": {
        "type": "string",
        "analyzer": "lc_index",
        "search_analyzer": "lc_search",
        "term_vector": "with_positions_offsets"
      }
    }
  }
}'

index 索引结构如下

{
  "index": {
    "aliases": {},
    "mappings": {
      "brand": {
        "properties": {
          "name": {
            "type": "string",
            "term_vector": "with_positions_offsets",
            "analyzer": "lc_index",
            "search_analyzer": "lc_search"
          }
        }
      }
    },
    "settings": {
      "index": {
        "creation_date": "1490152096129",
        "number_of_shards": "5",
        "number_of_replicas": "1",
        "uuid": "Lp1sSHGhQZyZ57LKO5KwRQ",
        "version": {
          "created": "2030299"
        }
      }
    },
    "warmers": {}
  }
}

存入几条数据

curl -XPOST http://localhost:9200/index/brand/1 -d'{"name":"百度"}'
curl -XPOST http://localhost:9200/index/brand/8 -d'{"name":"百度糯米"}'
curl -XPOST http://localhost:9200/index/brand/2 -d'{"name":"阿里巴巴"}'
curl -XPOST http://localhost:9200/index/brand/3 -d'{"name":"腾讯科技"}'
curl -XPOST http://localhost:9200/index/brand/4 -d'{"name":"网易游戏"}'
curl -XPOST http://localhost:9200/index/brand/9 -d'{"name":"大众点评"}'
curl -XPOST http://localhost:9200/index/brand/10 -d'{"name":"携程旅行网"}'

查出目前的所有数据

http://localhost:9200/index/_search
{
  "took": 70,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 7,
    "max_score": 1,
    "hits": [
      {
        "_index": "index",
        "_type": "brand",
        "_id": "8",
        "_score": 1,
        "_source": {
          "name": "百度糯米"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "9",
        "_score": 1,
        "_source": {
          "name": "大众点评"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "10",
        "_score": 1,
        "_source": {
          "name": "携程旅行网"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "2",
        "_score": 1,
        "_source": {
          "name": "阿里巴巴"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "4",
        "_score": 1,
        "_source": {
          "name": "网易游戏"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "1",
        "_score": 1,
        "_source": {
          "name": "百度"
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "3",
        "_score": 1,
        "_source": {
          "name": "腾讯科技"
        }
      }
    ]
  }
}

插件自带分词器 lc_index

原文：lc_index : 该分词器用于索引数据时指定，将中文转换为全拼和首字，同时保留中文
分词器分词效果

curl -X POST -d '{
  "analyzer" : "lc_index",
  "text" : ["刘德华"]
}' "http://localhost:9200/lc/_analyze"

{
  "tokens": [
    {
      "token": "刘",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "l",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "德",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "d",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "华",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "h",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    }
  ]
}

插件自带分词器 lc_search

原文：lc_search: 该分词器用于拼音搜索时指定，按最小拼音分词个数拆分拼音，优先拆分全拼

curl -X POST -d '{
  "analyzer" : "lc_search",
  "text" : ["刘德华"]
}' "http://localhost:9200/index/_analyze"

{
  "tokens": [
    {
      "token": "刘",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "德",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "华",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    }
  ]
}

拼音全拼

搜索 baidu，结果正确

curl -X POST -d '{
    "query": {
        "match": {
          "name": {
            "query": "baidu",
            "analyzer": "lc_search",
            "type": "phrase"
          }
        }
    },
    "highlight" : {
        "pre_tags" : ["<tag1>"],
        "post_tags" : ["</tag1>"],
        "fields" : {
            "name" : {}
        }
    }
}' "http://localhost:9200/index/brand/_search"

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 1.4054651,
    "hits": [
      {
        "_index": "index",
        "_type": "brand",
        "_id": "8",
        "_score": 1.4054651,
        "_source": {
          "name": "百度糯米"
        },
        "highlight": {
          "name": [
            "<tag1>百度</tag1>糯米"
          ]
        }
      },
      {
        "_index": "index",
        "_type": "brand",
        "_id": "1",
        "_score": 0.38356602,
        "_source": {
          "name": "百度"
        },
        "highlight": {
          "name": [
            "<tag1>百度</tag1>"
          ]
        }
      }
    ]
  }
}

单字拼音全拼与中文混合

搜索 xie程lu行，结果正确

{
  "took": 11,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 2.459564,
    "hits": [
      {
        "_index": "index",
        "_type": "brand",
        "_id": "10",
        "_score": 2.459564,
        "_source": {
          "name": "携程旅行网"
        },
        "highlight": {
          "name": [
            "<tag1>携程旅行</tag1>网"
          ]
        }
      }
    ]
  }
}

单字拼音首字母与中文混合

搜索 携cl行，结果正确

curl -X POST -d '{
    "query": {
        "match": {
          "name": {
            "query": "携cl行",
            "analyzer": "lc_search",
            "type": "phrase"
          }
        }
    },
    "highlight" : {
        "pre_tags" : ["<tag1>"],
        "post_tags" : ["</tag1>"],
        "fields" : {
            "name" : {}
        }
    }
}' "http://localhost:9200/index/brand/_search"

{
  "took": 6,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 2.459564,
    "hits": [
      {
        "_index": "index",
        "_type": "brand",
        "_id": "10",
        "_score": 2.459564,
        "_source": {
          "name": "携程旅行网"
        },
        "highlight": {
          "name": [
            "<tag1>携程旅行</tag1>网"
          ]
        }
      }
    ]
  }
}

拼音首字母

搜索 albb，结果正确

curl -X POST -d '{
    "query": {
        "match": {
          "name": {
            "query": "albb",
            "analyzer": "lc_search",
            "type": "phrase"
          }
        }
    },
    "highlight" : {
        "pre_tags" : ["<tag1>"],
        "post_tags" : ["</tag1>"],
        "fields" : {
            "name" : {}
        }
    }
}' "http://localhost:9200/index/brand/_search"

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 2.828427,
    "hits": [
      {
        "_index": "index",
        "_type": "brand",
        "_id": "2",
        "_score": 2.828427,
        "_source": {
          "name": "阿里巴巴"
        },
        "highlight": {
          "name": [
            "<tag1>阿里巴巴</tag1>"
          ]
        }
      }
    ]
  }
}

结论

elasticsearch-analysis-lc-pinyin 按照全拼、首字母，拼音中文混合搜索

elasticsearch-analysis-pinyin v1.7.2

github 项目 elasticsearch-analysis-pinyin v1.7.2 完全是为 elasticsearch 2.3.2 服务

first_letter 改变

first_letter=prefix padding_char=" "

curl -X POST -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "prefix",
						"padding_char" : " "
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl/_analyze"

{
  "tokens": [
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 1
    },
    {
      "token": "de",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    }
  ]
}

first_letter=append padding_char=" "

curl -X POST -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "append",
						"padding_char" : " "
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl2"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl2/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    }
  ]
}

first_letter=only padding_char=" "

curl -X POST -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "only",
						"padding_char" : " "
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl3"

拼音效果如下

curl -X POST -H "Cache-Control: no-cache" -H "Postman-Token: 67015c0d-cd07-961b-4c46-da90f7d558d8" -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl3/_analyze"

{
  "tokens": [
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}

first_letter=none padding_char=" "

curl -X POST -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "none",
						"padding_char" : " "
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl4"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl4/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 2
    }
  ]
}

padding_char 改变

first_letter=prefix padding_char=""

curl -X POST -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "prefix",
						"padding_char" : ""
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl5"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl5/_analyze"

{
  "tokens": [
    {
      "token": "ldhliudehua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}

first_letter=append padding_char=""

curl -X PUT -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "append",
						"padding_char" : ""
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl7"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl7/_analyze"

{
  "tokens": [
    {
      "token": "liudehualdh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}

first_letter=only padding_char=""

curl -X PUT -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "only",
						"padding_char" : ""
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl8"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl8/_analyze"


{
  "tokens": [
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}

first_letter=none padding_char=""

curl -X PUT -d '{
"mappings": {
		 "folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		 }
	  },
	"settings": {
		"index" : {
			"analysis" : {
				"analyzer" : {
					"pinyin_analyzer" : {
						"tokenizer" : "my_pinyin",
						"filter" : ["word_delimiter"]
					}
				},
				"tokenizer" : {
					"my_pinyin" : {
						"type" : "pinyin",
						"first_letter" : "none",
						"padding_char" : ""
					}
				}
			}
		}
	}
}' "http://localhost:9200/medcl9"

拼音效果如下

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl9/_analyze"

{
  "tokens": [
    {
      "token": "liudehua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}

结论

elasticsearch 2.3.2 对应 elasticsearch-analysis-pinyin 1.7.2，pinyin 1.7.2 可配置参数有：first_letter 和 padding_char。
padding_char 的作用是将字符串按照什么字符分隔，比如 padding_char = " "，那么 刘德华 将先被分隔为 刘，德，华；如果 padding_char = ""，那么 刘德华 将不会被分隔
first_letter 取值：prefix，append，only，none。
padding_char 与 first_letter 的组合会影响拼音输出的结果

elasticsearch-analysis-pinyin 2.x 分支

github 项目 elasticsearch-analysis-pinyin 2.x 分支 是为 elasticsearch 2.x 服务，经过测试 elasticsearch 2.3.2 也可以使用该插件。

官方文档中的说明

remove_duplicated_term when this option enabled, duplicated term will be removed to save index, eg: de的>de, default: false, NOTE: position related query maybe influenced
keep_first_letter when this option enabled, eg: 刘德华>ldh, default: true
keep_separate_first_letter when this option enabled, will keep first letters separately, eg: 刘德华>l,d,h, default: false, NOTE: query result maybe too fuzziness due to term too frequency
limit_first_letter_length set max length of the first_letter result, default: 16
keep_full_pinyin when this option enabled, eg: 刘德华> [liu,de,hua], default: true
keep_joined_full_pinyin when this option enabled, eg: 刘德华> [liudehua], default: false
keep_none_chinese keep non chinese letter or number in result, default: true
keep_none_chinese_together keep non chinese letter together, default: true, eg: DJ音乐家 -> DJ,yin,yue,jia, when set to false, eg: DJ音乐家 -> D,J,yin,yue,jia, NOTE: keep_none_chinese should be enabled first
keep_none_chinese_in_first_letter keep non Chinese letters in first letter, eg: 刘德华AT2016->ldhat2016, default: true
none_chinese_pinyin_tokenize break non chinese letters into separate pinyin term if they are pinyin, default: true, eg: liudehuaalibaba13zhuanghan -> liu,de,hua,a,li,ba,ba,13,zhuang,han, NOTE: keep_none_chinese and keep_none_chinese_together should be enabled first
keep_original when this option enabled, will keep original input as well, default: false
lowercase lowercase non Chinese letters, default: true
trim_whitespace default: true

基准配置

基准配置参数

"keep_joined_full_pinyin": "false",
"lowercase": "true",
"keep_original": "false",
"keep_none_chinese_together": "true",
"remove_duplicated_term": "false",
"keep_first_letter": "true",
"keep_separate_first_letter": "false",
"trim_whitespace": "true",
"keep_none_chinese": "true",
"limit_first_letter_length": "16",
"keep_full_pinyin": "true"

创建索引与分词器

curl -X POST -d '{
	"mappings": {
		"folk": {
			"properties": {
			   "text": {
				  "type": "string",
				  "analyzer": "pinyin_analyzer"
			   }
			}
		}
	},
	"settings": {
			"index" : {
	        "analysis" : {
	            "analyzer" : {
	                "pinyin_analyzer" : {
	                    "tokenizer" : "my_pinyin"
	                    }
	            },
	            "tokenizer" : {
	                "my_pinyin" : {
	                    "type" : "pinyin",
          						"remove_duplicated_term" : false,
          						"keep_joined_full_pinyin" : false,
	                    "keep_separate_first_letter" : false,
          						"keep_first_letter" : true,
          						"limit_first_letter_length" : 16,
	                    "keep_full_pinyin" : true,
	                    "keep_original" : true,
	                    "keep_none_chinese" : true,
						          "keep_none_chinese_together" : true,
	                    "lowercase" : true,
						          "trim_whitespace" : true
	                }
	            }
	        }
	    }
	}
}' "http://localhost:9200/medcl20"

生成索引结构

curl -X GET "http://localhost:9200/medcl20"
{
  "medcl20": {
    "aliases": {},
    "mappings": {
      "folk": {
        "properties": {
          "text": {
            "type": "string",
            "analyzer": "pinyin_analyzer"
          }
        }
      }
    },
    "settings": {
      "index": {
        "creation_date": "1490170676090",
        "analysis": {
          "analyzer": {
            "pinyin_analyzer": {
              "tokenizer": "my_pinyin"
            }
          },
          "tokenizer": {
            "my_pinyin": {
              "keep_joined_full_pinyin": "false",
              "lowercase": "true",
              "keep_original": "true",
              "keep_none_chinese_together": "true",
              "remove_duplicated_term": "false",
              "keep_first_letter": "true",
              "keep_separate_first_letter": "false",
              "trim_whitespace": "true",
              "type": "pinyin",
              "keep_none_chinese": "true",
              "limit_first_letter_length": "16",
              "keep_full_pinyin": "true"
            }
          }
        },
        "number_of_shards": "5",
        "number_of_replicas": "1",
        "uuid": "31Y9PizQQ2KQn_Fl6bpPNw",
        "version": {
          "created": "2030299"
        }
      }
    },
    "warmers": {}
  }
}

分词器分词效果

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "刘德华",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 4
    }
  ]
}

keep_original

keep_original = true

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "刘德华",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 4
    }
  ]
}

keep_original = false

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    }
  ]
}

keep_original 功能

keep_original=true 将保留原字符串，比如存入索引的数据为 刘德华 那么 刘德华 将也会被保存到索引中。keep_original=false 则不保存原字符串到索引

trim_whitespace

trim_whitespace=true

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["   最爱   刘德华   的帅气帅气的   "]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "zui",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "ai",
      "start_offset": 4,
      "end_offset": 5,
      "type": "word",
      "position": 1
    },
    {
      "token": "liu",
      "start_offset": 8,
      "end_offset": 9,
      "type": "word",
      "position": 2
    },
    {
      "token": "de",
      "start_offset": 9,
      "end_offset": 10,
      "type": "word",
      "position": 3
    },
    {
      "token": "hua",
      "start_offset": 10,
      "end_offset": 11,
      "type": "word",
      "position": 4
    },
    {
      "token": "de",
      "start_offset": 14,
      "end_offset": 15,
      "type": "word",
      "position": 5
    },
    {
      "token": "shuai",
      "start_offset": 15,
      "end_offset": 16,
      "type": "word",
      "position": 6
    },
    {
      "token": "qi",
      "start_offset": 16,
      "end_offset": 17,
      "type": "word",
      "position": 7
    },
    {
      "token": "shuai",
      "start_offset": 17,
      "end_offset": 18,
      "type": "word",
      "position": 8
    },
    {
      "token": "qi",
      "start_offset": 18,
      "end_offset": 19,
      "type": "word",
      "position": 9
    },
    {
      "token": "de",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 10
    },
    {
      "token": "最爱   刘德华   的帅气帅气的",
      "start_offset": 0,
      "end_offset": 23,
      "type": "word",
      "position": 11
    },
    {
      "token": "zaldhdsqsqd",
      "start_offset": 0,
      "end_offset": 11,
      "type": "word",
      "position": 12
    }
  ]
}

trim_whitespace=false

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["   最爱   刘德华   的帅气帅气的   "]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "zui",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "ai",
      "start_offset": 4,
      "end_offset": 5,
      "type": "word",
      "position": 1
    },
    {
      "token": "liu",
      "start_offset": 8,
      "end_offset": 9,
      "type": "word",
      "position": 2
    },
    {
      "token": "de",
      "start_offset": 9,
      "end_offset": 10,
      "type": "word",
      "position": 3
    },
    {
      "token": "hua",
      "start_offset": 10,
      "end_offset": 11,
      "type": "word",
      "position": 4
    },
    {
      "token": "de",
      "start_offset": 14,
      "end_offset": 15,
      "type": "word",
      "position": 5
    },
    {
      "token": "shuai",
      "start_offset": 15,
      "end_offset": 16,
      "type": "word",
      "position": 6
    },
    {
      "token": "qi",
      "start_offset": 16,
      "end_offset": 17,
      "type": "word",
      "position": 7
    },
    {
      "token": "shuai",
      "start_offset": 17,
      "end_offset": 18,
      "type": "word",
      "position": 8
    },
    {
      "token": "qi",
      "start_offset": 18,
      "end_offset": 19,
      "type": "word",
      "position": 9
    },
    {
      "token": "de",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 10
    },
    {
      "token": "   最爱   刘德华   的帅气帅气的   ",
      "start_offset": 0,
      "end_offset": 23,
      "type": "word",
      "position": 11
    },
    {
      "token": "zaldhdsqsqd",
      "start_offset": 0,
      "end_offset": 11,
      "type": "word",
      "position": 12
    }
  ]
}

trim_whitespace 功能

去除字符串首尾空格字符，不去除字符串中间的空格。这个参数只有当 keep_original=true 时才能够看到效果。 例如当字符串为： 最爱刘德华的帅气帅气的 ，trim_whitespace=true 则原字符串将被保存为 最爱刘德华的帅气帅气的，如果 trim_whitespace=false 则原字符串将被保存为 最爱刘德华的帅气帅气的 。如果 keep_original=false，那么原字符串没有被保存，也将看不到效果。

keep_joined_full_pinyin

keep_joined_full_pinyin = false

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl21/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "刘德华",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 4
    }
  ]
}

keep_joined_full_pinyin = true

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华"]
}' "http://localhost:9200/medcl22/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "刘德华",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "liudehua",
      "start_offset": 0,
      "end_offset": 8,
      "type": "word",
      "position": 4
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 5
    }
  ]
}

keep_joined_full_pinyin 功能

keep_joined_full_pinyin=true 将保存字符串拼音全拼，false 则不保存。例如，当 kepp_joined_full_pinyin=true 时，文本 刘德华 的拼音全拼 liudehua 将会被保留；当 keep_joined_full_pinyin=false 则全拼liudehua

remove_duplicated_term

remove_duplicated_term = false

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]
}' "http://localhost:9200/medcl20/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "liu",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 3
    },
    {
      "token": "de",
      "start_offset": 4,
      "end_offset": 5,
      "type": "word",
      "position": 4
    },
    {
      "token": "hua",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 5
    },
    {
      "token": "shuai",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 6
    },
    {
      "token": "shuai",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 7
    },
    {
      "token": "shuai",
      "start_offset": 8,
      "end_offset": 9,
      "type": "word",
      "position": 8
    },
    {
      "token": "shuai",
      "start_offset": 10,
      "end_offset": 11,
      "type": "word",
      "position": 9
    },
    {
      "token": "shuai",
      "start_offset": 11,
      "end_offset": 12,
      "type": "word",
      "position": 10
    },
    {
      "token": "shuai",
      "start_offset": 12,
      "end_offset": 13,
      "type": "word",
      "position": 11
    },
    {
      "token": "刘德华刘德华帅帅帅，帅帅帅",
      "start_offset": 0,
      "end_offset": 13,
      "type": "word",
      "position": 12
    },
    {
      "token": "ldhldhssssss",
      "start_offset": 0,
      "end_offset": 12,
      "type": "word",
      "position": 13
    }
  ]
}

remove_duplicated_term = true

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]
}' "http://localhost:9200/medcl26/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "shuai",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 3
    },
    {
      "token": "刘德华刘德华帅帅帅，帅帅帅",
      "start_offset": 0,
      "end_offset": 13,
      "type": "word",
      "position": 4
    },
    {
      "token": "ldhldhssssss",
      "start_offset": 0,
      "end_offset": 12,
      "type": "word",
      "position": 5
    }
  ]
}

remove_duplicated_term 功能

remove_duplicated_term=true 则会将文本中相同的拼音只保存一份，比如 刘德华刘德华 只会保留一份拼音 liu，de，hua；相对的 remove_duplicated_term=false 则会保留两份 liu，de，hua。注意：remove_duplicated_term 并不会影响文本首字母的文本，刘德华刘德华 生成的首字母拼音始终都为 ldhldh

remove_duplicated_term = true 并且 keep_joined_full_pinyin = true

curl -X POST -d '{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘德华刘德华帅帅帅，帅帅帅"]
}' "http://localhost:9200/medcl27/_analyze"

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "shuai",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 3
    },
    {
      "token": "刘德华刘德华帅帅帅，帅帅帅",
      "start_offset": 0,
      "end_offset": 13,
      "type": "word",
      "position": 4
    },
    {
      "token": "liudehualiudehuashuaishuaishuaishuaishuaishuai",
      "start_offset": 0,
      "end_offset": 46,
      "type": "word",
      "position": 5
    },
    {
      "token": "ldhldhssssss",
      "start_offset": 0,
      "end_offset": 12,
      "type": "word",
      "position": 6
    }
  ]
}

remove_duplicated_term 功能

remove_duplicated_term = true 会过滤相同的拼音，但是不影响全拼，刘德华刘德华 生成的字符串全拼为 liudehualiudehua

keep_none_chinese

keep_none_chinese = true

POST /medcl20/_analyze HTTP/1.1
Host: localhost:9200

{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘*20*德b华DJ"]
}

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "20",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 1
    },
    {
      "token": "de",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 2
    },
    {
      "token": "b",
      "start_offset": 6,
      "end_offset": 7,
      "type": "word",
      "position": 3
    },
    {
      "token": "hua",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 4
    },
    {
      "token": "d",
      "start_offset": 7,
      "end_offset": 9,
      "type": "word",
      "position": 5
    },
    {
      "token": "j",
      "start_offset": 7,
      "end_offset": 9,
      "type": "word",
      "position": 6
    },
    {
      "token": "刘*20*德b华dj",
      "start_offset": 0,
      "end_offset": 10,
      "type": "word",
      "position": 7
    },
    {
      "token": "l20dbhdj",
      "start_offset": 0,
      "end_offset": 8,
      "type": "word",
      "position": 8
    }
  ]
}

keep_none_chinese = false

POST /medcl28/_analyze HTTP/1.1
Host: localhost:9200

{
  "analyzer" : "pinyin_analyzer",
  "text" : ["刘*20*德b华DJ"]
}

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 7,
      "end_offset": 8,
      "type": "word",
      "position": 2
    },
    {
      "token": "刘*20*德b华dj",
      "start_offset": 0,
      "end_offset": 10,
      "type": "word",
      "position": 3
    },
    {
      "token": "l20dbhdj",
      "start_offset": 0,
      "end_offset": 8,
      "type": "word",
      "position": 4
    }
  ]
}

keep_none_chinese 功能

keep_none_chinese = true 则非中文字母以及数字将会被保留，但是要确定所有的特别字符都是无法被保留下来的。例如，文本 刘*20*德b华dj 中的数字 20，字母 b 与 dj 将会被保留，而特殊字符 * 是不会保留的；当 keep_none_chinese=false 则非中文字母以及数字将不会被保留，上述文本中的数字 20，字母 b 与 dj 将不会被保留。注意：参数 keep_none_chinese 是不会影响首字母以及所有字符组成全拼的拼音，上述文本生成的首字母拼音为 l20dbhdj，所有字符组成的全拼为：liu20debhuadj，特别字符始终是被过滤去除的。

欢迎转载，请注明本文链接，谢谢你。
2017.4.12 20:44

posted on 2017-04-12 20:44 xiaoheike 阅读(1481) 评论(1) 编辑收藏举报

刷新页面返回顶部

xiaoheike

elasticsearch 拼音检索能力研究

gitchennan/elasticsearch-analysis-lc-pinyin

对应版本

创建一个类型

插件自带分词器 lc_index

插件自带分词器 lc_search

拼音全拼

单字拼音全拼与中文混合

单字拼音首字母与中文混合

拼音首字母

结论

elasticsearch-analysis-pinyin v1.7.2

first_letter 改变

first_letter=prefix padding_char=" "

first_letter=append padding_char=" "

first_letter=only padding_char=" "

first_letter=none padding_char=" "

padding_char 改变

first_letter=prefix padding_char=""

first_letter=append padding_char=""

first_letter=only padding_char=""

first_letter=none padding_char=""

结论

elasticsearch-analysis-pinyin 2.x 分支

官方文档中的说明

基准配置

keep_original

keep_original = true

keep_original = false

keep_original 功能

trim_whitespace

trim_whitespace=true

trim_whitespace=false

trim_whitespace 功能

keep_joined_full_pinyin

keep_joined_full_pinyin = false

keep_joined_full_pinyin = true

keep_joined_full_pinyin 功能

remove_duplicated_term

remove_duplicated_term = false

remove_duplicated_term = true

remove_duplicated_term 功能

remove_duplicated_term = true 并且 keep_joined_full_pinyin = true

remove_duplicated_term 功能

keep_none_chinese

keep_none_chinese = true

keep_none_chinese = false

keep_none_chinese 功能

公告

导航