Elasticsearch08-前缀搜索、通配符搜索、正则搜索、模糊匹配 - 陕西小楞娃

前缀搜索、通配符搜索、正则搜索

说明:
1.一般很少用：前缀搜索匹配的是倒排索引中的词项(term)，而不是原始文本；即使已经匹配到了词项，也会继续往下扫描其余可能匹配的词项，开销较大。
2.前缀搜索匹配以xx开头的词项，不计算相关度评分；与filter相比性能较差，且没有bitset缓存。使用前缀搜索时，尽量把前缀设置得更长。

数据

POST /prefix_index_ik/_bulk
{ "index": { "_id": "1"} }
{ "text": "城管打电话喊商贩去摆摊摊" }
{ "index": { "_id": "2"} }
{ "text": "笑果文化回应商贩老农去摆摊" }
{ "index": { "_id": "3"} }
{ "text": "老农耗时17年种出椅子树" }
{ "index": { "_id": "4"} }
{ "text": "夫妻结婚30多年AA制,被城管抓" }
{ "index": { "_id": "5"} }
{ "text": "黑人见义勇为阻止抢劫反被铐住" }


POST /prefix_index_standard/_bulk
{ "index": { "_id": "1"} }
{ "text": "my english" }
{ "index": { "_id": "2"} }
{ "text": "my english is good" }
{ "index": { "_id": "3"} }
{ "text": "my chinese is good" }
{ "index": { "_id": "4"} }
{ "text": "my japanese is nice" }
{ "index": { "_id": "5"} }
{ "text": "my disk is full" }

前缀搜索

GET prefix_index_standard/_search
{
  "query": {
    "prefix": {
      "text": {
        "value": "ch"
      }
    }
  }
}

GET prefix_index_standard/_search
{
  "query": {
    "prefix": {
      "text": "ch"
    }
  }
}



#设置 index_prefixes：为长度在 min_chars~max_chars(这里是2~4个字符)之间的前缀单独建立索引，加快前缀搜索速度
PUT prefix_index_ik_test
{
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer": "ik_max_word", 
        "index_prefixes": {
          "min_chars":2,
          "max_chars":4
        }    
      }
    }
  }
}

通配符搜索

说明：
1.通配符运算符是匹配一个或多个字符的占位符。例如，*通配符运算符匹配零个或多个字符，?通配符运算符匹配任意单个字符。


GET prefix_index_standard/_search
{
  "query": {
    "wildcard": {
      "text": {
        "value": "eng?ish" #只能占位一个
      }
    }
  }
}

GET prefix_index_standard/_search
{
  "query": {
    "wildcard": {
      "text": {
        "value": "en*ish", #*匹配零个或多个字符
        "boost": 1.0
      }
    }
  }
}

正则搜索

GET prefix_index_standard/_search
{
  "query": {
    "regexp": {
      "text": {
        "value": "[\\s\\S]*english[\\s\\S]*",
        "flags": "ALL",
        "max_determinized_states": 10000,
        "rewrite": "constant_score"
      }
    }
  }
}


# 没有结果，因为standard分词器把日期2020-05-20拆成了2020、05、20三个词项，而regexp匹配的是单个词项
GET product/_search
{
  "query": {
    "regexp": {
      "desc": {
        "value": ".*2020-05-20.*",
        "flags": "ALL"
      }
    }
  }
}


# ik分词器不会拆开
GET /_analyze
{
  "text": "shouji zhong 2020-05-20 de zhandouji",
  "analyzer": "ik_max_word"
}

练习ik分词器

PUT index_ik
{
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_max_word"
          
      }
    }
  }
}

PUT /index_ik/_doc/1
{
  "testid":"123456",
  "text":"shouji zhong 2020-05-20 de zhandouji"
}

GET index_ik/_search
{
  "query": {
    "regexp": {
      "text": {
        "value": ".*<1-4>.*",
        "flags": "INTERVAL" #INTERVAL启用<>区间操作符，<1-4>匹配1到4之间的任意数字
      }
    }
  }
}

Fuzzy模糊查询

说明: 
1.fuzziness:允许的最大编辑距离(把一个字符串变成另一个字符串所需的最少单字符修改次数),并非越大越好:值越大召回率越高,但结果越不准确
2.混淆字符 (box → fox)    
3.缺少字符 (black → lack)
4.多出字符 (sic → sick)    
5.颠倒次序 (act → cat)

GET /prefix_index_standard/_search 
{
  "query": {
    "fuzzy": {
      "text": {
        "value": "japaenes", 
        "fuzziness": 2  #选填
      }
    }
  }
}


GET /prefix_index_standard/_search 
{
  "query": {
    "fuzzy": {
      "text": {
        "value": "japaenes",
        "fuzziness": "AUTO"
      }
    }
  }
}

短语前缀搜索

GET /prefix_index_standard/_search
{
  "query": {
    "match_phrase_prefix": {
      "text": {
        "query": "my jap ",
        "analyzer": "whitespace",
        "max_expansions": 1,
        "slop": 2,
        "boost": 1
      }
    }
  }
}


#match_phrase_prefix和match_phrase的区别

#没有查询到结果：match_phrase要求每个词项完整匹配，而japane不是索引中的完整词项
GET  /prefix_index_standard/_search
{
  "query": {
    "match_phrase": {
      "text": "my japane" 
    }
  }
}

GET  /prefix_index_standard/_search
{
  "query": {
    "match_phrase_prefix": {
      "text": "my jap" # 先以jap作为前缀在倒排索引中查找匹配的词项，再进行match_phrase查询
    }
  }
}





GET /prefix_index_standard/_search
{
  "query": {
    "match_phrase_prefix": {
      "text": {
        "query": "my nice",
        "max_expansions": 50,
        "slop": 4 #允许短语间的单词间隔
      }
    }
  }
}

N-gram

说明: 下面列出不同min_gram/max_gram取值时，edge_ngram为每个单词生成的前缀词项(每行对应一种长度):
#min_gram =1   "max_gram": 1
#r a l m

#min_gram =1   "max_gram": 2
#r a l m
#re al lo me

#min_gram =2   "max_gram": 3
#re al lo me
#reb alw lov me



PUT index_edge_ngram
{
  "settings": {
    "analysis": {
      "filter": {
        "2_3_edge_ngram": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 3
        }
      },
      "analyzer": {
        "my_edge_ngram": {
          "type":"custom",
          "tokenizer": "standard",
          "filter": [ "2_3_edge_ngram" ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer":"my_edge_ngram",
        "search_analyzer": "standard"
      }
    }
  }
}



GET /index_edge_ngram/_mapping


POST /index_edge_ngram/_bulk
{ "index": { "_id": "1"} }
{ "text": "my english" }
{ "index": { "_id": "2"} }
{ "text": "my english is good" }
{ "index": { "_id": "3"} }
{ "text": "my chinese is good" }
{ "index": { "_id": "4"} }
{ "text": "my japanese is nice" }
{ "index": { "_id": "5"} }
{ "text": "my disk is full" }




GET /index_edge_ngram/_search
{
  "query": {
    "match_phrase": {
      "text": "my eng is goo"
    }
  }
}

posted on 2021-09-09 22:16 陕西小楞娃阅读(581) 评论(0) 编辑收藏举报

刷新页面返回顶部