Elasticsearch由浅入深（九）搜索引擎：query DSL、filter与query、query搜索实战

search api的基本语法

语法概要：

GET /_search
{}

GET /index1,index2/type1,type2/_search
{}

GET /_search
{
  "from": 0,
  "size": 10
}

http协议中get是否可以带上request body？

HTTP协议规范并没有明确禁止GET请求带上request body，只是没有为它定义语义，所以一般不建议这么用；但是因为GET更加适合描述查询数据的操作，因此Elasticsearch还是这么用了。

很多浏览器，或者是服务器，也都支持GET+request body模式

如果遇到不支持的场景，也可以用POST /_search

GET /_search?from=0&size=10

POST /_search
{
  "from":0,
  "size":10
}

query DSL

一个例子让你明白什么是query DSL

GET /_search
{
    "query": {
        "match_all": {}
    }
}

Query DSL的基本语法

GET /{index}/{type}/_search
{
    "各种条件"
}

示例：

GET /test_index/test_type/_search 
{
  "query": {
    "match": {
      "test_field": "test"
    }
  }
}


{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 0.843298,
    "hits": [
      {
        "_index": "test_index",
        "_type": "test_type",
        "_id": "6",
        "_score": 0.843298,
        "_source": {
          "test_field": "test test"
        }
      },
      {
        "_index": "test_index",
        "_type": "test_type",
        "_id": "8",
        "_score": 0.43445712,
        "_source": {
          "test_field": "test client 2"
        }
      },
      {
        "_index": "test_index",
        "_type": "test_type",
        "_id": "7",
        "_score": 0.25316024,
        "_source": {
          "test_field": "test client 1"
        }
      }
    ]
  }
}

组合多个搜索条件

搜索需求：title必须包含elasticsearch，content可以包含elasticsearch也可以不包含，author_id必须不为111

构造数据：

PUT /website/article/1
{
  "title":"my elasticsearch article",
  "content":"es is very bad",
  "author_id":110
}

PUT /website/article/2
{
  "title":"my hadoop article",
  "content":"hadoop is very bad",
  "author_id":111
}

PUT /website/article/3
{
  "title":"my hadoop article",
  "content":"hadoop is very good",
  "author_id":111
}

组合查询：

GET /website/article/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "title": "elasticsearch"
          }
        }
      ],
      "should": [
        {
          "match": {
            "content": "elasticsearch"
          }
        }
      ],
      "must_not": [
        {
          "match": {
            "author_id": 111
          }
        }
      ]
    }
  }
}

查询结果：

{
  "took": 7,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.25316024,
    "hits": [
      {
        "_index": "website",
        "_type": "article",
        "_id": "1",
        "_score": 0.25316024,
        "_source": {
          "title": "my elasticsearch article",
          "content": "es is very bad",
          "author_id": 110
        }
      }
    ]
  }
}

View Code

filter与query

初始化数据：

PUT /company/employee/2
{
  "address": {
    "country": "china",
    "province": "jiangsu",
    "city": "nanjing"
  },
  "name": "tom",
  "age": 30,
  "join_date": "2016-01-01"
}

PUT /company/employee/3
{
  "address": {
    "country": "china",
    "province": "shanxi",
    "city": "xian"
  },
  "name": "marry",
  "age": 35,
  "join_date": "2015-01-01"
}

搜索请求：年龄必须大于等于30，同时join_date必须是2016-01-01

GET /company/employee/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "join_date": "2016-01-01"
          }
        }
      ],
      "filter": {
        "range": {
          "age": {
            "gte": 30
          }
        }
      }
    }
  }
}

filter与query对比大揭秘

filter，仅仅只是按照搜索条件过滤出需要的数据而已，不计算任何相关度分数，对相关度没有任何影响
query，会去计算每个document相对于搜索条件的相关度，并按照相关度进行排序

一般来说，如果你是在进行搜索，需要将最匹配搜索条件的数据先返回，那么用query；如果你只是要根据一些条件筛选出一部分数据，不关注其排序，那么用filter

如果你希望越符合某些搜索条件的document越排在前面返回，那么这些搜索条件要放在query中；如果你不希望一些搜索条件来影响你的document排序，那么就放在filter中即可

filter与query性能

filter，不需要计算相关度分数，不需要按照相关度分数进行排序，同时ES还内置了自动cache机制，会缓存最常使用的filter的结果
query，相反，要计算相关度分数，按照分数进行排序，而且无法cache结果

Elasticsearch 实战各种query搜索

各种query搜索语法

match_all

GET /_search
{
    "query": {
        "match_all": {}
    }
}

match

GET /{index}/_search
{
  "query": {
    "match": {
      "FIELD": "TEXT"
    }
  }
}

multi match

GET /{index}/_search
{
  "query": {
    "multi_match": {
      "query": "",
      "fields": []
    }
  }
}

示例

GET /test_index/test_type/_search
{
  "query": {
    "multi_match": {
      "query": "test",
      "fields": ["test_field", "test_field1"]
    }
  }
}

View Code

range query

GET /{index}/_search
{
  "query": {
    "range": {
      "FIELD": {
        "gte": 10,
        "lte": 20
      }
    }
  }
}

示例

GET /company/employee/_search 
{
  "query": {
    "range": {
      "age": {
        "gte": 30
      }
    }
  }
}

View Code

term query(与match相比不分词)

GET /{index}/_search
{
  "query": {
    "term": {
      "FIELD": {
        "value": "VALUE"
      }
    }
  }
}

示例

GET /test_index/test_type/_search 
{
  "query": {
    "term": {
      "test_field": "test hello"
    }
  }
}

View Code

terms query

GET /{index}/_search
{
  "query": {
    "terms": {
      "FIELD": [
        "VALUE1",
        "VALUE2"
      ]
    }
  }
}

示例

GET /_search
{
    "query": { "terms": { "tag": [ "search", "full_text", "nosql" ] }}
}

View Code

exists query

GET /{index}/_search
{
  "query": {
    "exists": {
       "field": ""
    }
  }
}

多搜索条件组合查询

bool: must, must_not, should, filter

每个子查询都会计算一个document针对它的相关度分数，然后bool综合所有分数，合并为一个分数，当然filter是不会计算分数的。如果只需要过滤、完全不需要相关度分数，也可以像下面这样用constant_score包裹单个filter：

GET /company/employee/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "age": {
            "gte": 30
          }
        }
      }
    }
  }
}

定位不合法的搜索

一般用在那种特别复杂庞大的搜索下，比如你一下子写了上百行的搜索，这个时候可以先用validate api去验证一下，搜索是否合法

GET /test_index/test_type/_validate/query?explain
{
  "query": {
    "math": {
      "test_field": "test"
    }
  }
}

{
  "valid": false,
  "error": "org.elasticsearch.common.ParsingException: no [query] registered for [math]"
}

正常数据

GET /test_index/test_type/_validate/query?explain
{
  "query":{
    "match":{
      "test_field":"test"
    }
  }
}


{
  "valid": true,
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "explanations": [
    {
      "index": "test_index",
      "valid": true,
      "explanation": "+test_field:test #(#_type:test_type)"
    }
  ]
}

定制搜索结果的排序规则

默认情况下，返回的document是按照_score降序排列的。如果我们想自己定义排序规则怎么办，此时只需要使用sort即可

语法：

# 主要语法
"sort": [
    {
      "FIELD": {
        "order": "desc"
      }
    }
  ]
# 整体位置
GET /{index}/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "exists": {
          "field": ""
        }
      },
      "boost": 1.2
    }
  },
  "sort": [
    {
      "FIELD": {
        "order": "desc"
      }
    }
  ]
}

示例：

GET /company/employee/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "age": {
            "gte": 30
          }
        }
      }
    }
  },
  "sort": [
    {
      "join_date": {
        "order": "asc"
      }
    }
  ]
}

将一个field索引两次来解决字符串排序问题

如果某个字段的类型是text，在创建索引的时候，针对每个document，这个text字段的内容都会被分词。由于ES不允许修改已经存在的field的类型，该字段就会一直被分词；如果之后想对该字段排序，排序依据的就是分词后的单个term而不是完整的字符串，结果往往不准确。解决办法是把这个field索引两次：一次分词，用于搜索；一次不分词，用于排序。具体看下面展示的示例。

# 删除原来的索引
DELETE /website

# 手动建立索引 
PUT /website
{
  "mappings": {
    "article": {
      "properties": {
        "title":{
          "type": "text",
          "fields": {
            "raw":{
              "type": "keyword"
            }
          },
          "fielddata": true
        },
        "content":{
          "type": "text"
        },
        "post_date":{
          "type": "date"
        },
        "author_id":{
          "type": "long"
        }
      }
    }
  }
}

插入模拟数据

PUT /website/article/1
{
  "title": "second article",
  "content": "this is my second article",
  "post_date": "2017-01-01",
  "author_id": 110
}

PUT /website/article/2
{
  "title": "first article",
  "content": "this is my first article",
  "post_date": "2017-02-01",
  "author_id": 110
}

PUT /website/article/3
{
  "title": "third article",
  "content": "this is my third article",
  "post_date": "2017-03-01",
  "author_id": 110
}

按照不分词排序

GET /website/article/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "title.raw": {
        "order": "desc"
      }
    }
  ]
}