elasticsearch聚合查询实践
概念
用于聚合的字段必须是 exact value
,即doc_value=true
。分词字段不可进行聚合,对于 text
字段如需使用聚合,需开启 fielddata
,不推荐因容易造成 OOM。
聚合分类
- Bucket aggregations(桶聚合)
- Metric aggregations(指标聚合)
- Pipeline aggregations(管道聚合)
聚合语法
request
GET /my-index/_search
{
"aggs": {
"my-agg-name": {
"terms": {
"field": "my-field"
}
}
}
}
response
{
"took": 78,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": 1.0,
"hits": [...]
},
"aggregations": {
"my-agg-name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
聚合作用范围及排序
query
和filter
,是先选定数据范围,再聚合桶;post_filter
对聚合桶没影响,桶全部返回,只对查询结果进行过滤返回,功能类似 mysql 中的 having;global
的作用是覆盖掉query
的查询作用。
聚合原理及 terms 精准度
Terms Aggregation 的返回中有两个特殊的数值
doc_count_error_upper_bound
:被遗漏的 term 分桶,包含的文档,有可能的最大值sum_other_doc_count
:除了返回结果 bucket 的 terms 以外,其他的 terms 的文档总数(总数-返回的总数)
聚合实验
实验数据引用自《Elasticsearch 核心技术与实战》- 阮一鸣(eBay Pronto 平台技术负责人)
创建索引
PUT /employees/
{
"mappings" : {
"properties" : {
"age" : {
"type" : "integer"
},
"gender" : {
"type" : "keyword"
},
"job" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 50
}
}
},
"name" : {
"type" : "keyword"
},
"salary" : {
"type" : "integer"
}
}
}
}
批量写入数据
PUT /employees/_bulk
{ "index" : { "_id" : "1" } }
{ "name" : "Emma","age":32,"job":"Product Manager","gender":"female","salary":35000 }
{ "index" : { "_id" : "2" } }
{ "name" : "Underwood","age":41,"job":"Dev Manager","gender":"male","salary": 50000}
{ "index" : { "_id" : "3" } }
{ "name" : "Tran","age":25,"job":"Web Designer","gender":"male","salary":18000 }
{ "index" : { "_id" : "4" } }
{ "name" : "Rivera","age":26,"job":"Web Designer","gender":"female","salary": 22000}
{ "index" : { "_id" : "5" } }
{ "name" : "Rose","age":25,"job":"QA","gender":"female","salary":18000 }
{ "index" : { "_id" : "6" } }
{ "name" : "Lucy","age":31,"job":"QA","gender":"female","salary": 25000}
{ "index" : { "_id" : "7" } }
{ "name" : "Byrd","age":27,"job":"QA","gender":"male","salary":20000 }
{ "index" : { "_id" : "8" } }
{ "name" : "Foster","age":27,"job":"Java Programmer","gender":"male","salary": 20000}
{ "index" : { "_id" : "9" } }
{ "name" : "Gregory","age":32,"job":"Java Programmer","gender":"male","salary":22000 }
{ "index" : { "_id" : "10" } }
{ "name" : "Bryant","age":20,"job":"Java Programmer","gender":"male","salary": 9000}
{ "index" : { "_id" : "11" } }
{ "name" : "Jenny","age":36,"job":"Java Programmer","gender":"female","salary":38000 }
{ "index" : { "_id" : "12" } }
{ "name" : "Mcdonald","age":31,"job":"Java Programmer","gender":"male","salary": 32000}
{ "index" : { "_id" : "13" } }
{ "name" : "Jonthna","age":30,"job":"Java Programmer","gender":"female","salary":30000 }
{ "index" : { "_id" : "14" } }
{ "name" : "Marshall","age":32,"job":"Javascript Programmer","gender":"male","salary": 25000}
{ "index" : { "_id" : "15" } }
{ "name" : "King","age":33,"job":"Java Programmer","gender":"male","salary":28000 }
{ "index" : { "_id" : "16" } }
{ "name" : "Mccarthy","age":21,"job":"Javascript Programmer","gender":"male","salary": 16000}
{ "index" : { "_id" : "17" } }
{ "name" : "Goodwin","age":25,"job":"Javascript Programmer","gender":"male","salary": 16000}
{ "index" : { "_id" : "18" } }
{ "name" : "Catherine","age":29,"job":"Javascript Programmer","gender":"female","salary": 20000}
{ "index" : { "_id" : "19" } }
{ "name" : "Boone","age":30,"job":"DBA","gender":"male","salary": 30000}
{ "index" : { "_id" : "20" } }
{ "name" : "Kathy","age":29,"job":"DBA","gender":"female","salary": 20000}
桶聚合
对 keword 进行聚合
GET employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field":"job.keyword"
}
}
}
}
指标聚合
多个 Metric 聚合,找到最低最高和平均 salary
GET employees/_search
{
"size": 0,
"aggs": {
"max_salary": {
"max": {
"field": "salary"
}
},
"min_salary": {
"min": {
"field": "salary"
}
},
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
}
多次嵌套。根据工作类型分桶,然后按照性别分桶,计算 salary 的统计信息
GET employees/_search
{
"size": 0,
"aggs": {
"job_gender_stats": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"gender_stats": {
"terms": {
"field": "gender"
},
"aggs": {
"salary_stats": {
"stats": {
"field": "salary"
}
}
}
}
}
}
}
}
response
{
"took" : 6,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 20,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"job_gender_stats" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Java Programmer",
"doc_count" : 7,
"gender_stats" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "male",
"doc_count" : 5,
"salary_stats" : {
"count" : 5,
"min" : 9000.0,
"max" : 32000.0,
"avg" : 22200.0,
"sum" : 111000.0
}
},
{
"key" : "female",
"doc_count" : 2,
"salary_stats" : {
"count" : 2,
"min" : 30000.0,
"max" : 38000.0,
"avg" : 34000.0,
"sum" : 68000.0
}
}
]
}
},
......
]
}
}
}
Pipeline 聚合
平均 salary 的统计分析
GET employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword",
"size": 10
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
},
"stats_salary_by_job":{
"stats_bucket": {
"buckets_path": "jobs>avg_salary"
}
}
}
}
实践一:多商户数据权限聚合分页
collapse
+ cardinality
实现分页去重查询
GET my_order/_search
{
"from": 0,
"size": 6,
"track_total_hits": true,
"query": {
"bool": {
"must": [
{
"terms": {
"tenant_id": [
1,
2,
3,
4
]
}
}
]
}
},
"aggs": {
"cidAgg": {
"cardinality": {
"field": "cid"
}
}
},
"collapse": {
"field": "cid"
}
}
注:不支持 search_after
,导出推荐 scroll
实践二:多维度嵌套聚合
date_histogram
日期直方图 + terms
分桶聚合过去一周每天产生的工单量,每天各品类工单量,每天各品类排名前 N 的爆品等等。
GET my_order/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"created_at": {
"gte": "2023-11-10",
"lte": "2023-11-16"
}
}
}
]
}
},
"aggs": {
"ranges": {
"date_histogram": {
"field": "created_at",
"format": "yyyy-MM-dd",
"calendar_interval": "day"
},
"aggs": {
"order_type_agg": {
"terms": {
"field": "order_type"
}
}
}
}
}
}
实践三:删除 ES 索引重复数据
核酸检测数据量大,数据存储选型如使用 elasticsearch
、click house
等列数据库,数据重复是绕不开的话题,应用可通过计划任务等方式检测到重复数据并及时处理。
单字段查重
GET my_order/_search
{
"size": 0,
"query": {
"term": {
"tenant_id": 1
}
},
"aggs": {
"duplicateCount": {
"terms": {
"field": "cid",
"size": 1000,
"min_doc_count": 2
}
}
}
}
多字段查重
GET my_order/_search
{
"size": 0,
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"lang": "painless",
"source": "doc['tenant_id'].value + doc['cid'].value"
},
"size": 100,
"min_doc_count": 2
}
}
}
}
数据查重并在 duplicateDocuments
数组展示细节
GET my_order/_search
{
"size": 0,
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"lang": "painless",
"source": "doc['tenant_id'].value + doc['cid'].value"
},
"size": 100,
"min_doc_count": 2
},
"aggs": {
"duplicateDocuments": {
"top_hits": {}
}
}
}
}
}
查询到的重复数据记入日志,核实后使用_delete_by_query
删除
POST my_order/_delete_by_query?conflicts=proceed&max_docs=1
{
"query": {
"term": {
"cid": 2
}
}
}
max_docs
为 response 当前 key 中bucket.doc_count
的数量-1
附php
版本 demo 供参考
public function clearDuplicate()
{
$index = 'my_order';
$client = ClientBuilder::create()->build();
$params = [
'index' => $index,
'size' => 0,
'body' => [
'query' => [
...
],
'aggs' => [
'duplicateCount' => [
'terms' => [
'field' => 'cid',
'size' => 1000,
'min_doc_count' => 2
]
]
],
],
];
$result = $client->search($params);
$bucket = ArrayHelper::getValue($result, 'aggregations.duplicateCount.buckets');
if (!is_array($bucket) || empty($bucket)) {
return;
}
foreach ($bucket as $item) {
$maxDocs = ArrayHelper::getValue($item, 'doc_count', 0) - 1;
$key = ArrayHelper::getValue($item, 'key');
if ($maxDocs < 1 || empty($key)) {
continue;
}
$client->deleteByQuery([
'index' => $index,
'conflicts' => 'proceed',
'max_docs' => $maxDocs,
'body' => [
'query' => [
'bool' => [
...
],
],
],
]);
}
}
附:实验环境
linux 操作系统
$ uname -a
Linux LAPTOP-QK4HAU1D 5.15.90.1-microsoft-standard-WSL2 #1 SMP Fri Jan 27 02:56:13 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
$ cat /etc/issue
Ubuntu 22.04.2 LTS \n \l
elasticsearch 版本
GET /
{
"name" : "elasticsearch",
"cluster_name" : "docker-cluster",
"cluster_uuid" : "6xwN3rfbQ2KGgQdt8IUKqg",
"version" : {
"number" : "7.16.2",
"build_flavor" : "default",
"build_type" : "docker",
"build_hash" : "2b937c44140b6559905130a8650c64dbd0879cfb",
"build_date" : "2021-12-18T19:42:46.604893745Z",
"build_snapshot" : false,
"lucene_version" : "8.10.1",
"minimum_wire_compatibility_version" : "6.8.0",
"minimum_index_compatibility_version" : "6.0.0-beta1"
},
"tagline" : "You Know, for Search"
}
参考:
[1] https://gitee.com/geektime-geekbang/geektime-ELK
[2] https://www.elastic.co/guide/en/elasticsearch/reference/7.17/search-aggregations.html
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)
· AI 智能体引爆开源社区「GitHub 热点速览」
· 写一个简单的SQL生成工具