elasticsearch-聚合(七)
参考:
https://blog.csdn.net/zyc88888/article/details/83016513
实际还是建议看官网https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-pipeline-cumulative-cardinality-aggregation.html
聚合几种类型
1.Bucketing桶分聚合:
根据filed或者脚本将相同的数据放到一组
2.Metric:指标聚合:
对文档的指定字段进行计算 max min sum等
3.Pipeline:管道聚合
对其他聚合的结果进行聚合
聚合的语法
"aggregations" : { //定义聚合对象,也可用 "aggs" "<aggregation_name>" : { //聚合的名称,用户自定义 "<aggregation_type>" : { //聚合类型,比如 "histogram" <aggregation_body> //每个聚合类型都有其自己的结构定义 } [,"meta" : { [<meta_data_body>] } ]? [,"aggregations" : { [<sub_aggregation>]+ } ]? //可以定义多个 sub-aggregation } [,"<aggregation_name_2>" : { ... } ]* //定义额外的多个平级 aggregation,只有 Bucketing 类型才有意义 }
Bucket聚合
group By
"aggs": {
"depotCode": {
"terms": {
"size":600,//默认会聚合10个 这里设置预期最大值
"field": "depotCode"
}
}
}
Histogram Aggregation(multi-bucket)
直方图聚合——基于文档中的某个【数值类型】字段,通过计算来动态的分桶。
公式
rem = value % interval
if (rem < 0) {
rem += interval
}
bucket_key = value - rem
配置参数
field:字段,必须为数值类型
interval:分桶间距
min_doc_count:最少文档数桶过滤,只有不少于这么多文档的桶才会返回
extended_bounds:范围扩展
order:对桶排序,如果 histogram 聚合有一个权值聚合类型的"直接"子聚合,那么排序可以使用子聚合中的结果
offset:桶边界位移,默认从0开始
keyed:hash结构返回,默认以数组形式返回每一个桶
missing:配置缺省默认值
{
"query": {
"bool": {
"filter": {
"term": {
"category": 337063315819859968
}
}
}
},
"size": 0,
"aggs": {
"productId": {
"histogram": {
"field": "price1",
"interval":50,
"min_doc_count" : 1,
"extended_bounds" : {
"min" : 0,
"max" : 500
},
"order" : { "_count" : "desc" },
"keyed":true
}
}
}
}
Date Histogram Aggregation
统计某个时间修改数据
日期直方图聚合——基于日期类型,以【日期间隔】来桶分聚合。
可用的时间间隔类型为:year、quarter、month、week、day、hour、minute、second,其中,除了year、quarter 和 month,其余可用小数形式。
配置参数
- field:
- interval:
- format:定义日期的格式,配置后会返回一个 key_as_string 的字符串类型日期(默认只有key)
- time_zone:定义时区,用作时间值的调整
- offset:
- missing:
{ "size":0, "query": { "bool": { "filter": { "term": { "category": 337063315819859968 } } } }, "aggs" : { "articles_over_time" : { "date_histogram" : { "field" : "lastUpdateTime", "interval" : "month", "format" : "yyyy-MM-dd", "time_zone": "+08:00" } } } }
Metric聚合
avg(avg)
配置参数
- field:用于计算的字段
- script:由脚本生成用来计算的 value
- missing:文档缺省字段时的默认值
求平均值
#sql select avg(price1) from product where category=337063315819859968 { "size":0, "query":{ "term":{ "category":"337063315819859968"//查询指定分类产品 } }, "aggs": { "price_avg": { "avg": { "field": "price1"//计算平均值 } } } }
输出:

{ "took": 70, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 650241, "max_score": 0.0, "hits": [] }, "aggregations": { "price_avg": { "value": 48029.56151334659 } } }
MAX
配置参数
- field:用于计算的字段
- script:由脚本生成用来计算的 value
- missing:文档缺省字段时的默认值
#select max(price1) max_price from product where category=337063315819859968 { "size":0, "query":{ "term":{ "category":"337063315819859968"//查询指定分类产品 } }, "aggs": { "max_price": {//自定义命名 "max": { "field": "price1" //求最大值的字段 } } } }
Min
配置参数
- field:用于计算的字段
- script:由脚本生成用来计算的 value
- missing:文档缺省字段时的默认值
#select min(price1) min_price from product where category=337063315819859968 { "size":0, "query":{ "term":{ "category":"337063315819859968"//查询指定分类产品 } }, "aggs": { "min_price": {//自定义命名 "min": { "field": "price1" //求最小值的字段 } } } }
Sum
#select sum(stock) sum_stock from product where category=337063315819859968
{ "size":0, "query":{ "term":{ "category":"337063315819859968" } }, "aggs": { "sum_stock": { "sum": { "field": "stock" } } } }
count
#select count(brand) from product
{ "size":0, "aggs" : { "grades_count" : { "value_count" : { "field" : "brand" } } //计算 grade 字段共有多少个值,和 cardinality 聚合不同的 } }
Cardinality(DISTINCT count)
注:数据量大于10000会有误差 参考:https://zhuanlan.zhihu.com/p/627168641
参考:https://www.cnblogs.com/LQBlog/p/10539245.html#autoid-6-4-0
配置参数
- field:用于计算的字段
- script:由脚本生成用来计算的 value
- precision_threshold:
- missing:文档缺省字段时的默认值
#select count(DISTINCT product_type) from pro_product p where p.product_code in('707440097123094530','ZY599811920987955200') { "size":0, "query":{ "terms":{ "productCode":["707440097123094530","ZY599811920987955200"] //in查询 } }, "aggs": { "name_cardinality": { "cardinality": { "field": "productType" //productType去重统计 } } } }
输出:

{ "took": 4, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 2, "max_score": 0.0, "hits": [] }, "aggregations": { "name_cardinality": { "value": 1 } } }
stats(多指数统计)
配置参数
- field:用于计算的字段
- script:由脚本生成用来计算的 value
- missing:文档缺省字段时的默认值
#select count(1),max(price1),min(price1),avg(price1),sum(price1) from pro_product p where p.category_id =337063315819859968 { "size":0, "query":{ "term":{ "category":337063315819859968 #查询指定分类 } }, "aggs": { "price1_stats": { "stats": { "field": "price1" #统计字段 } } } }
输出结果

{ "took": 1299, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 650260, "max_score": 0.0, "hits": [] }, "aggregations": { "price1_stats": { "count": 650260, //总数 "min": 0.0,//最小值 "max": 1.894667E7,//最大值 "avg": 48029.07665856734,//平均值 "sum": 3.1231387388E10//求和 } } }
stats扩展
{ "size":0, "query":{ "term":{ "category":337063315819859968 } }, "aggs": { "price1_stats": { "extended_stats": { "field": "price1" } } } }
输出

{ "took": 1015, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 650260, "max_score": 0.0, "hits": [] }, "aggregations": { "price1_stats": { "count": 650260,//条数 "min": 0.0,//最小值 "max": 1.894667E7,//最大值 "avg": 48029.07665856734,//平局值 "sum": 3.1231387388E10,//总和 "sum_of_squares": 2.4913556601468932E16,//平方和 "variance": 3.600643112363865E10,//方差 "std_deviation": 189753.6063521288,标准差 "std_deviation_bounds": {//平均值/加减两个标准查的值 "upper": 427536.28936282493, "lower": -331478.1360456902 } } } }
geo_bounds(最小矩形)
地理边界聚合——基于文档的某个字段(geo-point类型字段),计算出该字段所有地理坐标点的边界(左上角/右下角坐标点)。
这个聚合操作计算能够覆盖所有查询结果中geo_point的最小区域,返回的是覆盖所有位置的最小矩形
{ "size":0, "query":{ "term":{ "category":337063315819859968 //指定分类产品 } }, "aggs": { "getpoint_geo_bounds": { "geo_bounds": { "field": "geoPoint" //销售门店经纬度 } } } }
输出

{ "took": 748, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 650260, "max_score": 0.0, "hits": [] }, "aggregations": { "getpoint_geo_bounds": { "bounds": { "top_left": {#左上角精度维度 "lat": 52.945659, "lon": 0.0 }, "bottom_right": {#右下角精度维度 "lat": 0.0, "lon": 131.162756 } } } } }
Centroid(重心经纬度)
暂时不知道有啥用(重心点经纬度?)
{ "size":0, "query":{ "term":{ "category":337063315819859968 } }, "aggs": { "geoPoint_centroid": { "geo_centroid": { "field": "geoPoint" } } } }
geo_distance(指定坐标附近距离聚合)
Percentiles Aggregation
百分位聚合——基于聚合文档中某个数值类型的值,求这些值中指定百分比位置上的值(百分位数)。
{ "size": 0, "aggs": { "latency_percentiles": { "percentiles": { "field": "latency", "percents": [ ##统计50% 95% 99的%的百分位 50, 95, 99 ] } }, "latency_avg": { "avg": { "field": "latency" #请求耗时字段 } } } }
输出
{ "took": 3, "timed_out": false, "_shards": { "total": 5, "successful": 5, "skipped": 0, "failed": 0 }, "hits": { "total": 12, "max_score": 0, "hits": [] }, "aggregations": { "latency_avg": { "value": 201.91666666666666 }, "latency_percentiles": { "values": { "50.0": 108.5, #50%的请求访问在 108毫秒 "95.0": 627.4999999999997, #95%的请求耗时627毫秒 "99.0": 654#百分之99的请求在654毫秒 } } } }
Script Metric Aggregation
- init_script:用于计算的字段
- map_script:由脚本生成用来计算的 value
- combine_script:文档缺省字段时的默认值
- reduce_script:
{ "query": { "match_all": {} }, "aggs": { "profit": { "scripted_metric": { "init_script": "_agg[‘transactions‘] = []",#类似初始化一个数组变量保存结果集 后续做统计 "map_script": "if (doc[‘type‘].value == \"sale\") { _agg.transactions.add(doc[‘amount‘].value) }",//#如果type=sale 则计算amout "combine_script": "profit = 0; for (t in _agg.transactions) { profit += t }; return profit", #未测试不太理解 "reduce_script": "profit = 0; for (a in _aggs) { profit += a }; return profit"#未测试不太理解 } } } }
运行机制:
1. 在每个分组文档遍历(或匹配之前),执行init_script脚本,初始化统计变量。
2. 在遍历每个匹配的分组文档时,执行map_script脚本。
3. 在每个shards(分片)遍历完所有的分组文档时,执行combine_script脚本,得到每个分片的汇总统计结果。
4. 每个shards分片的结果会收集起来汇总到{"params": { "_aggs":[] } },然后执行reduce_script脚本,得到最终的统计结果。
测试环境不支持脚本 未测试
Top hits Aggregation
最高匹配权值聚合——跟踪聚合中相关性最高的文档。
该聚合一般用做 sub-aggregation,以此来聚合每个桶中的最高匹配的文档。
先说我们的一个需求 :同一个商品如果是自营 有多少个门店 就在es里面有多少个文档,表示商品id会重复 如果是商家则是一对一关系
需求是 搜索定位最近的商品信息 针对自营门店的则返回一条
以下是es数据结构
因为测试环境不支持 script 所以以下未做测试 后面再测
{ "query": { "bool": { "filter": { "term": { "category": 337063315819859968 } } } }, "size": 0, "aggs": { "top-tags": { "terms": { "_script": "doc['productId']+doc['depotType']", #这里改为script根据 商品id+门店类型进行桶分 "size": 5 }, "aggs": { "top_tag_hits": { "top_hits": { "sort": [{ "stock": { #这里可以改为经纬度排序 "order": "desc" } }], "_source": { "include": [ "id" #只取id ] }, "size": 1 #取最近一条 每个桶里面的重复数据 支持form 和size 分页 } } } } } }
因为es不支持针对聚合结果进行分页,可以只取id然后在内存进行偏移 分页 而且一般前台都是向下滑动分页 商品信息一般不会涉及到深分页
然后根据id 可以走redis 因为我们数据都有全量到redis 从redis获取具体信息 ,或者再查es
实际例子
聚合分页+排序
对应sql
select * from(select acceptEmpCode,count(*),sum(awardPunishmentAmount) from table
group by acceptEmpCode
order by sum(awardPunishmentAmount) desc,count(*) desc,acceptEmpCode asc ) tab limit 1,1
{ "size": 0, "query": { "bool": { "must": [], "adjust_pure_negative": true, "boost": 1.0 } }, "track_total_hits": 2147483647, "aggregations": { "acceptEmpCodeTerms": { "terms": { "field": "acceptEmpCode", "min_doc_count": 1, "shard_min_doc_count": 0, "show_term_doc_count_error": false, "order": [ { "awardPunishmentAmountSum": "desc" }, { "_count": "desc" }, { "_key": "asc" } ] }, "aggregations": { "awardPunishmentAmountSum": { "sum": { "field": "awardPunishmentAmount" } }, "myBucketSort": { "bucket_sort": { "from": 1, "size": 1, "gap_policy": "SKIP" } } } } } }
获取总页数
{ "size": 0, // 只关心聚合结果,不需要返回具体文档 "query": { "bool": { "must": [ // 查询条件 ] } }, "aggregations": { "unique_acceptEmpCode": { "cardinality": { "field": "acceptEmpCode" } } } }
组织树下钻统计
纠正一下设计思路
根据树型组织树筛选
存了个org_path=
数据行 1:A1-A2-A3-A4
数据行 2:B2-B3-B4-B5
数据行 3:A1-A6-A7-A8
统计比如搜索A1 则需要搜索出A1下统计行数
A2=1
A6=1
通过sql实现
-- 取第几级就把 SUBSTRING_INDEX 的第二个参数改为几,这里取第二级
select SUBSTRING_INDEX(SUBSTRING_INDEX(org_path, '-', 2), '-', -1) AS second_value, count(1) from table where LOCATE("A1", org_path) > 0 group by SUBSTRING_INDEX(SUBSTRING_INDEX(org_path, '-', 2), '-', -1)
我实际可用的sql 标红部分表示对应级数
SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(f.accept_org_path, '-', 3), '-', -1) AS org_code,count(1) FROM award_punishment_form f where LOCATE("10010000",f.accept_org_path)>0 group by SUBSTRING_INDEX(SUBSTRING_INDEX(f.accept_org_path, '-', 3), '-', -1)
如果采用es的话如何实现?可能需要通过脚本方式抓取,就算能抓取 需求还有定义 当组织为某个类型时就只能下转到某个类型的组织。可能不是连续的了。
但是需求有定义只能下钻3级,那我们就可以通过es 多设计3个字段 这样就统计简单了。假设B2为特殊类型,针对B2类型的时候就冗余对应筛选组织进去
org_code1,org_code2,org_code3
A1 A2 A3
这里只是记录换个思维方式
es查询
{ "size": 0, "query": { "bool": { "must": [ { "range": { "createdTime": { "from": "2024-11-10 20:14:41", "to": null, "include_lower": true, "include_upper": true, "boost": 1.0 } } } ] } }, "track_total_hits": 2147483647, "aggregations": { "billingOrgCodeTerms": { "terms": { "field": "billingOrgCodeL1", //动态指定统计组织维度等级 "size": 10, "min_doc_count": 1, "shard_min_doc_count": 0, "show_term_doc_count_error": false, "order": [ { "_count": "desc" }, { "_key": "asc" } ] } } } }
响应

{ "took": 0, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, "hits": { "total": { "value": 6, "relation": "eq" }, "max_score": null, "hits": [] }, "aggregations": { "billingOrgCodeTerms": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0, "buckets": [ { "key": "10030000", "doc_count": 3 }, { "key": "10040000", "doc_count": 2 }, { "key": "10010000", "doc_count": 1 } ] } } }
group by配合case when 做多维度统计
统计各个组织审核状态的数量,并计算通过率
SELECT * FROM (select billingOrgCodeL1, sum(case when auditStatus=10 then 1 else 0 end) as auditStatus_10, sum(case when auditStatus=20 then 1 else 0 end) as auditStatus_20, sum(case when auditStatus=30 then 1 else 0 end) as auditStatus_30, count(*) orgCodeCount, sum(case when auditStatus=30 then 1 else 0 end)/count(*) AS auditStatusPassPercentage from table group by billingOrgCodeL1) tab order by auditStatusPassPercentage desc
{ "size": 1, "query": { "bool": { "must": [ { "term": { "awardPunishmentType": { "value": 1, "boost": 1.0 } } }, { "term": { "formType": { "value": 1, "boost": 1.0 } } }, { "range": { "createdTime": { "from": "2024-11-11 16:00:00", "to": null, "include_lower": true, "include_upper": true, "boost": 1.0 } } }, { "range": { "createdTime": { "from": null, "to": "2024-11-12 15:59:59", "include_lower": true, "include_upper": true, "boost": 1.0 } } } ], "adjust_pure_negative": true, "boost": 1.0 } }, "track_total_hits": 2147483647, "aggs": { "billingOrgCodeTerms": { "terms": { "field": "billingOrgCodeL1", "size": 10, "min_doc_count": 1, "shard_min_doc_count": 0, "show_term_doc_count_error": false }, "aggregations": { "orgCodeCount": { "value_count": { "field": "billingOrgCodeL1" } }, "auditStatus_10": { "filter": { "term": { "auditStatus": "10" } } }, "auditStatus_20": { "filter": { "term": { "auditStatus": "20" } } }, "auditStatus_30": { "filter": { "term": { "auditStatus": "30" } } }, "auditStatusPassPercentage": { "bucket_script": { "buckets_path": { "auditStatusCount": "auditStatus_30._count", // 引用 auditStatus_30 的 doc_count 通过率 "orgCodeCount": "orgCodeCount" }, "script": "params.auditStatusCount / params.orgCodeCount * 100" } } } } } }
group by sum+count去重统计
根据统计各个单据类型的总金额以及关联人数,并根据员工编号去重
select awardPunishmentType,sum(awardPunishmentAmount),count(DISTINCT acceptEmpCode) from table group by awardPunishmentType
{ "aggs": { "awardPunishmentType": { "terms": { "field": "awardPunishmentType" }, "aggs": { "punishmentAmount": { "sum": { "field": "awardPunishmentAmount" } }, "distinctEmpcodeCount": { "cardinality": { "field": "acceptEmpCode" } } } } } }
输出

{ "took": 1, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, "hits": { "total": { "value": 7, "relation": "eq" }, "max_score": null, "hits": [] }, "aggregations": { "awardPunishmentType": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0, "buckets": [ { "key": 1, "doc_count": 6, "empcodeCount": { "value": 3 }, "punishmentAmount": { "value": 70.0 } }, { "key": 2, "doc_count": 1, "empcodeCount": { "value": 1 }, "punishmentAmount": { "value": 10.0 } } ] } } }
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!