Elasticsearch - 聚合查询(二)

以电视为案例,熟悉各类聚合查询

PUT /tvs
PUT /tvs/_mapping
{
  "properties": {
    "price": {
      "type": "long"
    },
    "color": {
      "type": "keyword"
    },
    "brand": {
      "type": "keyword"
    },
    "tv_date": {
      "type": "date"
    }
  }
}

# 批量插入数据
POST /tvs/_bulk
{"index": {}}
{"price": 2000, "color": "蓝色", "brand": "小米", "tv_date": "2024-01-10"}
{"index": {}}
{"price": 1000, "color": "red", "brand": "长虹", "tv_date": "2023-01-10"}
{"index": {}}
{"price": 2000, "color": "red", "brand": "长虹", "tv_date": "2023-02-10"}
{"index": {}}
{"price": 3000, "color": "green", "brand": "小米", "tv_date": "2023-07-10"}
{"index": {}}
{"price": 1500, "color": "蓝色", "brand": "TCL", "tv_date": "2023-05-14"}
{"index": {}}
{"price": 1700, "color": "green", "brand": "TCL", "tv_date": "2023-05-17"}
{"index": {}}
{"price": 2000, "color": "red", "brand": "长虹", "tv_date": "2024-02-10"}
{"index": {}}
{"price": 8000, "color": "red", "brand": "三星", "tv_date": "2024-01-10"}
{"index": {}}
{"price": 2000, "color": "蓝色", "brand": "小米", "tv_date": "2023-01-10"}

统计哪种颜色的电视销量最高

count:bucket,terms,自动就会有一个doc_count,就相当于是count

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "group_color": {
      "terms": {
        "field": "color"
      }
    }
  }
}

返回结果:

返回结果解析
hits.hits:我们指定了size是0,所以hits.hits就是空的
aggregations:聚合结果
group_color:我们指定的某个聚合的名称
buckets:根据我们指定的field划分出的buckets
key:每个bucket对应的那个值
doc_count:这个bucket分组内,有多少个数据数量,其实就是这种颜色的销量

每种颜色对应的bucket中的数据的默认的排序规则:按照doc_couunt降序排序

统计每种颜色电视平均价格

avg:avg aggs,求平均值

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "group_color": {
      "terms": {
        "field": "color"
      },
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}

每个颜色下,平均价格及每个颜色下,每个品牌的平均价格

步骤:

  1. 颜色分组
  2. 颜色的平均价格
  3. 颜色下品牌分组
  4. 颜色下品牌的平均价格
GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "group_color": { # 1. 颜色分组
      "terms": {
        "field": "color"
      },
      "aggs": {
        "avg_color_price": {# 2. 颜色的平均价格
          "avg": {
            "field": "price"
          }
        },
        "group_color_brand": {# 3. 颜色下品牌分组
          "terms": {
            "field": "brand"
          },
          "aggs": {
            "avg_color_brand_price": { # 4. 颜色下品牌的平均价格
              "avg": {
                "field": "price"
              }
            }
          }
        }
      }
    }
  }
}

返回结果:

{
  "took": 63,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 8,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "group_color": { # 颜色分组
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "red",
          "doc_count": 4,
          "group_color_brand": { # 颜色下品牌分组
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "长虹",
                "doc_count": 3,
                "avg_color_brand_price": { # 颜色下品牌的平均价格
                  "value": 1666.6666666666667
                }
              },
              {
                "key": "三星",
                "doc_count": 1,
                "avg_color_brand_price": {
                  "value": 8000
                }
              }
            ]
          },
          "avg_color_price": { # 颜色的平均价格
            "value": 3250
          }
        },{...},{...}
      ]
    }
  }
}

求出每个颜色的销售数量、平均价格、最大价格、最小价格、价格总和

count:bucket,terms,自动就会有一个doc_count,就相当于于是count

avg:avgaggs,求平均值

max:求一个bucket内,指定field值最大的那个数据

min:求一个bucket内,指定field值最小的那个数据

sum:求一个bucket内,指定field值的总和

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "group_color": {
      "terms": {
        "field": "color"
      },
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "price"
          }
        },
        "max_price": {
          "max": {
            "field": "price"
          }
        },
        "min_price": {
          "min": {
            "field": "price"
          }
        },
        "sum_price": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}

划分范围histogram,求出价格每2000为一个区间,每个区间的销售总额

histogram:类似于terms,也是进行bucket分组操作,接收一个field,按照这个field的值的各个范围区间,进行bucket分组操作

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "price_histogram": {
      "histogram": {
        "field": "price",
        "interval": 2000
      },
      "aggs": {
        "sum": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}

结果:

"aggregations": {
    "price_histogram": {
      "buckets": [
        {
          "key": 0,
          "doc_count": 3,
          "sum": {
            "value": 4200
          }
        },
        {
          "key": 2000,
          "doc_count": 4,
          "sum": {
            "value": 9000
          }
        },
        {
          "key": 4000,
          "doc_count": 0,
          "sum": {
            "value": 0
          }
        },
        {
          "key": 6000,
          "doc_count": 0,
          "sum": {
            "value": 0
          }
        },
        {
          "key": 8000,
          "doc_count": 1,
          "sum": {
            "value": 8000
          }
        }
      ]
    }
  }

interval:2000,划分范围,02000,2000-4000,40006000,6000~8000, 8000~10000,buckets

bucket有了之后,一样的,去对每个bucket执行avg,count,suim,max,min,等各种metric操作,聚合分析

按照日期分组聚合,求出每个月销售个数

date_histogram:按照我们指定的某个date类型的日期field以及日期

calendar_interval: 只支持单位时间,值如下:

1m: minute
1h: hour
1d: day
1w: week
1M: month
1q: quarter(季度)
1y: year

fixed_interval: 支持指定多个单位时间,值如下:

ms: milliseconds(毫秒)
s: seconds
m: minutes
h: hours
d: days

min_doc_count:即使某个日期interval, 2017-01-01-01-011-31中,一条数据都没有, 那么这个区间也是要返回,不然默认是会过滤掉这个区间的

extended_bounds,min,max:划分bucket的时候,会限定在这个起始日
期和截止日期内

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "date_sales": {
      "date_histogram": {
        "field": "tv_date",
        "calendar_interval": "1M", 
        "format": "yyyy-MM-dd",
        "min_doc_count": 0,
        "extended_bounds": {
          "min": "2023-02-01",
          "max": "2023-06-01"
        }
      }
    }
  }
}

统计每季度每个品牌的销售额,及每个季度销售总额

{
  "size": 0,
  "aggs": {
    "data_sales": { # 按照季度分桶
      "date_histogram": {
        "field": "tv_date",
        "calendar_interval": "quarter",
        "format": "yyyy-MM-dd"
      },
      "aggs": {
        "group_brand": { #每个季度下的品牌分桶
          "terms": {
            "field": "brand"
          },
          "aggs": {
            "brand_price_sum": {# 每个季度下的品牌销售总额
              "sum": {
                "field": "price"
              }
            }
          }
        },
        "sum_quarter": { #每个季度销售总额
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}

搜索与聚合结合,查询某个品牌按颜色销量

GET /tvs/_search
{
  "size": 0, 
  "query": {
    "term": {
      "brand": {
        "value": "小米"
      }
    }
  },
  "aggs": {
    "group_color": {
      "terms": {
        "field": "color"
      }
    }
  }
}

global bucket:单个品牌与所有品牌均价对比

GET /tvs/_search
{
  "size": 0, 
  "query": {
    "term": {
      "brand": {
        "value": "小米"
      }
    }
  },
  "aggs": {
    "xiaomi_avg": {
      "avg": {
        "field": "price"
      }
    },
    "all": {
      "global": {}, # 查询全局忽略掉上边的查询条件
      "aggs": {
        "all_avg_price": {
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}

返回结果:

  "aggregations": {
    "all": {
      "doc_count": 8,
      "all_avg_price": {
        "value": 2650
      }
    },
    "xiaomi_avg": {
      "value": 2500
    }
  }

过滤+聚合:统计价格大于1200的电视平均价格

GET /tvs/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "price": {
            "gte": 1200
          }
        }
      }
    }
  },
  "size": 0,
  "aggs": {
    "avg_price": {
      "avg": {
        "field": "price"
      }
    }
  }
}

bucket filter:统计某品牌最近一个月的平均价格

GET /tvs/_search
{
  "query": { # 查询品牌
    "term": {
      "brand": {
        "value": "小米"
      }
    }
  },
  "size": 0,
  "aggs": {
    "recent_1year": { # 查询近1年的数据
      "filter": {
        "range": {
          "tv_date": {
            "gte": "now-1y" # tv_date >= 当前时间-1年
          }
        }
      },
      "aggs": {
        "recent_1year_avg_price": { # 查询近1年的数据取平均值
          "avg": {
            "field": "price"
          }
        }
      }
    },
    "recent_180d": { # 查询近180天的数据
      "filter": {
        "range": {
          "tv_date": {
            "gte": "now-180d" # tv_date >= 当前时间-180day
          }
        }
      },
      "aggs": {
        "recent_180d_avg_price": { # 查询近180天的数据取平均值
          "avg": {
            "field": "price"
          }
        }
      }
    },
    "recent_30d": { # 查询近30天的数据
      "filter": {
        "range": {
          "tv_date": {
            "gte": "now-30d"
          }
        }
      },
      "aggs": {
        "recent_30d_avg_price": { # 查询近30天的数据取平均值
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}

aggs.filter: 针对的是聚合去做的。
如果放query里面的filter,是全局的,会对所有的数据都有影响。
bucket filter:对不同的bucket下的aggs,进行filter。

返回结果:

  "aggregations": {
    "recent_30d": {
      "doc_count": 0,
      "recent_30d_avg_price": { # 近30天没有数据, 平均价格为0
        "value": null
      }
    },
    "recent_1year": {
      "doc_count": 3,
      "recent_1year_avg_price": {
        "value": 2333.3333333333335
      }
    },
    "recent_180d": {
      "doc_count": 2,
      "recent_180d_avg_price": {
        "value": 2000
      }
    }
  }

按每种颜色的平均销售额降序排序

GET /tvs/_search
{
  "size": 0,
  "aggs": {
    "group_color": { # 1. 颜色分组
      "terms": {
        "field": "color",
        "order": { # 3. 颜色的平均值降序排列。可以直接用子聚合里边的变量排序
          "avg_price": "desc"
        }
      },
      "aggs": {
        "avg_price": { # 2. 颜色的平均值
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}
posted @ 2024-05-07 10:56  py卡卡  阅读(6)  评论(0编辑  收藏  举报