【ES】ElasticSearch中"distinct","count"和"group by"的实现

distinct

实现 1:

SELECT DISTINCT(user_id) FROM table WHERE user_id_type = 3;

{
  "query": {
    "term": {
      "user_id_type": 3
    }
  },
  "collapse": {
    "field": "user_id"
  }
}

注:有时候这个 user_id 不是keyword的话,就需要在它后面加一个keyword,不然不支持的
报错大概是这个样子的:

{
    "error": {
        "root_cause": [
            {
                "type": "search_context_exception",
                "reason": "unknown type for collapse field `user_id`, only keywords and numbers are accepted"
            }
        ],
        "type": "search_phase_execution_exception",
        "reason": "all shards failed",
        "phase": "query",
        "grouped": true,
        "failed_shards": [
            {
                "shard": 0,
                "index": "event-*",
                "node": "98_Cvv7KT8m9jmN8",
                "reason": {
                    "type": "search_context_exception",
                    "reason": "unknown type for collapse field `user_id`, only keywords and numbers are accepted"
                }
            }
        ]
    },
    "status": 500
}

加上keyword就是这样子:

{
  "query": {
    "term": {
      "user_id_type": 3
    }
  },
  "collapse": {
    "field": "user_id.keyword"
  }
}

输出:

{
  ...
  "hits": {
    "hits": [
      {
        "_index": "es_qd_mkt_visitor_packet_dev_v1_20180621",
        "_type": "ad_crowd",
        "_source": {
          "user_id": "wx2af8414b502d4ca2_oHtrD0Vxv-_8c678figJNHmtaVQQ",
          "user_id_type": 3
        },
        "fields": {
          "user_id": [
            "wx2af8414b502d4ca2_oHtrD0Vxv-_8c678figJNHmtaVQQ"
          ]
        }
      }
    ]
  }
}

count + distinct

SELECT COUNT(DISTINCT(user_id)) FROM table WHERE user_id_type = 3;

去重后统计总数

{
  "query": {
    "term": {
      "user_id_type": 3
    }
  },
  "aggs": {
    "count": {
      "cardinality": {
        "field": "user_id.keyword"
      }
    }
  }
}

结果:

{
  ...
  "hits": {
  ...
  },
  "aggregations": {
    "count": {
      "value": 7
    }
  }
}

这里的 hits 是匹配的其中一条数据,是做示例用的。如果只需要总数,这个 hits 感觉没啥用。
真正的结果是在 aggregations.count.value

count + group by

SELECT COUNT(user_id) FROM table GROUP BY user_id_type;

{
  "aggs": {
    "user_type": {
      "terms": {
        "field": "user_id_type.keyword"
      }
    }
  }
}

结果:

{
  ...
  "hits": {
    ...
  },
  "aggregations": {
    "user_type": {
      ...
      "buckets": [
        {
          "key": 4,
          "doc_count": 1220
        },
        {
          "key": 3,
          "doc_count": 488
        }
      ]
    }
  }
}

count + distinct + group by

SELECT COUNT(DISTINCT(user_id)) FROM table GROUP BY user_id_type;

{
  "aggs": {
    "user_type": {
      "terms": {
        "field": "user_id_type.keyword"
      },
      "aggs": {
        "count": {
          "cardinality": {
            "field": "user_id.keyword"
          }
        }
      }
    }
  }
}

** 总结:对于既有group by又有distinct的查询要求,需要在aggs中嵌套子aggs **

{
  ...
  "hits": {
    ...
  },
  "aggregations": {
    "user_type": {
      ...
      "buckets": [
        {
          "key": 4,
          "doc_count": 1220, //去重前数据1220条
          "count": {
            "value": 276 //去重后数据276条
          }
        },
        {
          "key": 3,
          "doc_count": 488, //去重前数据488条
          "count": {
            "value": 121 //去重后数据121条
          }
        }
      ]
    }
  }
}
posted @ 2023-08-17 14:32  aaacarrot  阅读(1753)  评论(0编辑  收藏  举报