hive Group操作
GROUPING SETS使用
grouping sets是一种将多个group by 逻辑写在一个sql语句中的便利写法。
--GROUP BY a, b 设置分组字段
--GROUPING SETS ((a,b)) 分组方式
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b))
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
--GROUP BY a, b GROUPING SETS ((a,b), a)
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b), a)
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
--GROUP BY a, b GROUPING SETS (a,b)
SELECT a,b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS (a,b)
等于
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b
--GROUP BY a, b GROUPING SETS ((a, b), a, b, ())
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a, b), a, b, ())
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b
UNION ALL
SELECT null, null, SUM(c) FROM tab1
常用于计算各种组合的报表数据。
select * from (
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b
UNION ALL
SELECT a, null, SUM( c ) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM( c ) FROM tab1 GROUP BY b
UNION ALL
SELECT null, null, SUM( c ) FROM tab1
) d
情况1 :
where d.a is null and d.b is null;
情况2 :
where d.a is null and d.b is not null;
情况2 :
where d.a is not null and d.b is null;
情况4 :
where d.a is not null and d.b is not null;
示例
创建student_grouping表
CREATE TABLE student_grouping(
id int,
name string,
age int,
sex string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 分组
group by age group by sex group by age,sex 查询全部
给表导入数据
1 name1 12 boy
2 name2 12 boy
3 name3 13 girl
4 name4 13 boy
5 name5 14 boy
6 name6 14 boy
7 name7 15 girl
8 name8 15 girl
查询数据
查询分组数据
SELECT age, sex, count(id) FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,());
GROUP BY age, sex 设置分组字段
GROUPING SETS ((age,sex),age,sex,()) 分组方式
Total MapReduce CPU Time Spent: 4 seconds 750 msec
OK
NULL NULL 8
NULL boy 5
NULL girl 3
12 NULL 2
12 boy 2
13 NULL 2
13 boy 1
13 girl 1
14 NULL 2
14 boy 2
15 NULL 2
15 girl 2
等于
select age, sex, count(id) from student_grouping group by age,sex; = (age,sex)
以age,sex为组
Total MapReduce CPU Time Spent: 9 seconds 550 msec
OK
12 boy 2
13 boy 1
13 girl 1
14 boy 2
15 girl 2
select age, null, count(id) from student_grouping group by age; = age
以age为组
Total MapReduce CPU Time Spent: 8 seconds 920 msec
OK
12 NULL 2
13 NULL 2
14 NULL 2
15 NULL 2
select null, sex, count(id) from student_grouping group by sex; = sex
以sex为组
Total MapReduce CPU Time Spent: 8 seconds 920 msec
OK
NULL boy 5
NULL girl 3
select null,null,count(id) from student_grouping; = ()
求总数
Total MapReduce CPU Time Spent: 9 seconds 190 msec
OK
NULL NULL 8
grouping sets加上if,case用法
select * from
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a
where a.age is null and a.sex is not null;
select
if(age is not null,age,'ALL'),sex,c from
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a
where a.age is null and a.sex is not null;
select
age,if(sex is not null,sex,'ALL'),c from
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a
where a.age is not null and a.sex is null;
-- 除了if 还可以使用 case when
SELECT
if(age is not null, age, 'ALL'),
case
when sex is not null then sex else 'ALL' end
as age,count(id)
FROM student_grouping
GROUP BY age, sex GROUPING SETS ( (age,sex),age,sex,() );
age sex count
ALL ALL 8 所有性别和所有年龄 = 8
ALL boy 5 所有年龄段的男孩 = 5
ALL girl 3 所有年龄段的女孩 = 3
12 ALL 2 12岁的所有孩子 = 2
12 boy 2 12岁的男孩 = 2
13 ALL 2
13 boy 1
13 girl 1
14 ALL 2
14 boy 2
15 ALL 2
15 girl 2
with cube 是group by中所有key的组合 非常多
根据提供的group by的分组字段的所有组合进行分组数据的查询
举例说明 :
假如 group by a,b,c
with cube =GROUPING SETS ((a,b,c),(a,b),(b,c),(a,c),a,b,c,())
所以上面的写法可简写成
SELECT if(age is not null, age, 'ALL'),
case when sex is not null then sex else 'ALL' end as age,
count(id)
FROM student_grouping
GROUP BY age, sex with cube;
-- GROUP BY age, sex
-- with cube (age,sex),age,sex,()
原来的写法
SELECT if(age is not null, age, 'ALL'),
case when sex is not null then sex
else 'ALL'
end as age,
count(id)
FROM student_grouping
GROUP BY age, sex GROUPING SETS ( (age,sex),age,sex,() );
with rollup是 按右侧递减的顺序组合
例如:
group by a,b,c
with rollup
GROUPING SETS ((a,b,c),(a,b),a,())
实际案例 : 年龄 性别 国家 --> 1. 年龄 性别 国家 2. 年龄 性别 3. 年龄 4 () with rollup
grouping sets()
右侧递减的意思就是从左到右 依次减去最右面的;
-- GROUP BY age, sex with rollup 等效于 GROUP BY age, sex GROUPING SETS ( (age,sex),age,() )
-- 相当于按右侧递减的顺序group by
SELECT if(age is not null, age, 'ALL'),
case when sex is not null then sex else 'ALL' end as age,
count(id) FROM student_grouping GROUP BY age, sex with rollup;
--等于
SELECT if(age is not null, age, 'ALL'),
case
when sex is not null then sex
else 'ALL'
end as age,
count(id) FROM student_grouping GROUP BY age, sex
GROUPING SETS ( (age,sex),age,() );