hive Group操作

GROUPING SETS使用

grouping sets是一种将多个group by 逻辑写在一个sql语句中的便利写法。

--GROUP BY a, b 设置分组字段
--GROUPING SETS ((a,b)) 分组方式
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b))
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b

--GROUP BY a, b GROUPING SETS ((a,b), a)
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b), a)

等于

SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b 
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a


--GROUP BY a, b GROUPING SETS (a,b)
SELECT a,b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS (a,b)
等于

SELECT a, null, SUM(c) FROM tab1 GROUP BY a 
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b

--GROUP BY a, b GROUPING SETS ((a, b), a, b, ())
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a, b), a, b, ())

等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b 
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b 
UNION ALL
SELECT null, null, SUM(c) FROM tab1

常用于计算各种组合的报表数据。

select * from (
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b 
UNION ALL
SELECT a, null, SUM( c ) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM( c ) FROM tab1 GROUP BY b 
UNION ALL
SELECT null, null, SUM( c ) FROM tab1
    ) d 
        情况1 : 
        where d.a is null and d.b is null;
        情况2 : 
        where d.a is null and d.b is not null;
        情况2 : 
        where d.a is not null and d.b is null;
        情况4 : 
        where d.a is not null and d.b is not null;

示例
创建student_grouping表

CREATE TABLE student_grouping(
id int,
name string,
age int,
sex string 
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

-- 分组
group by age group by sex group by age,sex 查询全部

给表导入数据

1   name1   12  boy
2   name2   12  boy
3   name3   13  girl
4   name4   13  boy
5   name5   14  boy
6   name6   14  boy
7   name7   15  girl
8   name8   15  girl

查询数据
查询分组数据

SELECT age, sex, count(id) FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,());
GROUP BY age, sex 设置分组字段
GROUPING SETS ((age,sex),age,sex,()) 分组方式

Total MapReduce CPU Time Spent: 4 seconds 750 msec
OK
NULL    NULL    8
NULL    boy     5
NULL    girl    3
12      NULL    2
12      boy     2
13      NULL    2
13      boy     1
13      girl    1
14      NULL    2
14      boy     2
15      NULL    2
15      girl    2

等于
select age, sex, count(id) from student_grouping group by age,sex; = (age,sex)
以age,sex为组
Total MapReduce CPU Time Spent: 9 seconds 550 msec
OK
12      boy     2
13      boy     1
13      girl    1
14      boy     2
15      girl    2

select age, null, count(id) from student_grouping group by age; = age
以age为组
Total MapReduce CPU Time Spent: 8 seconds 920 msec
OK
12      NULL    2
13      NULL    2
14      NULL    2
15      NULL    2

select null, sex, count(id) from student_grouping group by sex; = sex
以sex为组
Total MapReduce CPU Time Spent: 8 seconds 920 msec
OK
NULL    boy     5
NULL    girl    3

select null,null,count(id) from student_grouping; = ()
求总数
Total MapReduce CPU Time Spent: 9 seconds 190 msec
OK
NULL    NULL    8

grouping sets加上if,case用法

select * from 
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a  
where a.age is null and a.sex is not null;

select 
if(age is not null,age,'ALL'),sex,c from 
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a  
where a.age is null and a.sex is not null;

select 
age,if(sex is not null,sex,'ALL'),c from 
(SELECT age, sex, count(id) c FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,())) a  
where a.age is not null and a.sex is null;

-- 除了if 还可以使用 case when
SELECT 
if(age is not null, age, 'ALL'), 
case 
	when sex is not null then sex else 'ALL' end 
as age,count(id) 
FROM student_grouping 
GROUP BY age, sex GROUPING SETS ( (age,sex),age,sex,() );
age sex count
ALL	ALL	8 所有性别和所有年龄 = 8
ALL	boy	5 所有年龄段的男孩 = 5
ALL	girl	3 所有年龄段的女孩 = 3
12	ALL	2 12岁的所有孩子 = 2
12	boy	2 12岁的男孩 = 2
13	ALL	2
13	boy	1
13	girl	1
14	ALL	2
14	boy	2
15	ALL	2
15	girl	2

with cube 是group by中所有key的组合 非常多

根据提供的group by的分组字段的所有组合进行分组数据的查询
举例说明 :
假如 group by a,b,c
with cube =GROUPING SETS ((a,b,c),(a,b),(b,c),(a,c),a,b,c,())
所以上面的写法可简写成

SELECT if(age is not null, age, 'ALL'), 
case when sex is not null then sex else 'ALL' end as age,
count(id) 
FROM student_grouping 
GROUP BY age, sex with cube;

-- GROUP BY age, sex

-- with cube (age,sex),age,sex,()

原来的写法

SELECT if(age is not null, age, 'ALL'), 
case when sex is not null then sex
else 'ALL'
end as age,
count(id) 
FROM student_grouping 
GROUP BY age, sex GROUPING SETS ( (age,sex),age,sex,() );

with rollup是 按右侧递减的顺序组合

例如:
group by a,b,c
with rollup
GROUPING SETS ((a,b,c),(a,b),a,())

实际案例 : 年龄 性别 国家 --> 1. 年龄 性别 国家 2. 年龄 性别 3. 年龄 4 () with rollup
grouping sets()

右侧递减的意思就是从左到右 依次减去最右面的;

-- GROUP BY age, sex with rollup 等效于  GROUP BY age, sex GROUPING SETS ( (age,sex),age,() )
-- 相当于按右侧递减的顺序group by
SELECT if(age is not null, age, 'ALL'), 
case when sex is not null then sex else 'ALL' end as age,
count(id) FROM student_grouping GROUP BY age, sex with rollup;
--等于
SELECT if(age is not null, age, 'ALL'), 
case 
	when sex is not null then sex
	else 'ALL'
end as age,
count(id) FROM student_grouping GROUP BY age, sex
GROUPING SETS ( (age,sex),age,() );
posted @ 2020-05-31 23:49  我不是忘尘  阅读(350)  评论(0编辑  收藏  举报