Spark_Sql pivot函数
CREATE TABLE test_order_info
(uname STRING, product STRING, age INT, city string, amount decimal);
INSERT INTO test_order_info VALUES
( 'zhang3' , 'tv' , 22 , 'bj', 3000),
( 'li4', 'notebook', 41, 'bj', 8000),
( 'wang5', 'phone', 32, 'sh', 4000),
( 'zhao6', 'notebook', 22, 'sz', 3000),
( 'zhang3', 'phone', 22, 'bj', 3000),
( 'li4', 'tv', 41, 'sz', 4000);
pivot ,Spark-sql和Oracle特有关键词,即旋转,将指列的字段值,旋转成为多个列。并且可以指定某些列成为旋转列的聚合值。
把整个表整理成3种列:维度列、旋转列、聚合列
格式:
select * from tablename pivot ( sum(聚合列) as 列标识 for 旋转列 in( 旋转列值1 ,旋转列值2,旋转列值3) )
旋转列:原来是某个列的行值,现在要从行值要转为列的字段。(上例的product)
聚合列: 对于旋转后的新列,要聚合的值。(上例的amount)
除了旋转列和聚合列,默认都是维度列,如果存在这三种以外的字段,需要提前用子查询去除。(上例的uname ,age)
--将制定列的字段 转换成 新表的列名 同时剩下的都会变成维度列 用来聚合
select * from (select uname,product,age,amount from test_order_info) ts pivot ( sum(amount) as amount for
product in ("tv","phone","notebook"))
uname age tv phone notebook
zhang3 22 3000 3000 NULL
zhao6 22 NULL NULL 3000
li4 41 4000 NULL 8000
wang5 32 NULL 4000 NULL
spark-sql:
sum(NULL)=NULL
count(NULL)=0
create table test_unpivot as
select uname,age,
sum( case when product="phone" then amount else null end ) phone,
sum( case when product="tv" then amount else null end ) tv,
sum( case when product="notebook" then amount else null end ) notebook
from test_order_info group by uname,age;
uname age phone tv notebook
zhang3 22 3000 3000 NULL
zhao6 22 NULL NULL 3000
li4 41 NULL 4000 8000
wang5 32 4000 NULL NULL
--spark-sql不支持 unpivot 可以使用stack代替
--这是oracle的
逆透视(unpivot)的处理流程是:
unpivotcolumn_name_list 是逆透视列的列表,其列值是相兼容的,能够存储在一个column中
保持其他列( 除unpivotcolumn_name_list 之外的所有列)的列值不变
依次将 unpivotcolumn 的列名存储到 newcolumn_store_unpivotcolumn_name 字段中,将 unpivotcolumn 的列值存储到 newcolumn_store_unpivotcolumn_value 字段中
--将phone, tv, notebook 的列名和列值存储到字段:product和Orders中
SELECT uname, age, product, amount
FROM test_unpivot as t
UNPIVOT
(amount FOR product IN
(phone, tv, notebook)
)AS unpvt;
--spark-sql
stack()
stack(n, expr1, ..., exprk) - 会将expr1, ..., exprk 分割为n行.
select * from (
select uname,age,stack(3, 'phone',phone, 'tv', tv, 'notebook', notebook) as (product, amount) from test_unpivot ) a where product is not null and amount is not null;
uname age product amount
zhang3 22 phone 3000
zhang3 22 tv 3000
zhao6 22 notebook 3000
li4 41 tv 4000
li4 41 notebook 8000
wang5 32 phone 4000
2.
--列出条件 套进去公式实现
维度列、product
旋转列、city
聚合列 age amount
select * from ( select product,age,city,amount from test_order_info ) t pivot ( sum(amount) as amount , avg(age) as age for city in ("sh","bj","sz") );
3.
create table test_user_tags (uid string, tag_code STRING, tag_value STRING);
INSERT INTO test_user_tags VALUES
( '101','gender' ,'f' ),
( '102', 'gender', 'm' ),
( '103', 'gender', 'm' ),
( '104', 'gender', 'f' ),
( '105', 'gender', 'm' ),
( '106', 'gender', 'f' ),
( '101','age' ,'60' ),
( '102', 'age', '70' ),
( '103', 'age', '80' ),
( '104', 'age', '70' ),
( '105', 'age', '90' ),
( '106', 'age', '90' ) ,
( '101','amount' ,'422' ),
( '102', 'amount', '4443' ),
( '103', 'amount', '12000' ),
( '104', 'amount', '6664' ),
( '105', 'amount', '900' ),
( '106', 'amount', '2000' ) ;
--1.
select uid ,
max(case when tag_code="gender" then tag_value end ) gender ,
max(case when tag_code="age" then tag_value end ) age ,
max(case when tag_code="amount" then tag_value end ) amount
from test_user_tags
group by uid;
uid gender age amount
101 f 60 422
104 f 70 6664
102 m 70 4443
103 m 80 12000
106 f 90 2000
105 m 90 90
--2.join
select a.uid, a.tag_value as gender , b.tag_value as amount , c.tag_value as age from
(select * from
test_user_tags where tag_code ="gender") a
join
(select * from
test_user_tags where tag_code ="amount") b
on a.uid = b .uid
join
(select * from
test_user_tags where tag_code ="age") c
on a.uid = c.uid
--3.pivot
维度列:uid
旋转列:tag_code
聚合列:tag_value
select * from test_user_tags t pivot ( max(tag_value) as tag_value for tag_code in ("gender","age","amount") ) ;
select * from test_user_tags t pivot (concat_ws ("",collect_list(tag_value)) as tag_value for tag_code in ("gender","age","amount") ) ;
uid gender age amount
101 f 60 422
104 f 70 6664
102 m 70 4443
103 m 80 12000
106 f 90 2000
105 m 90 900
select * from ( select uid , 'tg_preson_base_gender' as tag_code ,tag_value from tg_preson_base_gender where dt= '2020-06-14' union all select uid , 'tg_preson_base_agegroup' as tag_code ,tag_value from tg_preson_base_agegroup where dt= '2020-06-14' ) t pivot (max(tag_value) as tag_value for tag_code in ( 'tg_preson_base_gender' , 'tg_preson_base_agegroup' ) )
limit 10;
绝不摆烂