Spark_Sql pivot函数

CREATE TABLE  test_order_info 
(uname STRING, product  STRING, age INT, city string, amount decimal);

INSERT INTO test_order_info VALUES
    (  'zhang3' ,  'tv' ,   22  , 'bj',   3000),
    (  'li4',   'notebook',  41,  'bj',  8000),
    (  'wang5',  'phone',  32,  'sh',  4000),
    (  'zhao6',  'notebook', 22,  'sz',  3000),
    (  'zhang3',  'phone',  22,  'bj',  3000),
    (  'li4',      'tv',    41,   'sz',  4000);


pivot ，Spark-sql和Oracle特有关键词，即旋转，将指列的字段值，旋转成为多个列。并且可以指定某些列成为旋转列的聚合值。

把整个表整理成3种列：维度列、旋转列、聚合列
格式：
select * from tablename pivot ( sum(聚合列) as 列标识  for 旋转列 in( 旋转列值1 ,旋转列值2,旋转列值3) )   
旋转列：原来是某个列的行值，现在要从行值要转为列的字段。（上例的product）
聚合列: 对于旋转后的新列，要聚合的值。（上例的amount）
除了旋转列和聚合列，默认都是维度列,如果存在这三种以外的字段，需要提前用子查询去除。（上例的uname ,age）

--将制定列的字段 转换成 新表的列名  同时剩下的都会变成维度列 用来聚合
select * from (select uname,product,age,amount from test_order_info) ts pivot ( sum(amount) as amount for 
product in ("tv","phone","notebook"))

uname	age	tv	phone	notebook
zhang3	22	3000	3000	NULL
zhao6	22	NULL	NULL	3000
li4	41	4000	NULL	8000
wang5	32	NULL	4000	NULL




spark-sql:
sum(NULL)=NULL
count(NULL)=0



create table  test_unpivot  as 
select uname,age,
sum( case when product="phone" then amount else null  end ) phone,
sum( case when product="tv" then amount else null  end ) tv,
sum( case when product="notebook" then amount else null  end ) notebook
from test_order_info group by uname,age; 

uname	age	phone	tv	notebook
zhang3	22	3000	3000	NULL
zhao6	22	NULL	NULL	3000
li4	41	NULL	4000	8000
wang5	32	4000	NULL	NULL




--spark-sql不支持 unpivot 可以使用stack代替
--这是oracle的
逆透视（unpivot）的处理流程是：

unpivotcolumn_name_list 是逆透视列的列表，其列值是相兼容的，能够存储在一个column中
保持其他列（ 除unpivotcolumn_name_list 之外的所有列）的列值不变
依次将 unpivotcolumn 的列名存储到 newcolumn_store_unpivotcolumn_name 字段中，将 unpivotcolumn 的列值存储到 newcolumn_store_unpivotcolumn_value 字段中



--将phone, tv, notebook 的列名和列值存储到字段：product和Orders中
SELECT uname, age, product, amount  
FROM test_unpivot  as t
UNPIVOT  
(amount FOR product IN   
      (phone, tv, notebook)  
)AS unpvt;  



--spark-sql
stack()
stack(n, expr1, ..., exprk) - 会将expr1, ..., exprk 分割为n行.


select * from (
select uname,age,stack(3, 'phone',phone, 'tv', tv, 'notebook', notebook) as (product, amount) from  test_unpivot ) a  where product is not null and amount is not null;

uname	age	product	amount
zhang3	22	phone	3000
zhang3	22	tv	3000
zhao6	22	notebook	3000
li4	41	tv	4000
li4	41	notebook	8000
wang5	32	phone	4000



2.

--列出条件 套进去公式实现
维度列、product
旋转列、city
聚合列  age amount

select  * from  ( select product,age,city,amount from  test_order_info ) t  pivot (  sum(amount) as amount , avg(age) as age  for city in ("sh","bj","sz") );

3.

create table test_user_tags (uid string, tag_code STRING, tag_value STRING);
INSERT INTO test_user_tags VALUES
    (  '101','gender' ,'f'  ),
    (  '102', 'gender', 'm' ),
    (  '103', 'gender', 'm' ),
    (  '104', 'gender', 'f' ),
    (  '105', 'gender', 'm' ),
    (  '106', 'gender', 'f' ),
    (  '101','age' ,'60'  ),
    (  '102', 'age', '70' ),
    (  '103', 'age', '80' ),
    (  '104', 'age', '70' ),
    (  '105', 'age', '90' ),
    (  '106', 'age', '90' ) ,
    (  '101','amount' ,'422'  ),
    (  '102', 'amount', '4443' ),
    (  '103', 'amount', '12000' ),
    (  '104', 'amount', '6664' ),
    (  '105', 'amount', '900' ),
    (  '106', 'amount', '2000' ) ;




--1.
select  uid ,
max(case when tag_code="gender" then tag_value end  ) gender , 
max(case when tag_code="age" then tag_value end ) age , 
max(case when tag_code="amount" then tag_value end )   amount
from test_user_tags
group by uid;

uid	gender	age	amount
101	f	60	422
104	f	70	6664
102	m	70	4443
103	m	80	12000
106	f	90	2000
105	m	90	90

--2.join
select  a.uid, a.tag_value as gender , b.tag_value as amount  , c.tag_value   as age  from 

(select * from 
test_user_tags where  tag_code ="gender") a
join 
(select * from 
test_user_tags where  tag_code ="amount") b
on  a.uid = b .uid 
join 
(select * from 
test_user_tags where  tag_code ="age") c
on  a.uid = c.uid

--3.pivot

维度列:uid
旋转列:tag_code
聚合列:tag_value

select  * from test_user_tags t pivot ( max(tag_value)  as tag_value  for tag_code in ("gender","age","amount") ) ;
select  * from test_user_tags t pivot (concat_ws ("",collect_list(tag_value))  as tag_value  for tag_code in ("gender","age","amount") ) ;

uid	gender	age	amount
101	f	60	422
104	f	70	6664
102	m	70	4443
103	m	80	12000
106	f	90	2000
105	m	90	900




select * from  ( select uid ,  'tg_preson_base_gender' as tag_code ,tag_value  from  tg_preson_base_gender where dt= '2020-06-14'  union all  select uid ,  'tg_preson_base_agegroup' as tag_code ,tag_value  from  tg_preson_base_agegroup where dt= '2020-06-14' ) t pivot (max(tag_value) as tag_value for tag_code in ( 'tg_preson_base_gender' , 'tg_preson_base_agegroup' ) ) 
limit 10;
posted @ 2021-09-29 16:52 超级无敌小剑阅读(1437) 评论(0) 编辑收藏举报
刷新页面返回顶部
超级无敌小剑

Spark_Sql pivot函数

公告