一、函数

1.1、DATE_FORMAT
select DATE_FORMAT(CAST(create AS TIMESTAMP), '%Y-%m-%d') create_day from u
1.2、CAST
select DATE_FORMAT(CAST(create AS TIMESTAMP), '%Y-%m-%d') create_day from u
1.3、IF
select IF(status = 1, 1, 0) online from u
1.4、presto位运算
select bitwise_and(aa,7) from u
1.5、case when else end
select CASE
WHEN last_offline = 0
THEN 0
ELSE ROUND(1.0000 *(online - last_offline) / last_offline * 100, 4)
END AS offline_ratio from u
1.6、开窗
select row_number() over(partition by telphone,name order by to_date(create_date) asc) as flag from u
1.7、lag和lead
函数语法如下:
lag(exp_str,offset,defval) over(partion by ..order by …)
lead(exp_str,offset,defval) over(partion by ..order by …)
其中exp_str是字段名
Offset是偏移量,即是上1个或上N个的值,假设当前行在表中排在第5行,则offset 为3,则表示我们所要找的数据行就是表中的第2行(即5-3=2)。
Defval默认值,当两个函数取上N/下N个值,当在表中从当前行位置向前数N行已经超出了表的范围时,lag()函数将defval这个参数值作为函数的返回值,若没有指定默认值,则返回NULL,那么在数学运算中,总要给一个默认值才不会出错。
select lag(b.online, 1) over(order by b.create_day) AS last_online from u
1.8、get_json_object
返回值:String
说明:解析json的字符串json_string,返回path指定的内容。如果输入的json字符串无效,那么返回NUll,这个函数每次只能返回一个数据项。
select get_json_object('{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}','$.movie')
1.9、json_tuple(jsonStr, k1, k2, ...)
参数为一组键k1,k2,。。。。。和json字符串,返回值的元组。该方法比get_json_object高效,因此可以在一次调用中输入多次键
select b.b_movie,b.b_rate,b.b_timeStamp,b.b_uid from json a lateral view
json_tuple(a.data,'movie','rate','timeStamp','uid') b as b_movie,b_rate,b_timeStamp,b_uid
1.10、explode
select array('A','B','C')
select explode(array('A','B','C'))
1.11、regexp_replace
select regexp_replace('\n123\n','\n','456') #456123456
select regexp_replace('\n123\n','\n|2','456') #45614563456

二、场景场景

2.1、计算累积值法一-笛卡尔积
select a.create_day,a.num,sum(b.num) as total from
(select create_day,count(1) num from u where flag=1 group by create_day) a,
(select create_day,count(1) num from u where flag=1 group by create_day) b,
where a.create_day >=b.create_day group by a.create_day,a.num
2.2、计算累积值法二-开窗
SELECT
create_day,
online,
lag(b.online, 1) over(order by b.create_day) AS last_online,
SUM(online) over(order by create_day) AS cumulate_onlime
FROM
(
SELECT
create_day,
SUM(total) total,
SUM(online) online,
SUM(offline) offline
FROM
(
SELECT
DATE_FORMAT(CAST(apply_time AS TIMESTAMP), '%Y-%m-%d') create_day,
1 total,
IF(status = 1, 1, 0) online,
IF(status != 1, 1, 0) offline
FROM
u
WHERE
dp = 'ACTIVE'
)
a
GROUP BY
create_day
)
b
2.3、列转行
CONCAT(string A/col, string B/col…):返回输入字符串连接后的结果,支持任意个输入字符串;
CONCAT_WS(separator, str1, str2,...):它是一个特殊形式的 CONCAT()。第一个参数剩余参数间的分隔符。分隔符可以是与剩余参数一样的字符串。如果分隔符是 NULL,返回值也将为 NULL。这个函数会跳过分隔符参数后的任何 NULL 和空字符串。分隔符将被加到被连接的字符串之间;
COLLECT_SET(col):函数只接受基本数据类型,它的主要作用是将某字段的值进行去重汇总,产生array类型字段
select
t1.base,
concat_ws('|', collect_set(t1.name)) name
from
(select
name,
concat(constellation, ",", blood_type) base
from
person_info) t1
group by
t1.base;
2.4、行转列
EXPLODE(col):将hive一列中复杂的array或者map结构拆分成多行。
LATERAL VIEW
用法:LATERAL VIEW udtf(expression) tableAlias AS columnAlias
解释:用于和split, explode等UDTF一起使用,它能够将一列数据拆成多行数据,在此基础上可以对拆分后的数据进行聚合
select
movie,
category_name
from
movie_info lateral view explode(category) table_tmp as category_name;
2.5、解析json数组成表字段
SELECT explode(split(regexp_replace(regexp_replace('[{"website":"www.baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]', '\\]|\\[',''),'\\}\\,\\{','\\}\\;\\{'),'\\;')) #{"website":"www.baidu.com","name":"百度"}
select json_tuple(json, 'website', 'name') from (SELECT explode(split(regexp_replace(regexp_replace('[{"website":"www.baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]', '\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),'\\;')) as json) test # www.baidu.com 百度
2.6、比较两个表数组
select *
from (
SELECT
parent_pin,
pin
FROM
u
WHERE
--start_date <= '${date}' and end_date > '${date}'
--and 
source < 50
AND length(trim(pin)) > 0
AND length(trim(parent_pin)) > 0
GROUP BY
parent_pin, pin
)a
full join (
SELECT
parent_pin,
pin
FROM
u
WHERE
start_date <= '2021-03-21' and end_date > '2021-03-21'
and 
source < 50
AND length(trim(pin)) > 0
AND length(trim(parent_pin)) > 0
GROUP BY
parent_pin, pin
)b
on a.parent_pin = b.parent_pin
and a.pin = b.pin
where a.parent_pin is null or b.parent_pin is null