1--建表语句 2createtable score(sid string, class string, score int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable score;
5 6--数据:学生id、班级、分数 7 s1 A 89 8 s2 C 88 9 s3 A 9210 s4 A 8911 s5 B 9012 s6 B 8613 s7 C 9214 s8 C 9015 s9 A 8516 s10 C 8617 s11 B 8618 s12 B 86
1--sql语句 2select*from (
3select class, sid, score, dense_rank() over(partition by class orderby score desc) as rank
4from score
5) t1
6where t1.rank <4;
7 8--结果 9t1.class t1.sid t1.score t1.rank
10 A s3 92111 A s4 89212 A s1 89213 A s9 85314 B s5 90115 B s11 86216 B s12 86217 B s6 86218 C s7 92119 C s8 90220 C s2 883
二、行列转换问题
(1)一行转换为多行 -- lateral view explode
将每个电影的分类列表拆分出单个分类
1--建表语句2createtable movie(movie_name string, category string)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable movie;
56--数据:电影名、分类7让子弹飞 动作、年代
8长江七号 科幻
9 大进军 战争
1--建表语句 2createtable person(name string, age int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable person;
5 6--数据:姓名、年龄 7 A 20 8 B 18 9 C 2010 D 24
1--sql语句 2select concat_ws('|', collect_set(name)) as name_list, age
3from person
4groupby age;
5 6--结果 7name_list age
8 B 18 9 A|C 2010 D 24
将可枚举的值作为新的字段
1--建表语句 2createtable game(yearint, class string, score int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable game;
5 6--数据:年份、班级、得分 72020 ClassA 10 82020 ClassB 12 92020 ClassC 9102021 ClassA 12112021 ClassB 8
1--sql语句 2selectyear,
3max(casewhen class ='ClassA'then score end) as A,
4max(casewhen class ='ClassB'then score end) as B,
5max(casewhen class ='ClassC'then score end) as C
6from game
7groupbyyear;
8 9--结果10year a b c
11202010129122021128NULL
连续性问题(重点)
连续3天登录的用户
1--建表语句 2createtable login(uid int, dt date, status int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable login;
5 6--数据:用户id、日期、是否登录 712022-03-010 812022-03-021 912022-03-0311012022-03-0411112022-03-0511212022-03-0601312022-03-0711422022-03-0101522022-03-0211622022-03-0311722022-03-0401822022-03-0501922022-03-0612022022-03-0712132022-03-0112232022-03-0212332022-03-0312432022-03-0402532022-03-0512632022-03-0612732022-03-071
1--sql语句 2select* 3from (
4--查询用户在起始日期及其之后的连续登录次数 5select t1.uid, min(t1.dt) as start_day, count(*) as day_count
6from (
7--按照用户分组,按照日期递增排序,计算连续登录操作的锚定日期 8select uid, dt, date_sub(dt, row_number() over(partition by uid orderby dt asc)) as diff
9from login
10where status =1--筛选登录的日期11 ) t1
12groupby t1.uid, t1.diff --按照用户、锚定日期分组13) t2
14where t2.day_count >2; --筛选出至少3次的连续登录操作1516--结果17t2.uid t2.start_day t2.day_count
1812022-03-0241932022-03-0132032022-03-053
连续3年获得冠军的队伍
--建表语句createtable champion(yearint, team string)
row format delimited fields terminated by'';
load data local inpath '/temp/sql.txt'intotable champion;
--数据:年份、获得冠军的队伍2000 Sun
2001 Lakers
2002 Rockets
2003 Rockets
2004 Rockets
2005 Spurs
2006 Spurs
2007 Sun
2008 Lakers
2009 Lakers
2010 Lakers
2011 Lakers
2012 Warriors
2013 Warriors
2014 Warriors
2015 Heat
2016 Warriors
2017 Cavaliers
2018 Warriors
2019 Sun
2020 Warriors
2021 Warriors
2022 Raptors
1--sql语句 2--查询队伍在起始年份及其之后的连续获得冠军的次数 3select t1.team, min(t1.year) as start_year, count(*) ascount 4from (
5--按照队伍分组,按照年份递增排序,计算队伍获得冠军的锚定年份 6selectyear, team, year- row_number() over(partition by team orderbyyearasc) as diff
7from champion
8) t1
9groupby t1.team, t1.diff --按照队伍、锚定年份分组10havingcount>2; --筛选出至少3次的连续获得冠军1112--结果13 t2.team t2.start_year t2.count14 Lakers 2008415 Rockets 2002316 Warriors 20123
间隔连续问题
给定每天登录游戏的所有用户,返回每个用户连续登录的最长天数,间隔一天的两次登录也可以看作连续登录
1--建表语句 2createtable login(id int, dt date)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable login;
5 6--数据:用户id、登录日期 710012022-07-01 810022022-07-01 910032022-07-011010012022-07-021110022022-07-031210012022-07-041310022022-07-051410032022-07-051510012022-07-071610022022-07-071710032022-07-081810012022-07-091910022022-07-09
1--sql语句 2--按照用户id分组,查询每个用户连续登录的最长天数 3select id, max(day_count) as day_count
4from (
5--按照用户id、flag字段分组,计算登录日期的最大值、最小值之差,表示一次连续登录的天数 6select id, datediff(max(dt), min(dt)) +1as day_count
7from (
8--按照用户id分组,按照登录日期递增排序,判断当前登录、上次登录的日期之差是否大于2,累计求和, 9--使得属于连续登录的所有日期具有相同取值的flag字段10select id, dt,
11sum(if(diff >2, 1, 0)) over(partition by id orderby dt asc) as flag
12from (
13--查询当前登录、上次登录的日期之差diff14select id, dt,
15casewhen pre_dt isnullthen016elsedatediff(dt, pre_dt) endas diff
17from (
18--按照用户id分组,按照登录日期递增排序,查询当前登录日期、上次登录日期19select id, dt,
20 lag(dt) over(partition by id orderby dt asc) as pre_dt
21from login
22 ) t1
23 ) t2
24 ) t3
25groupby id, flag
26) t4
27groupby id;
2829--结果30id day_count
311001432100293310031
1--建表语句 2createtable click(id string, time int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable click;
5 6--数据:用户id、点击浏览时间 7 a 1001 8 a 1005 9 a 102010 a 104811 a 107812 a 123013 a 124514 a 127015 a 128216 b 110117 b 113218 b 115619 b 118020 b 120021 b 123022 b 134523 b 137024 b 1400
1--sql语句 2--查询用户在每个会话中的起始时间点、点击浏览次数、浏览时长 3select id, min(time) as start_time, count(*) ascount, max(time) -min(time) as total_time
4from (
5--分组排序后,从上到下计算value列的累加和。如果求和结果相同,则表示属于相同的会话 6select t2.id, t2.time, sum(t2.value) over(partition by id orderby time asc) as stage
7from (
8--如果与前一次点击的时间之差超过30,则value列为1,否则为0 9--value列为1表示这次点击属于一个新的会话,为0表示这次点击与前一次属于相同的会话10select id, time, diff, casewhen nvl(diff, 9999) >30then1else0end value
11from (
12--按照用户id分组,按照点击浏览时间递增排序,计算前后两次点击的时间之差13select id, time, time - lag(time) over(partition by id orderby time asc) as diff
14from click
15 ) t1
16 ) t2
17) t3
18groupby id, stage;
1920--结果21 id start_time count total_time
22 a 100157723 a 123045224 b 11011025 b 113259826 b 1345355
--建表语句createtable maybe_friend(id string, friends string)
row format delimited fields terminated by'';
load data local inpath '/temp/sql.txt'intotable maybe_friend;
--数据:用户id、好友id列表A B,C,D
B A,C,E
C A,B,D,E,F
D A,C,F
E B,C
F C,D
1--sql语句 2--创建临时表,将好友关系分解为最细粒度 3with friend as (
4select id, friend from common_friend lateral view explode(split(friends, ',')) tempas friend)
5 6--将具有至少两个共同好友的临时表与好友关系表进行连接,如果临时表的两个用户是好友关系,则在好友关系表中存在对应记录,否则不存在对应记录, 7--表示两者是可能好友 8select t2.id1, t2.id2
9from (
10--查询具有至少两个共同好友的任意两个用户11select t1.id1, t1.id2
12from (
13--将好友关系表与自身进行连接,查询任意两个用户具有的共同好友14select a.id as id1, b.id as id2, a.friend
15from friend a
16join friend b
17on a.friend = b.friend
18where a.id < b.id
19 ) t1
20groupby t1.id1, t1.id2
21havingcount(t1.friend) >=222) t2
23leftjoin friend
24on t2.id1 = friend.id
25and t2.id2 = friend.friend
26where friend.id isnull--排除真实好友,筛选可能好友27;
2829--结果30t2.id1 t2.id2
31A E
32A F
33 B D
1--建表语句 2createtable shop(id string, product int)
3 row format delimited fields terminated by'';
4load data local inpath '/temp/sql.txt'intotable shop;
5 6--数据:用户id、商品id 7 A 1 8 A 2 9 A 110 A 311 B 212 B 313 B 414 B 515 B 216 C 117 C 218 C 119 D 120 D 321 D 6
1--sql语句 2--按照用户、商品进行分组、去重 3withtempas (
4select id, product from shop groupby id, product)
5 6--将已购买与推荐购买的临时表与已购买表进行连接,如果临时表的商品已购买,则在已购买表中存在对应记录,否则不存在对应记录,表示推荐商品 7select t4.id1 as id, t4.product
8from (
9--查询每个用户已购买与推荐购买的商品10select t3.id1, t3.product
11from (
12--查询每个用户、以及具有相同购买倾向的其余用户、其余用户已购买的商品13select t2.id1, t2.id2, temp.product
14from (
15--查询已购买至少两个相同商品的任意两个用户16select t1.id1, t1.id2
17from (
18--查询已购买相同商品的任意两个用户19select a.id as id1, b.id as id2, a.product
20fromtemp a
21jointemp b
22on a.product = b.product
23and a.id != b.id
24 ) t1
25groupby t1.id1, t1.id2
26havingcount(t1.product) >=227 ) t2
28jointemp29on t2.id2 =temp.id
30 ) t3
31groupby t3.id1, t3.product
32) t4
33leftjointemp34on t4.product =temp.product
35and t4.id1 =temp.id --相同用户购买相同商品36wheretemp.product isnull--排除已购买商品,筛选推荐商品37;
3839--结果40id t4.product
41 A 442 A 543 A 644 B 145 C 346 D 2
--建表语句createtable login_action(uid string, login_date int)
row format delimited fields terminated by'';
load data local inpath '/temp/sql.txt'intotable login_action;
--数据:用户id、登录日期
d 20220321
e 20220321
f 20220321
a 20220322
b 20220322
d 20220322
a 20220323
b 20220323
c 20220323
a 20220324
b 20220324
c 20220324
a 20220325
b 20220325
c 20220325
f 20220325
1--新增用户:需要一张全量的用户表,一张某日的登录行为表 2--将日期20220321的登录用户作为初始表,表示已知的用户,然后查询日期20220322的新增用户 3--登录行为表与全量用户表进行关联,行为表中存在、用户表中不存在的记录表示这个日期的新增用户 4--查询出每个日期的新增用户后,需要追加到全量的用户表中 5 6--全量的用户表:表示用户成为新增用户的日期 7createtable user_add(uid string, add_date int);
8insertintotable user_add (select*from login_action where login_date =20220321);
910select t1.uid, t1.login_date
11from login_action t1
12leftjoin user_add t2
13on t1.uid = t2.uid
14where t1.login_date =20220322--查询日期20220322的新增用户15and t2.uid isnull--查询在用户表中不存在的记录,即这个日期的新增用户16;
1718--20220322的新增用户19t1.uid t1.add_date
20 a 2022032221 b 202203222223--所有日期的新增用户24user_add.uid user_add.add_date
25 d 2022032126 e 2022032127 f 2022032128 a 2022032229 b 2022032230 c 20220323
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 提示词工程——AI应用必不可少的技术
· 地球OL攻略 —— 某应届生求职总结
· 字符编码:从基础到乱码解决
· SpringCloud带你走进微服务的世界