牛客MySQL真题练习1(156-179)
- 计算2021年里有播放记录的每个视频的完播率(结果保留三位小数),并按完播率降序排序
注:视频完播率是指完成播放次数占总播放次数的比例。简单起见,结束观看时间与开始播放时间的差>=视频时长时,视为完成播放。
/*
SELECT video_id,
ROUND(SUM(real_play) / COUNT(video_id), 3) AS avg_comp_play_rate
FROM (
SELECT t1.video_id,
IF(TIMESTAMPDIFF(SECOND, t1.start_time, t1.end_time) >= t2.duration, 1, 0) AS real_play
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE YEAR(t1.start_time) = 2021
) a
GROUP BY video_id
ORDER BY avg_comp_play_rate DESC
*/
/*
avg()真的是求某种率的神器。理解:
单就这道题目来说,avg本质上就是 sum(某列)/个数。将完成播放的(开始时间与结束时间的间隔大于等于视频时长)记为1,没播放完的记为0,sum()这一列的时候,只有完成播放的才会记录进去。
举个例子一组数据(1 0 1 1 1 0)1是完成播放的,0是没完成播放的,完成播放的次数为3次,全部播放次数为5次,完播率为3/5;avg这组数据的时候,和为3,个数为5 也是3/5。
*/
SELECT video_id,
ROUND(AVG(IF(TIMESTAMPDIFF(SECOND, t1.start_time, t1.end_time) >= t2.duration, 1, 0)), 3) AS avg_comp_play_rate
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE YEAR(t1.start_time) = 2021
GROUP BY video_id
ORDER BY avg_comp_play_rate DESC
- 计算各类视频的平均播放进度,将进度大于60%的类别输出。结果保留两位小数,并按播放进度倒序排序。
注:
播放进度=播放时长÷视频时长*100%,当播放时长大于视频时长时,播放进度均记为100%。
SELECT tag,
CONCAT(ROUND(AVG(IF(TIMESTAMPDIFF(SECOND, start_time, end_time) > duration, 100, TIMESTAMPDIFF(SECOND, start_time, end_time) / duration * 100)), 2), '%') AS avg_play_progress
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
GROUP BY tag
HAVING avg_play_progress > 60
-- having replace(avg_play_progress,'%','') > 60
-- HAVING avg_play_progress > '60.00%' # 如果是字符串一定要保留两位小数,与数据格式一致
ORDER BY avg_play_progress DESC
- 统计在有用户互动的最近一个月(按包含当天在内的近30天算,比如10月31日的近30天为10.2~10.31之间的数据)中,每类视频的转发量和转发率(保留3位小数)。结果按转发率降序排序。
注:转发率=转发量÷播放量
SELECT tag, SUM(if_retweet) AS retweet_cnt, ROUND(AVG(if_retweet), 3) AS retweet_rate -- ROUND(SUM(if_retweet) / COUNT(video_id), 3) AS retweet_rate
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE DATEDIFF(DATE((SELECT MAX(start_time) FROM tb_user_video_log)), DATE(start_time)) < 30
GROUP BY tag
ORDER BY retweet_rate DESC
/*
为什么 DATE 后需要加两个括号?
第一层括号是 DATE(),第二层括号是子查询,即 SELECT 语句
*/
# 历史经验:采用 DATEDIFF(DATE(start_time), DATE((SELECT MAX(start_time) FROM tb_user_video_log))) < 30 可以通过用例但是提交错误,采用 DATEDIFF(DATE((SELECT MAX(start_time) FROM tb_user_video_log)), DATE(start_time)) < 30 就正确了,注意前后关系
- 计算2021年里每个创作者每月的涨粉率及截止当月的总粉丝量
注:
涨粉率=(加粉量 - 掉粉量) / 播放量。结果按创作者ID、总粉丝量升序排序。if_follow-是否关注为1表示用户观看视频中关注了视频创作者,为0表示此次互动前后关注状态未发生变化,为2表示本次观看过程中取消了关注。
SELECT author, month,
ROUND(new_fans / total_play, 3) AS fans_growth_rate,
SUM(new_fans) OVER (PARTITION BY author ORDER BY month) AS total_fans -- 看到累计就用聚合窗口函数
FROM (
SELECT t2.author AS author, DATE_FORMAT(t1.start_time, '%Y-%m') AS month, COUNT(*) AS total_play,
SUM(CASE t1.if_follow WHEN 1 THEN 1
WHEN 2 THEN -1
ELSE 0
END) AS new_fans
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE YEAR(t1.start_time) = 2021
GROUP BY author, month
) a
ORDER BY author, total_fans
/* 计算新增粉丝数其实只需要转化一下 if_follow = 2 的情况,即 IF(t1.if_follow = 2, -1, t1.if_follow) AS new_fans */
- 统计2021年国庆头3天每类视频每天的近一周总点赞量和一周内最大单天转发量,结果按视频类别降序、日期升序排序。假设数据库中数据足够多,至少每个类别下国庆头3天及之前一周的每天都有播放记录。
SELECT tag, dt, sum_like_cnt_7d, max_retweet_cnt_7d
FROM (
SELECT t2.tag,
DATE(t1.start_time) AS dt,
SUM(SUM(t1.if_like)) OVER (PARTITION BY t2.tag ORDER BY DATE(t1.start_time) ROWS 6 PRECEDING) AS sum_like_cnt_7d, -- SUM(t1.if_like)是一天的点赞量,多天累计需要在用一次SUM()函数
MAX(SUM(t1.if_retweet)) OVER (PARTITION BY t2.tag ORDER BY DATE(t1.start_time) ROWS 6 PRECEDING) AS max_retweet_cnt_7d
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE DATE(t1.start_time) BETWEEN DATE_SUB('2021-10-01', interval 6 day)
AND timestampadd(day, 2, '2021-10-01')
# 可以直接写为 WHERE DATE(t1.start_time) BETWEEN '2021-09-25' AND '2021-10-03'
GROUP BY t2.tag, dt
)a
WHERE dt BETWEEN '2021-10-01' AND '2021-10-03'
ORDER BY tag DESC, dt
/* 思路:
1. 计算出每天的点赞量,转发量作临时表,这里的日期应从'2021-09-25' 到 '2021-10-03'
2. 每天的近一周总点赞量和一周内最大单天转发量 --> 开窗函数
*/
/*
1. dt升序排序,向前6行
partition by tag order by dt rows 6 preceding
partition by tag order by dt rows between 6 preceding and CURRENT row
2. dt降序排列,当前行往后6行
partition by tag order by dt desc rows between CURRENT row and 6 following
*/
- 找出近一个月发布的视频中热度最高的top3视频。结果中热度保留为整数,并按热度降序排序。
注:
热度=(a*视频完播率+b*点赞数+c*评论数+d*转发数)*新鲜度;
新鲜度=1/(最近无播放天数+1);
当前配置的参数a,b,c,d分别为100、5、3、2。
最近播放日期以end_time-结束观看时间为准,假设为T,则最近一个月按[T-29, T]闭区间统计。
SELECT video_id, ROUND((100 * allplay_rate + 5 * like_cnt + 3 * comment_cnt + 2 * retweet_cnt) * 1 / (noplay_days + 1)) AS hot_index
FROM (
SELECT t1.video_id,
AVG(IF(TIMESTAMPDIFF(SECOND, t1.start_time, t1.end_time) >= t2.duration, 1, 0)) AS allplay_rate, -- 用超级好用的 AVG() 函数,相当于 SUM(IF(TIMESTAMPDIFF(SECOND, start_time, end_time) >= duration, 1, 0)) / COUNT(t1.video_id) AS allplay_rate
SUM(t1.if_like) AS like_cnt,
COUNT(t1.comment_id) AS comment_cnt, # 不用判断NULL
SUM(t1.if_retweet) AS retweet_cnt,
DATEDIFF((SELECT MAX(end_time) FROM tb_user_video_log), MAX(t1.end_time)) AS noplay_days -- 最近无播放天数。注意子查询语句要括起来,且 FROM 子句不能直接用 t1
FROM tb_user_video_log t1 LEFT JOIN tb_video_info t2 USING(video_id)
WHERE DATEDIFF((SELECT MAX(end_time) FROM tb_user_video_log), t2.release_time) <= 29 -- 视频发布时间离最近一次播放时间在一个月内
GROUP BY t1.video_id
) a
ORDER BY hot_index DESC
LIMIT 3
/*
思路:首先算出每个视频每天的完播,点赞数,评论数,转发数,然后计算每个视频的热度,最后取TOP3排序
最近无播放天数 :DATEDIFF((select max(end_time) from tb_user_video_log), max(end_time)),其中:
select max(end_time) from tb_user_video_log 是取整个表格中最近的日期;
max(end_time)后是有group by video_id的,取的是每个视频的最近日期
*/
- 统计2021年11月每天的人均浏览文章时长(秒数),结果保留1位小数,并按时长由短到长排序。
SELECT dt, ROUND(view_len_sec / num, 1) AS avg_view_len_sec
FROM (
SELECT DATE(in_time) AS dt, COUNT(DISTINCT uid) AS num,
SUM(TIMESTAMPDIFF(SECOND, in_time, out_time)) AS view_len_sec
FROM tb_user_log
WHERE DATE_FORMAT(in_time, '%Y-%m') = '2021-11'
# YEAR(in_time) = 2021 AND MONTH(in_time) = 11
# 把日期看作字符串也可以写为 SUBSTRING(in_time, 1, 7)
AND artical_id > 0 -- 题目提示 artical_id 为0时表示用户在非文章内容页,所以要考虑这一隐藏条件
GROUP BY dt
) a
ORDER BY avg_view_len_sec
- 统计每篇文章同一时刻最大在看人数,如果同一时刻有进入也有离开时,先记录用户数增加再记录减少,结果按最大人数降序。
SELECT artical_id, MAX(user_cnt) AS max_uv # 3. 求最大的累计和
FROM (
SELECT
artical_id,
SUM(num) OVER (PARTITION BY artical_id ORDER BY read_time, num DESC) AS user_cnt # 2. 利用窗口函数对计数求累计和,先按时间排序再按计数排序
FROM (
( # 1. 将用户的进入时间单独拎出来,同时记为 1
SELECT artical_id, in_time AS read_time, 1 AS num
FROM tb_user_log
WHERE artical_id > 0)
UNION ALL
( # 1. 离开时间单独拎出来,同时记为 -1
SELECT artical_id, out_time AS read_time, -1 AS num
FROM tb_user_log
WHERE artical_id > 0)
) a
) b
GROUP BY artical_id
ORDER BY max_uv DESC
/*
1. 将用户的进入时间单独拎出来,同时记为 1;离开时间单独拎出来,同时记为 -1,这样就聚合这两个表,按照时间排序,意思就是:进去一个加1,离开一个减1
2. 利用窗口函数对计数(1或者-1)求累计和,因为题目规定:同一时间有就有出的话先算进来的后算出去的,所以排序的时候就要看好了先按时间排序,再按计数排序
3. 在每个分组里面去求最大的累积和就是最多同时在线的人数
*/
- 统计2021年11月每天新用户的次日留存率(保留2位小数)
注:次日留存率为当天新增的用户数中第二天又活跃了的用户数占比。
如果in_time-进入时间和out_time-离开时间跨天了,在两天里都记为该用户活跃过,结果按日期升序。
SELECT a.dt AS dt, ROUND(COUNT(b.uid) / COUNT(a.uid), 2) AS uv_left_rate
FROM ( # 每个用户的首次登陆日期(按uid聚合的最小日期)
SELECT uid, MIN(DATE(in_time)) AS dt
FROM tb_user_log
GROUP BY uid
) a
LEFT JOIN ( # 每个用户的全部活跃日期
SELECT uid, DATE(in_time) AS dt
FROM tb_user_log
UNION
SELECT uid, DATE(out_time) AS dt
FROM tb_user_log
) b
ON a.uid = b.uid # 依旧有登陆记录
AND b.dt = DATE_ADD(a.dt, INTERVAL 1 DAY) # 每个用户首次登陆日期的第二天
WHERE DATE_FORMAT(a.dt, '%Y-%m') = '2021-11'
GROUP BY dt
ORDER BY dt
/* 从日期减去指定间隔 DATE_SUB(date,INTERVAL expr type) */
- 统计活跃间隔对用户分级后,各活跃等级用户占比,结果保留两位小数,且按占比降序排序。
注:用户等级标准简化为:忠实用户(近7天活跃过且非新晋用户)、新晋用户(近7天新增)、沉睡用户(近7天未活跃但更早前活跃过)、流失用户(近30天未活跃但更早前活跃过)。
假设今天就是数据中所有日期的最大值。
近7天表示包含当天T的近7天,即闭区间[T-6, T]。
WITH tt AS(
SELECT a.uid, new_time, 7day_new_time,
(CASE WHEN TIMESTAMPDIFF(DAY, new_time, DATE((SELECT MAX(DATE(in_time)) FROM tb_user_log))) > 6 AND 7day_new_time IS NOT NULL THEN "忠实用户"
WHEN TIMESTAMPDIFF(DAY, new_time, DATE((SELECT MAX(DATE(in_time)) FROM tb_user_log))) < 7 AND 7day_new_time IS NOT NULL THEN "新晋用户"
WHEN TIMESTAMPDIFF(DAY, new_time, DATE((SELECT MAX(DATE(in_time)) FROM tb_user_log))) BETWEEN 7 AND 29 AND 7day_new_time IS NULL THEN "沉睡用户"
WHEN TIMESTAMPDIFF(DAY, new_time, DATE((SELECT MAX(DATE(in_time)) FROM tb_user_log))) > 29 THEN "流失用户"
END
) AS user_grade
FROM ( # 首次登入时间表
SELECT uid, MIN(DATE(in_time)) AS new_time
FROM tb_user_log
GROUP BY uid
) a
LEFT JOIN ( # 近7天内用户首次登入时间表
SELECT uid, MIN(DATE(in_time)) AS 7day_new_time
FROM tb_user_log
WHERE TIMESTAMPDIFF(DAY, DATE(in_time), DATE((SELECT MAX(DATE(out_time)) FROM tb_user_log))) < 7
GROUP BY uid
) b
USING(uid)
)
SELECT user_grade, ROUND(COUNT(uid) / (SELECT COUNT(DISTINCT uid) FROM tb_user_log), 2)ratio
FROM tt
GROUP BY user_grade
ORDER BY ratio DESC
/*
先做一个临时表,里面有uid,用户首次登入时间,近7天内用户首次登入时间信息,用户分级:
a) 若首次登入时间在6天前,且近7天内首次登入时间非空(有登入过)则为忠实用户
b) 若首次登入时间在近6天内,且近7天内首次登入时间非空(有登入过)则为新晋用户
c) 若首次登入时间在6-29天前,且近7天首次登入时间为空(未登入过)则为沉睡用户
d) 若首次登入时间在29天前,为流失用户(首次登入的时候肯定活跃过了)
*/
/*
简洁一点的版本
with temp as
(
select
(
case when datediff('2021-11-04',first_in)<7 then '新晋用户'
when datediff('2021-11-04',first_in)>=7 and
datediff('2021-11-04',last_in)<7 then '忠实用户'
when datediff('2021-11-04',last_in)>=30 then '流失用户'
else '沉睡用户' end
) as user_grade
from
(
select
uid,
date(min(in_time)) as first_in,
date(max(out_time)) as last_in
from tb_user_log
group by uid
) t1
)
select
user_grade,
round(count(user_grade)/
(select count(*) from temp)
,2)as ratio
from temp
group by user_grade
order by ratio desc
*/
- 统计每天的日活数及新用户占比
注:新用户占比=当天的新用户数÷当天活跃用户数(日活数)。
如果in_time-进入时间和out_time-离开时间跨天了,在两天里都记为该用户活跃过。
新用户占比保留2位小数,结果按日期升序排序。
# 提取每个用户的所有活动日期,作为日活表
WITH tmp AS(
SELECT uid, dt
FROM (
SELECT uid, DATE(in_time) AS dt
FROM tb_user_log
UNION
SELECT uid, DATE(out_time) AS dt
FROM tb_user_log
ORDER BY dt -- 可以先排好序
) tt
)
SELECT dt, COUNT(uid) AS dau, IFNULL(ROUND(SUM(IF(new_dt = dt, 1, 0)) / COUNT(uid), 2), 0) AS uv_new_ratio -- 注意值为NULL的特殊情况
FROM (
SELECT uid, dt # 为什么 dau 的计算方式不能写在这里?
FROM tmp
) t1
LEFT JOIN (
SELECT uid, MIN(dt) AS new_dt -- 通过判断用户第一次登录时间和当前时间是否相等来分析是否为新增用户
FROM tmp
GROUP BY uid
) t2
USING(uid)
GROUP BY dt
ORDER BY dt
/*
简洁版本
SELECT dt,count(distinct uid)dau,round(sum(if(dt=min_date,1,0))/count(distinct uid),2)uv_new_ratio
FROM
(SELECT uid,date(in_time) dt FROM tb_user_log
UNION
SELECT uid,date(out_time) dt FROM tb_user_log)a
LEFT JOIN
(SELECT uid,min(date(in_time)) min_date FROM tb_user_log GROUP BY uid)b
USING(uid)
GROUP BY dt
ORDER BY dt;
*/
- 计算每个用户2021年7月以来每月获得的金币数(该活动到10月底结束,11月1日开始的签到不再获得金币)。结果按月份、ID升序排序。
注:如果签到记录的in_time-进入时间和out_time-离开时间跨天了,也只记作in_time对应的日期签到了。
SELECT uid, DATE_FORMAT(dt, '%Y%m') AS month, SUM(day_coin) AS coin
FROM ( # 3. 计算每天赚得金币数
SELECT uid, dt, (
CASE MOD(RANK() OVER(PARTITION BY uid, rank_day ORDER BY dt), 7) -- 每天赚得金币数
WHEN 0 THEN 7 -- 每签到第7天7枚
WHEN 3 THEN 3 -- 每签到第3天3枚
ELSE 1 -- 其他天数1枚
END) AS day_coin
FROM ( # 2. 得到组内分组标签
SELECT uid, dt,
DATE_SUB(dt, INTERVAL RANK() OVER(PARTITION BY uid ORDER BY dt) -- 每个用户,把日期标上序号
DAY) AS rank_day -- 用日期减序号,得到连续分组
FROM ( # 1. 取出有效记录:过滤不签到的和重复的用户
SELECT DISTINCT uid, DATE(in_time) AS dt -- 用户去重,过滤重复签到的情况
FROM tb_user_log
WHERE artical_id = 0
AND sign_in = 1
AND DATE(in_time) BETWEEN '2021-07-07' AND '2021-10-31'
) t1
) t2
) t3
GROUP BY uid, month
ORDER BY month, uid
- 请计算商城中2021年每月的GMV,输出GMV大于10w的每月GMV,值保留到整数。
注:GMV为已付款订单和未付款订单两者之和。结果按GMV升序排序。
SELECT DATE_FORMAT(event_time, '%Y-%m') AS month, ROUND(SUM(total_amount)) AS GMV
FROM tb_order_overall
WHERE status IN (0, 1)
AND YEAR(event_time) = 2021
GROUP BY month
HAVING GMV > 100000 -- 筛选GMV大于10w的分组
ORDER BY GMV
- 请统计2021年10月每个有展示记录的退货率不大于0.5的商品各项指标,
注:
商品点展比=点击数÷展示数;
加购率=加购数÷点击数;
成单率=付款数÷加购数;退货率=退款数÷付款数,
当分母为0时整体结果记为0,结果中各项指标保留3位小数,并按商品ID升序排序。
SELECT
product_id,
ROUND(SUM(if_click) / COUNT(product_id), 3) AS ctr,
ROUND(IFNULL(SUM(if_cart) / SUM(if_click), 0), 3) AS cart_rate,
ROUND(IFNULL(SUM(if_payment) / SUM(if_cart), 0), 3) AS payment_rate,
ROUND(IFNULL(SUM(if_refund) / SUM(if_payment), 0), 3) AS refund_rate
FROM tb_user_event
WHERE DATE_FORMAT(event_time, '%Y%m') = 202110
GROUP BY product_id
HAVING refund_rate <= 0.5
ORDER BY product_id
- 请计算2021年10月以来店铺901中商品毛利率大于24.9%的商品信息及店铺整体毛利率。
注:商品毛利率=(1-进价/平均单件售价)100%;
店铺毛利率=(1-总进价成本/总销售收入)100%。
结果先输出店铺毛利率,再按商品ID升序输出各商品毛利率,均保留1位小数。
/*
不知道考虑status后哪里错了
SELECT product_id, profit_rate
FROM ((
SELECT
"店铺汇总" AS product_id,
CONCAT(ROUND((1 - SUM(t1.in_price * t3.cnt) / SUM(t3.price * t3.cnt * IF(t2.status = 2, -1, t2.status))) * 100, 1), '%') AS profit_rate -- 判断status的版本,WHERE条件语句就不需要加上status=1,更贴合实际
FROM tb_order_detail t3 LEFT JOIN tb_order_overall t2 USING(order_id)
LEFT JOIN tb_product_info t1 USING(product_id)
WHERE DATE_FORMAT(t2.event_time, '%Y%m') >= 202110
AND t1.shop_id = 901)
UNION ALL (
SELECT
t1.product_id,
IFNULL(CONCAT(ROUND((1 - in_price / (SUM(t3.price * IF(t2.status = 2, -1, t2.status)) / SUM(t3.cnt * IF(t2.status = 1, 1, 0)))) * 100, 1), '%'), 0) AS profit_rate
FROM tb_order_detail t3 LEFT JOIN tb_order_overall t2 USING(order_id)
LEFT JOIN tb_product_info t1 USING(product_id)
WHERE DATE_FORMAT(t2.event_time, '%Y%m') >= 202110
AND t1.shop_id = 901
GROUP BY t1.product_id
HAVING profit_rate > 24.9
ORDER BY t1.product_id)
) tt
*/
SELECT product_id, profit_rate
FROM ((
SELECT
"店铺汇总" AS product_id,
CONCAT(ROUND((1 - SUM(t1.in_price * t3.cnt) / SUM(t3.price * t3.cnt)) * 100, 1), '%') AS profit_rate
FROM tb_order_detail t3 LEFT JOIN tb_order_overall t2 USING(order_id)
LEFT JOIN tb_product_info t1 USING(product_id)
WHERE DATE_FORMAT(t2.event_time, '%Y%m') >= 202110
AND t1.shop_id = 901
AND t2.status = 1)
UNION ALL (
SELECT
t1.product_id,
CONCAT(ROUND((1 - AVG(t1.in_price) / AVG(t3.price)) * 100, 1), '%') AS profit_rate
FROM tb_order_detail t3 LEFT JOIN tb_order_overall t2 USING(order_id)
LEFT JOIN tb_product_info t1 USING(product_id)
WHERE DATE_FORMAT(t2.event_time, '%Y%m') >= 202110
AND t1.shop_id = 901
AND t2.status = 1
GROUP BY t1.product_id
HAVING profit_rate > 24.9
ORDER BY t1.product_id)
) tt
- 请统计零食类商品中复购率top3高的商品。保留3位小数,并按复购率倒序、商品ID升序排序
注:复购率指用户在一段时间内对某商品的重复购买比例,复购率越大,则反映出消费者对品牌的忠诚度就越高,也叫回头率
此处定义:某商品复购率 = 近90天内购买它至少两次的人数 ÷ 购买它的总人数
近90天指包含最大日期(记为当天)在内的近90天。
SELECT
product_id,
ROUND(SUM(IF(repurchase_cnt > 1, 1, 0)) / COUNT(DISTINCT uid), 3) AS repurchase_rate
FROM (
SELECT
t1.product_id,
t2.uid,
ROW_NUMBER() OVER (PARTITION BY t1.product_id, t2.uid ORDER BY t2.event_time) AS repurchase_cnt -- 获取购买次数排序
FROM tb_order_detail t3 LEFT JOIN tb_order_overall t2 USING(order_id)
LEFT JOIN tb_product_info t1 USING(product_id)
WHERE t1.tag = "零食"
AND t2.status = 1
AND DATEDIFF((SELECT MAX(event_time) FROM tb_order_overall), t2.event_time) < 90 -- 筛选90天内的订单。,“近90天”是指距离tb_order_overall表中最大日期的近90天(包含最大日期当天),而非每一件商品各自的近90天
) tt
GROUP BY product_id -- 按商品分组
ORDER BY repurchase_rate DESC, product_id
LIMIT 3
/*
不用窗口函数的方法:
SELECT product_id, ROUND(SUM(repurchase) / COUNT(repurchase), 3) AS repurchase_rate
FROM (
SELECT uid, product_id, IF(COUNT(uid) > 1, 1, 0) AS repurchase
FROM tb_order_detail
JOIN tb_order_overall USING(order_id)
JOIN tb_product_info USING(product_id)
WHERE tag="零食" AND event_time >= (
SELECT DATE_SUB(MAX(event_time), INTERVAL 89 DAY)
FROM tb_order_overall
)
GROUP BY uid, product_id
) as t_uid_product_info
GROUP BY product_id
ORDER BY repurchase_rate DESC, product_id
LIMIT 3
*/
- 请计算2021年10月商城里所有新用户的首单平均交易金额(客单价)和平均获客成本(保留一位小数)。
注:订单的优惠金额 = 订单明细里的{该订单各商品单价×数量之和} - 订单总表里的{订单总金额} 。
SELECT
ROUND(AVG(total_amount), 1) AS avg_amount,
ROUND(AVG(cost - total_amount), 1) AS avg_cost
FROM(
SELECT
uid,
event_time, -- necessary
total_amount,
SUM(price * cnt) OVER (PARTITION BY order_id) AS cost,
ROW_NUMBER() OVER (PARTITION BY uid ORDER BY event_time) AS ranking
FROM tb_order_detail LEFT JOIN tb_order_overall USING(order_id)
WHERE status = 1
) tt
WHERE ranking = 1
AND DATE_FORMAT(event_time, '%Y%m') = 202110
/*
with new_tab as (select uid,a.order_id,event_time,total_amount,price,cnt
,row_number()over(partition by uid order by event_time) as rank2
from tb_order_overall a
left join tb_order_detail b on a.order_id=b.order_id)
select round(avg(amount),1),round(avg(cost),1)
from
(select uid
,round(sum(total_amount)/count(uid)) as amount
,round(sum(price*cnt)-sum(total_amount)/count(uid)) as cost
from new_tab
where (uid,order_id) in (select uid,order_id from new_tab where substring(event_time,1,7)='2021-10' and rank2=1)
group by uid,order_id) t
*/
- 计算店铺901在2021年国庆头3天的7日动销率和滞销率,结果保留3位小数,按日期升序排序。
注:动销率定义为店铺中一段时间内有销量的商品占当前已上架总商品数的比例(有销量的商品/已上架总商品数)。
滞销率定义为店铺中一段时间内没有销量的商品占当前已上架总商品数的比例。(没有销量的商品/已上架总商品数)。
只要当天任一店铺有任何商品的销量就输出该天的结果,即使店铺901当天的动销率为0。
WITH t1 AS ( # 取10-01到10-03之间有成交记录的日期
SELECT DATE(event_time) AS dt
FROM tb_order_overall
WHERE DATE(event_time) between '2021-10-01' and '2021-10-03'
),
t2 AS ( # 901商店已成交订单中,每个下单日期的商品product_id
SELECT DATE(event_time) AS dt, tod.product_id
FROM tb_order_detail tod JOIN tb_order_overall too ON tod.order_id = too.order_id AND status = 1
JOIN tb_product_info tpi ON tod.product_id = tpi.product_id AND shop_id = '901'
),
t3 AS ( # 计算每个日期下901商店的已上架总商品数
SELECT
DATE(event_time) AS dt,
COUNT(DISTINCT CASE WHEN DATEDIFF(DATE(event_time), DATE(release_time)) >= 0
THEN product_id
END ) AS sum_product
FROM tb_product_info, tb_order_overall
WHERE shop_id = '901'
GROUP BY dt
)
SELECT
t1.dt,
ROUND(COUNT(DISTINCT t2.product_id) / sum_product, 3) AS sale_rate,
ROUND(1 - (COUNT(DISTINCT t2.product_id) / sum_product ), 3) AS unsale_rate
FROM t1 LEFT JOIN t2 ON DATEDIFF(t1.dt, t2.dt) between 0 and 6
JOIN t3 ON t1.dt = t3.dt
GROUP BY t1.dt
ORDER BY t1.dt
/*
根据题目描述 “只要当天任一店铺有任何商品的销量就输出该天的结果,即使店铺901当天的动销率为0” ,因此
1. 获得10-01到10-03之间有成交记录的日期
2. 获得901店铺的订单信息(日期和商品)
3. 获得按天分类的901商店已上架总商品数
4. 联结各临时表,使用datediff()将10-01到10-03各天往前7天的901店铺成交信息都归为该天的成交结果,最后按天分类计算有成交记录的商品占901店铺上架产品数的比重
*/
- 统计2021年国庆7天期间在北京市接单至少3次的司机的平均接单数和平均兼职收入(暂不考虑平台佣金,直接计算完成的订单费用总额),结果保留3位小数。
SELECT
city,
ROUND(AVG(order_cnt), 3) AS avg_order_num,
ROUND(SUM(income) / COUNT(driver_id), 3) AS avg_income
FROM (
SELECT
t1.city AS city,
t2.driver_id AS driver_id,
SUM(t2.fare) AS income,
COUNT(t1.order_id) AS order_cnt
FROM tb_get_car_record t1 LEFT JOIN tb_get_car_order t2 USING(order_id)
WHERE DATE(order_time) BETWEEN '2021-10-01' AND '2021-10-07'
AND city = "北京"
GROUP BY city, driver_id
HAVING order_cnt >= 3
) tt
GROUP BY city
- 找到2021年10月有过取消订单记录的司机,计算他们每人全部已完成的有评分订单的平均评分及总体平均评分,保留1位小数。先按driver_id升序输出,再输出总体情况。
/*
select IFNULL(driver_id, "总体") AS driver_id, ROUND(AVG(grade), 1) AS avg_grade
# (CASE WHEN GROUPING(driver_id) = 1 THEN '总体' ELSE driver_id END) AS driver_id
FROM tb_get_car_record JOIN tb_get_car_order USING(order_id)
WHERE driver_id IN (
SELECT driver_id
FROM tb_get_car_order
WHERE start_time IS NULL AND DATE(order_time) BETWEEN '2021-10-1'AND '2021-10-31') -- 筛选driver_id的的时候需要加上时间限制条件,求平均的时候不需要,是计算全部时间段的
GROUP BY driver_id
WITH ROLLUP -- 追加汇总信息
*/
WITH tmp AS (
SELECT driver_id, grade
FROM tb_get_car_order
WHERE driver_id IN (
SELECT DISTINCT driver_id
FROM tb_get_car_order
WHERE finish_time IS NOT NULL AND start_time IS NULL -- 取消的订单
AND DATE_FORMAT(order_time, '%Y%m') = 202110
)
)
(SELECT driver_id, ROUND(AVG(grade), 1) AS avg_grade
FROM tmp
GROUP BY driver_id
ORDER BY driver_id)
UNION
SELECT "总体" AS driver_id, ROUND(AVG(grade), 1) AS avg_grade
FROM tmp
- 统计每个城市中评分最高的司机平均评分、日均接单量和日均行驶里程数。
注:有多个司机评分并列最高时,都输出。
平均评分和日均接单量保留1位小数,
日均行驶里程数保留3位小数,按日均接单数升序排序。
SELECT city, driver_id, avg_grade, avg_order_num, avg_mileage
FROM (
SELECT
city,
driver_id,
ROUND(AVG(grade), 1) AS avg_grade,
ROUND(COUNT(order_id) / COUNT(DISTINCT DATE(order_time)), 1) AS avg_order_num,
ROUND(SUM(mileage) / COUNT(DISTINCT DATE(order_time)), 3) AS avg_mileage,
RANK() OVER (PARTITION BY city ORDER BY ROUND(AVG(grade), 1) DESC) AS ranking -- 评分排序
FROM tb_get_car_record t1 LEFT JOIN tb_get_car_order t2 USING(order_id)
GROUP BY city, driver_id
) a
WHERE ranking = 1
ORDER BY avg_order_num
- 统计国庆头3天里,每天的近7日日均订单完成量和日均订单取消量,按日期升序排序。结果保留2位小数。
SELECT dt, ROUND(finish_num / 7, 2) AS finish_num_7d, ROUND(cancel_num / 7, 2) AS cancel_num_7d
FROM (
SELECT
DISTINCT DATE(order_time) AS dt,
SUM(COUNT(fare)) OVER (ORDER BY DATE(order_time) ROWS 6 PRECEDING) AS finish_num,
SUM(SUM(fare IS NULL)) OVER (ORDER BY DATE(order_time) ROWS 6 PRECEDING) AS cancel_num -- 先求每天的订单取消量,再求连续7天的取消量,所以是两个SUM()
FROM tb_get_car_order
GROUP BY dt
ORDER BY dt
) a
WHERE dt BETWEEN '2021-10-01' AND '2021-10-03' -- 在窗口上一层筛选日期
/*
求前几行的累计和:
sum(column1) over (order by column2 rows N preceding)
例如 题目中说的近7日,所以就是 N=6 -- 本行+本行前面的6行
*/
- 统计周一到周五各时段的叫车量、平均等待接单时间和平均调度时间。全部以event_time-开始打车时间为时段划分依据,平均等待接单时间和平均调度时间均保留1位小数,平均调度时间仅计算完成了的订单,结果按叫车量升序排序。
注:
不同时段定义:早高峰 [07:00:00, 09:00:00)、工作时间 [09:00:00 , 17:00:00)、晚高峰 [17:00:00 , 20:00:00)、休息时间 [20:00:00 , 07:00:00)。
时间区间左闭右开(即7:00:00算作早高峰,而9:00:00不算做早高峰)
从开始打车到司机接单为等待接单时间,从司机接单到上车为调度时间。
SELECT
period,
COUNT(period) AS get_car_num,
ROUND(AVG(wait_time) / 60, 1) AS avg_wait_time,
ROUND(AVG(dispatch_time) / 60, 1) AS avg_dispatch_time
FROM (
SELECT
(CASE WHEN HOUR(event_time) >= 7 AND HOUR(event_time) < 9 THEN "早高峰"
WHEN HOUR(event_time) >= 9 AND HOUR(event_time) < 17 THEN "工作时间"
WHEN HOUR(event_time) >= 17 AND HOUR(event_time) < 20 THEN "晚高峰"
ELSE "休息时间"
END ) AS period,
TIMESTAMPDIFF(SECOND, event_time, order_time) AS wait_time, -- 如果用MINUTE,返回的值会取整,比如1.5分钟的结果只显示1分钟
TIMESTAMPDIFF(SECOND, order_time, start_time) AS dispatch_time
FROM tb_get_car_record t1 LEFT JOIN tb_get_car_order t2 USING(order_id)
WHERE DATE_FORMAT(event_time, '%w') BETWEEN 1 AND 5 -- 0表示周日
# WHERE DAYOFWEEK(event_time) BETWEEN 2 AND 6 -- 1表示周日
) a
GROUP BY period
ORDER BY get_car_num
- 统计各个城市在2021年10月期间,单日中最大的同时等车人数。结果按各城市最大等车人数升序排序,相同时按城市升序排序。
注: 等车指从开始打车起,直到取消打车、取消等待或上车前的这段时间里用户的状态。
如果同一时刻有人停止等车,有人开始等车,等车人数记作先增加后减少。
SELECT city, MAX(wait_uv) AS max_wait_uv
FROM (
SELECT
city,
dt,
SUM(num) OVER (PARTITION BY city ORDER BY dt, num DESC) AS wait_uv -- sum累计开窗排序的时候要先+1再-1,也就是先通过dt升序排,再通过你加的字段 num 降序这样就能同一时刻保证先加后减
FROM ( # 把打车时间(event_time)和上车时间(start_time)拆成俩表然后union
SELECT
city,
event_time AS dt,
1 AS num
FROM tb_get_car_record
UNION ALL
SELECT
city,
start_time AS dt,
-1 AS num
FROM tb_get_car_record t1 LEFT JOIN tb_get_car_order t2 USING(order_id)
) a
WHERE DATE_FORMAT(dt, '%Y%m') = 202110
) b
GROUP BY city, LEFT(dt, 10) -- 不是很明白为什么是这里group
ORDER BY max_wait_uv, city