|NO.Z.00036|——————————|BigDataEnd|——|Hadoop&Hue.V05|——|Hue.v05|案例实现.v02|
一、实现:步骤一:创建表
### --- 步骤一:创建表
~~~ 将数据存放在ORC文件中
~~~ # createtable.hql
hive (tuning)> drop database sale cascade;
hive (tuning)> create database if not exists sale;
create table sale.dimdate_ori(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
)
row format delimited
fields terminated by ",";
hive (tuning)> create table sale.sale_ori(
orderid string,
locationid string,
dt date
)
row format delimited
fields terminated by ",";
hive (tuning)> create table sale.saledetail_ori(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)
row format delimited
fields terminated by ",";
hive (tuning)> create table sale.dimdate(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
) stored as orc;
hive (tuning)> create table sale.sale(
orderid string,
locationid string,
dt date
) stored as orc;
hive (tuning)> create table sale.saledetail(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)stored as orc;
[root@linux122 ~]# hive -f createtable.hql
二、步骤二:导入数据
### --- 步骤二:导入数据
~~~ # 加载数据
hive (tuning)> use sale;
hive (sale)> load data local inpath "/home/hadoop/data/tbDate.dat" overwrite into table dimdate_ori;
hive (sale)> load data local inpath "/home/hadoop/data/tbSale.dat" overwrite into table sale_ori;
hive (sale)> load data local inpath "/home/hadoop/data/tbSaleDetail.dat" overwrite into table saledetail_ori;
~~~ # 导入数据
hive (sale)> insert into table dimdate select * from dimdate_ori;
hive (sale)> insert into table sale select * from sale_ori;
hive (sale)> insert into table saledetail select * from saledetail_ori;
[root@linux122 ~]# hive -f loaddata.hql
### --- 按年统计销售额
hive (sale)> SELECT year(B.dt) year, round(sum(A.amount)/10000, 2) amount
FROM saledetail A join sale B on A.orderid=B.orderid
group by year(B.dt);
~~输出参数
year amount
2004 326.81
2005 1325.76
2006 1368.1
2007 1671.94
2008 1467.43
2009 632.37
2010 21.09
3274 0.17
### --- 销售金额在 10W 以上的订单
hive (sale)> SELECT orderid, round(sum(amount), 2) amount
FROM saledetail
group by orderid
having sum(amount) > 100000;
~~输出参数
orderid amount
HMJSL00009024 119084.8
HMJSL00009958 159126.0
### --- 每年销售额的差值
hive (sale)> SELECT year, round(amount, 2) amount, round(lag(amount) over
(ORDER BY year), 2) prioramount
,round(amount - lag(amount) over (ORDER BY year), 2) diff
from (SELECT year(B.dt) year, sum(A.amount) amount
from saledetail A join sale B on A.orderid=B.orderid
group by year(B.dt)
) tmp;
~~输出参数
year amount prioramount diff
2004 3268115.5 NULL NULL
2005 1.325756415E7 3268115.5 9989448.65
2006 1.36809829E7 1.325756415E7 423418.75
2007 1.671935456E7 1.36809829E7 3038371.66
2008 1.46742953E7 1.671935456E7 -2045059.26
2009 6323697.19 1.46742953E7 -8350598.11
2010 210949.66 6323697.19 -6112747.53
3274 1703.0 210949.66 -209246.66
### --- 年度订单金额前10位(年度、订单号、订单金额、排名)
~~~ # 方法一
hive (sale)> SELECT dt, orderid, amount, rank
from (SELECT dt, orderid, amount,
dense_rank() over(PARTITION BY dt ORDER BY
amount desc) rank
from (SELECT year(B.dt) dt, A.orderid, sum(A.amount)
amount
from saledetail A join sale B on
A.orderid=B.orderid
GROUP BY year(B.dt), A.orderid
) tmp1
) tmp2
where rank <= 10;
~~输出参数
dt orderid amount rank
2004 HMJSL00001557 23656.79999999997 1
2004 HMJSL00001556 22010.599999999984 2
2004 HMJSL00001349 17147.2 3
2004 HMJSL00001531 16605.6 4
2004 HMJSL00001567 15651.0 5
2004 HMJSL00001562 15429.999999999989 6
2004 HMJSL00000706 15266.0 7
2004 HMJSL00000656 15021.319999999998 8
2004 HMJSL00001568 14444.0 9
2004 HMJSL00000963 14268.800000000001 10
~~~ # 方法二
hive (sale)> with tmp as (
SELECT year(B.dt) dt, A.orderid, sum(A.amount) amount
from saledetail A join sale B on A.orderid=B.orderid
GROUP BY year(B.dt), A.orderid
)
SELECT dt, orderid, amount, rank
from (SELECT dt, orderid, amount,
dense_rank() over(PARTITION BY dt ORDER BY
amount desc) rank
from tmp
) tmp2
where rank <= 10;
~~输出参数
dt orderid amount rank
2004 HMJSL00001557 23656.79999999997 1
2004 HMJSL00001556 22010.599999999984 2
2004 HMJSL00001349 17147.2 3
2004 HMJSL00001531 16605.6 4
2004 HMJSL00001567 15651.0 5
2004 HMJSL00001562 15429.999999999989 6
2004 HMJSL00000706 15266.0 7
2004 HMJSL00000656 15021.319999999998 8
2004 HMJSL00001568 14444.0 9
### --- 季度订单金额前10位(年度、季度、订单id、订单金额、排名)
~~~ # 方法一
hive (sale)> with tmp as (
select C.year, C.quat, A.orderid, round(sum(B.amount), 2)
amount
from sale A join saledetail B on A.orderid=B.orderid
join dimdate C on A.dt=C.dt
group by C.year, C.quat, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order
by amount desc) rank
from tmp
) tmp1
where rank <= 10;
~~输出参数
year quat orderid amount rank
2004 1 HMJSL00002525 3113.84 1
2004 3 HMJSL00000609 10730.0 1
2004 3 HMJSL00000600 9031.2 2
2004 3 HMJSL00000606 8880.8 3
2004 3 HMJSL00000571 8570.16 4
2004 3 HMJSL00000610 8094.0 5
2004 3 HMJSL00000574 7540.1 6
2004 3 HMJSL00000561 6981.6 7
2004 3 HMJSL00000615 6946.0 8
2004 3 HMJSL00000616 6509.0 9
2004 3 HMJSL00000627 6164.8 10
~~~ # 方法二
hive (sale)> with tmp as(
select year(A.dt) year,
case when month(A.dt) <= 3 then 1
when month(A.dt) <= 6 then 2
when month(A.dt) <= 9 then 3
A.orderid,
round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid = B.orderid
group by year(A.dt),
case when month(A.dt) <= 3 then 1
when month(A.dt) <= 6 then 2
when month(A.dt) <= 9 then 3
else 4 end,
A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order
by amount desc) rank
from tmp
) tmp1
where rank <= 10;
~~输出参数
~~~ # 方法三:求季度
hive (sale)> select floor(month(dt/3.1)) + 1;
hive (sale)> with tmp as (
select year(A.dt) year, floor(month(A.dt)/3.1) + 1 quat,
A.orderid,
round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by year(A.dt), floor(month(A.dt)/3.1) + 1, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order
by amount desc) rank
from tmp
) tmp1
where rank <= 10;
~~输出参数
year quat orderid amount rank
2004 1 HMJSL00002525 3113.84 1
2004 3 HMJSL00000609 10730.0 1
2004 3 HMJSL00000600 9031.2 2
2004 3 HMJSL00000606 8880.8 3
2004 3 HMJSL00000571 8570.16 4
2004 3 HMJSL00000610 8094.0 5
2004 3 HMJSL00000574 7540.1 6
2004 3 HMJSL00000561 6981.6 7
2004 3 HMJSL00000615 6946.0 8
2004 3 HMJSL00000616 6509.0 9
2004 3 HMJSL00000627 6164.8 10
### --- 求所有交易日中订单金额最高的前10位
~~~ # topN问题:
~~~ 基础数据
~~~ 上排名函数
~~~ 解决N的问题
hive (sale)> with tmp as (
select A.dt, A.orderid, round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by A.dt, A.orderid
)
select dt, orderid, amount, rank
from (
select dt, orderid, amount, dense_rank() over(order by
amount desc) rank
from tmp
) tmp1
where rank <= 10;
~~输出参数
dt orderid amount rank
2007-07-31 HMJSL00009958 159126.0 1
2007-04-24 HMJSL00009024 119084.8 2
2008-01-15 HMJSL00010598 55828.0 3
2007-07-31 HMJSL00009957 52422.0 4
2007-10-09 HMJSL00010216 49800.0 5
2007-07-31 HMJSL00009956 43018.0 6
2007-11-04 HMJSL00010339 42157.6 7
2007-02-07 HMJSL00008593 41902.4 8
2007-09-27 HMJSL00010137 40340.0 9
2005-05-31 HMJSL00003263 38186.4 10
### --- 每年度销售额最大的交易日
hive (sale)> with tmp as (
select A.dt, round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by A.dt
)
select year(dt) year, max(amount) dayamount
from tmp
group by year(dt);
~~输出参数
year dayamount
2004 160121.08
2005 189984.81
2006 232302.62
2007 309427.1
2008 151134.0
2009 71127.0
2010 28161.8
3274 1703.0
~~~ # 备注:以上求解忽略了交易日,以下SQL更符合题意
hive (sale)> with tmp as (
select dt, amount, dense_rank() over (partition by year(dt)
order by amount desc) as rank
from (select A.dt, round(sum(B.amount), 2) amount
from sale A join saledetail B on A.orderid=B.orderid
group by A.dt) tab1
)
select year(dt) as year, dt, amount
from tmp
where rank=1;
~~输出参数
year dt amount
2004 2004-11-01 160121.08
2005 2005-10-03 189984.81
2006 2006-04-29 232302.62
2007 2007-07-31 309427.1
2008 2008-01-20 151134.0
2009 2009-01-24 71127.0
2010 2010-01-01 28161.8
3274 3274-12-12 1703.0
### --- 年度最畅销的商品(即每年销售金额最大的商品)
hive (sale)> with tmp as (
select year(B.dt) year, goods, round(sum(amount),2) amount
from saledetail A join sale B on A.orderid=B.orderid
group by year(B.dt), goods
)
select year, goods, amount
from (select year, goods, amount, dense_rank() over
(partition by year order by amount desc) rank
from tmp) tmp1
where rank = 1;
~~输出参数
year goods amount
2004 JY424420810101 53401.76
2005 24124118880102 56627.33
2006 JY425468460101 113720.6
2007 JY425468460101 70225.1
2008 E2628204040101 98003.6
2009 YL327439080102 30029.2
2010 SQ429425090101 4494.0
3274 YA217390232301 698.0
Walter Savage Landor:strove with none,for none was worth my strife.Nature I loved and, next to Nature, Art:I warm'd both hands before the fire of life.It sinks, and I am ready to depart
——W.S.Landor
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通