拉链表-增量更新方法一
参考文档:http://lxw1234.com/archives/2015/08/473.htm
一、元表结构
1、定义业务库原始订单表:
drop table chavin.orders;
CREATE TABLE orders (
orderid INT,
createtime STRING,
modifiedtime STRING,
status STRING
)row format delimited fields terminated by '\t'
stored AS textfile;
--加载测试数据
1 2015-08-18 2015-08-18 创建
2 2015-08-18 2015-08-18 创建
3 2015-08-19 2015-08-21 支付
4 2015-08-19 2015-08-21 完成
5 2015-08-19 2015-08-20 支付
6 2015-08-20 2015-08-20 创建
7 2015-08-20 2015-08-21 支付
8 2015-08-21 2015-08-21 创建
2、定义ODS订单表结构,采用日分区存储:
drop table t_ods_orders_inc;
CREATE TABLE t_ods_orders_inc (
orderid INT,
createtime STRING,
modifiedtime STRING,
status STRING
) PARTITIONED BY (day STRING)
row format delimited fields terminated by '\t'
stored AS textfile;
3、创建dw层历史订单表:
drop table t_dw_orders_his;
CREATE TABLE t_dw_orders_his (
orderid INT,
createtime STRING,
modifiedtime STRING,
status STRING,
dw_start_date STRING,
dw_end_date STRING
)row format delimited fields terminated by '\t'
stored AS textfile;
二、初始化dw层历史订单表:
1、将源库订单表历史数据插入到ods订单表中:
INSERT overwrite TABLE t_ods_orders_inc PARTITION (day = '2015-08-20')
SELECT orderid,createtime,modifiedtime,status
FROM chavin.orders
WHERE cast(createtime as date) <= '2015-08-20';
2、通过ods层订单表数据初始化dw层历史订单表:
INSERT overwrite TABLE t_dw_orders_his
SELECT orderid,createtime,modifiedtime,status,
createtime AS dw_start_date,
'9999-12-31' AS dw_end_date
FROM t_ods_orders_inc
WHERE day = '2015-08-20';
三、增量添加数据
1、将原始订单表增量数据插入到ods层订单表前一天分区中:
INSERT overwrite TABLE t_ods_orders_inc PARTITION (day = '2015-08-21')
SELECT orderid,createtime,modifiedtime,status
FROM orders
WHERE createtime = '2015-08-21' OR modifiedtime = '2015-08-21';
2、通过dw历史数据和ods增量数据刷新dw历史数据,此处采用临时表方法:
DROP TABLE IF EXISTS t_dw_orders_his_tmp;
CREATE TABLE t_dw_orders_his_tmp AS
SELECT orderid,
createtime,
modifiedtime,
status,
dw_start_date,
dw_end_date
FROM (
SELECT a.orderid,
a.createtime,
a.modifiedtime,
a.status,
a.dw_start_date,
CASE WHEN b.orderid IS NOT NULL AND a.dw_end_date > '2015-08-21' THEN '2015-08-20' ELSE a.dw_end_date END AS dw_end_date
FROM t_dw_orders_his a
left outer join (SELECT * FROM t_ods_orders_inc WHERE day = '2015-08-21') b
ON (a.orderid = b.orderid)
UNION ALL
SELECT orderid,
createtime,
modifiedtime,
status,
modifiedtime AS dw_start_date,
'9999-12-31' AS dw_end_date
FROM t_ods_orders_inc
WHERE day = '2015-08-21'
) x
ORDER BY orderid,dw_start_date;
3、根据历史表更新dw层历史订单表:
INSERT overwrite TABLE t_dw_orders_his
SELECT * FROM t_dw_orders_his_tmp;
4、根据上面步骤增加22号数据:
--加载增量数据到ods层订单表分区'2015-08-22'中:
1 2015-08-18 2015-08-22 支付
2 2015-08-18 2015-08-22 完成
6 2015-08-20 2015-08-22 支付
9 2015-08-22 2015-08-22 创建
8 2015-08-22 2015-08-22 支付
10 2015-08-22 2015-08-22 支付
alter table t_ods_orders_inc add partition(day='2015-08-22');
load data local inpath '/opt/datas/orders22.txt' into table chavin.t_ods_orders_inc partition(day='2015-08-22');
--根据历史订单数据和增量数据更新历史订单表数据,此处采用临时表:
DROP TABLE IF EXISTS t_dw_orders_his_tmp;
CREATE TABLE t_dw_orders_his_tmp AS
SELECT orderid,
createtime,
modifiedtime,
status,
dw_start_date,
dw_end_date
FROM (
SELECT a.orderid,
a.createtime,
a.modifiedtime,
a.status,
a.dw_start_date,
CASE WHEN b.orderid IS NOT NULL AND a.dw_end_date > '2015-08-22' THEN '2015-08-21' ELSE a.dw_end_date END AS dw_end_date
FROM t_dw_orders_his a
left outer join (SELECT * FROM t_ods_orders_inc WHERE day = '2015-08-22') b
ON (a.orderid = b.orderid)
UNION ALL
SELECT orderid,
createtime,
modifiedtime,
status,
modifiedtime AS dw_start_date,
'9999-12-31' AS dw_end_date
FROM t_ods_orders_inc
WHERE day = '2015-08-22'
) x
ORDER BY orderid,dw_start_date;
--根据临时表更新历史订单表:
INSERT overwrite TABLE t_dw_orders_his
SELECT * FROM t_dw_orders_his_tmp;
5、查看2015-08-21、2015-08-21历史快照:
select * from t_dw_orders_his where dw_start_date <= '2015-08-21' and dw_end_date >= '2015-08-21';
select * from t_dw_orders_his where dw_start_date <= '2015-08-22' and dw_end_date >= '2015-08-22';