-- Register the OpenX JSON SerDe so Hive can deserialize JSON text files.
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

-- Demo external table: each JSON record carries three map-typed objects.
create external table if not exists dm_goddog.student(
    student map<string,string> comment "学生信息",
    class   map<string,string> comment "课程信息",
    teacher map<string,string> comment "授课老师信息"
)
comment "学生课程信息"
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;
ps: 如果查询时出现 json 无法解析的错误,删除错误 json 文件中无法解析的那些行数据。
1.下载json-serde包,下载地址如下:
http://www.congiu.net/hive-json-serde/1.3.7/cdh5/json-serde-1.3.7-jar-with-dependencies.jar
2.测试建表
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

-- Waybill base info table: dwd_ids_test.dwd_order_base_info_dtl_di
-- (the original "*******...*******" banner was not valid SQL comment syntax
-- and would break the script; it is now a proper '--' comment)
create external table if not exists dwd_ids_test.dwd_order_base_info_dtl_di(
    order_id string,
    order_no string,
    express_flow_code string,
    -- ... remaining columns elided in the original notes ...
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

-- Verify the table definition (SerDe, location, column list).
desc formatted dwd_ids_test.dwd_order_base_info_dtl_di;
-- List current contents of the external table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di/*;
-- Upload the sample JSON files into the table directory.
-- NOTE(review): `hdfs dfs -put` is the OS-shell form; inside the Hive CLI this
-- would be `dfs -put` like the commands above — confirm where this line runs.
hdfs dfs -put /home/jasonapp/dwd_order_base_info_dtl_di*.json hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di;
-- Smoke-test reads; the second one filters on the inc_day column.
select * from dwd_ids_test.dwd_order_base_info_dtl_di limit 1;
select * from dwd_ids_test.dwd_order_base_info_dtl_di where inc_day='20200103' limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of the JSON staging table, partitioned by inc_day.
DROP TABLE IF EXISTS dwd_ids_test.dwd_order_base_info_dtl_di_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ids_test.dwd_order_base_info_dtl_di_parquet(
    order_id string,
    order_no string,
    express_flow_code string,
    source_zone_code string
    -- ... remaining columns elided in the original notes ...
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day)
select * from dwd_ids_test.dwd_order_base_info_dtl_di;

dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');
-- Order wide table: dwd_ids_test.dwd_pub_order_dtl_di
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dwd_ids_test.dwd_pub_order_dtl_di;
-- Managed (non-external) JSON staging table; inc_day is the last column so it
-- can later feed a dynamic-partition insert.
create table if not exists dwd_ids_test.dwd_pub_order_dtl_di(
    inner_order_no string,
    -- ... remaining columns elided in the original notes ...
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

-- Verify the table definition.
desc formatted dwd_ids_test.dwd_pub_order_dtl_di;
-- List current contents of the table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di/*;
-- Upload the sample JSON files into the table directory.
-- NOTE(review): `hdfs dfs -put` is the OS-shell form; inside the Hive CLI this
-- would be `dfs -put` — confirm where this line runs.
hdfs dfs -put /home/jasonapp/dwd_pub_order_dtl_di*.json hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di;
-- Smoke-test read.
select * from dwd_ids_test.dwd_pub_order_dtl_di limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of the JSON staging table, partitioned by inc_day.
DROP TABLE IF EXISTS dwd_ids_test.dwd_pub_order_dtl_di_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ids_test.dwd_pub_order_dtl_di_parquet(
    inner_order_no string,
    src_order_no string,
    -- ... columns elided in the original notes ...
    load_tm string,
    cx_order_entry string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day)
select * from dwd_ids_test.dwd_pub_order_dtl_di;

dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');

-- Spot-compare parquet vs. original JSON table contents.
select * from dwd_ids_test.dwd_pub_order_dtl_di_parquet limit 11;
select * from dwd_ids_test.dwd_pub_order_dtl_di limit 11;
-- Calendar dimension table: dim_ids_test.dim_calendar
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_calendar;
-- JSON staging table; inc_year is the last column for a later
-- dynamic-partition insert into the parquet table.
create table if not exists dim_ids_test.dim_calendar(
    day_wid string,
    -- ... columns elided in the original notes ...
    is_work string,
    is_business_peak string,
    public_holiday string,
    work_day double,
    inc_year string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_calendar;

-- Load the sample JSON file and smoke-test a read.
dfs -put /home/jasonapp/dim_calendar.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_calendar;
select * from dim_ids_test.dim_calendar limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of dim_calendar, partitioned by inc_year.
DROP TABLE IF EXISTS dim_ids_test.dim_calendar_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dim_ids_test.dim_calendar_parquet(
    day_wid string,
    day_name string,
    -- ... columns elided in the original notes ...
    -- NOTE(review): the source table declares `is_work`, this one declares
    -- `work`. The dynamic-partition INSERT below maps columns by position, so
    -- it still loads, but the name mismatch looks like a typo — confirm.
    work string,
    is_business_peak string,
    public_holiday string,
    work_day double
)
PARTITIONED BY (
    inc_year string COMMENT 'inc_year used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_year being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dim_ids_test.dim_calendar_parquet PARTITION (inc_year)
select * from dim_ids_test.dim_calendar;
-- City dimension table: dim_ids_test.dim_city
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_city;
-- JSON staging table; inc_day is the last column for a later
-- dynamic-partition insert into the parquet table.
create table if not exists dim_ids_test.dim_city(
    dist_id string,
    -- ... columns elided in the original notes ...
    parent_dist_code string,
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_city;
-- List current contents of the table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city/*;
-- Upload the sample JSON files.
-- NOTE(review): the glob `dim_city.*.json` (with a dot) differs from the
-- `<table>*.json` pattern used for the other tables — confirm the actual
-- local file names match.
hdfs dfs -put /home/jasonapp/dim_city.*.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city;
-- Smoke-test reads; the second one filters on the inc_day column.
select * from dim_ids_test.dim_city limit 1;
select * from dim_ids_test.dim_city where inc_day='20200103' limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of dim_city, partitioned by inc_day.
DROP TABLE IF EXISTS dim_ids_test.dim_city_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dim_ids_test.dim_city_parquet(
    dist_id string,
    -- ... columns elided in the original notes ...
    parent_dist_code string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day)
select * from dim_ids_test.dim_city;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');
-- SAP company / branch dimension table: dim_ids_test.dim_company
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_company;
-- JSON staging table (no partition column — this dimension is loaded whole).
create table if not exists dim_ids_test.dim_company(
    comp_code string,
    -- ... columns elided in the original notes ...
    dw_datasource_code string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_company;

-- Load the sample JSON file and smoke-test a read.
dfs -put /home/jasonapp/dim_company.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_company;
select * from dim_ids_test.dim_company limit 1;
-- Non-partitioned parquet copy of dim_company.
drop table if exists dim_ids_test.dim_company_parquet;
create table if not exists dim_ids_test.dim_company_parquet(
    comp_code string,
    -- ... columns elided in the original notes ...
    dw_datasource_code string
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Full reload from the JSON staging table.
INSERT OVERWRITE TABLE dim_ids_test.dim_company_parquet
select * from dim_ids_test.dim_company;
增加分区,并将数据复制(copy)到新分区;实际生产中由调度平台每日生成增量分区数据。
-- Daily incremental partition maintenance, driven by a scheduling platform
-- that substitutes ${dt1} (old partition value) and ${dt2} (new value).

-- Rename an existing inc_day partition on each parquet table.
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');

-- Pre-create empty partitions for three new days on each table.
ALTER TABLE dim_ids_test.dim_city_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');

-- Copy existing parquet files into the new partition directories: dim_city_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet;

-- Same for dwd_pub_order_dtl_di_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/*;

-- Same for dwd_order_base_info_dtl_di_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/*;

-- Inspect the partition layout of every converted table.
-- partitioned by year (inc_year)
show partitions dim_ids_test.dim_calendar_parquet;
-- not partitioned
-- NOTE(review): SHOW PARTITIONS on a non-partitioned table raises an error in
-- Hive — presumably kept here to demonstrate exactly that; confirm intent.
show partitions dim_ids_test.dim_company_parquet;
-- partitioned by day (inc_day)
show partitions dim_ids_test.dim_city_parquet;
-- partitioned by day (inc_day)
show partitions dwd_ids_test.dwd_pub_order_dtl_di_parquet;
-- partitioned by day (inc_day)
show partitions dwd_ids_test.dwd_order_base_info_dtl_di_parquet;