-- Register the OpenX JSON SerDe so Hive can deserialize JSON text files.
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

-- Demo external table: each JSON record carries three map-typed objects.
create external table if not exists dm_goddog.student(
    student map<string,string> comment "学生信息",
    class   map<string,string> comment "课程信息",
    teacher map<string,string> comment "授课老师信息"
)
comment "学生课程信息"
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;
ps: 如果查询时出现 json 无法解析的错误,删除错误 json 文件中无法解析的那些行数据。
1.下载json-serde包,下载地址如下:
http://www.congiu.net/hive-json-serde/1.3.7/cdh5/json-serde-1.3.7-jar-with-dependencies.jar
2.测试建表
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

-- Waybill base info table: dwd_ids_test.dwd_order_base_info_dtl_di
-- (the original "*******...*******" banner was not valid SQL comment syntax
-- and would break the script; it is now a proper '--' comment)
create external table if not exists dwd_ids_test.dwd_order_base_info_dtl_di(
    order_id string,
    order_no string,
    express_flow_code string,
    -- ... remaining columns elided in the original notes ...
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

-- Verify the table definition (SerDe, location, column list).
desc formatted dwd_ids_test.dwd_order_base_info_dtl_di;
-- List current contents of the external table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di/*;
-- Upload the sample JSON files into the table directory.
-- NOTE(review): `hdfs dfs -put` is the OS-shell form; inside the Hive CLI this
-- would be `dfs -put` like the commands above — confirm where this line runs.
hdfs dfs -put /home/jasonapp/dwd_order_base_info_dtl_di*.json hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di;
-- Smoke-test reads; the second one filters on the inc_day column.
select * from dwd_ids_test.dwd_order_base_info_dtl_di limit 1;
select * from dwd_ids_test.dwd_order_base_info_dtl_di where inc_day='20200103' limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of the JSON staging table, partitioned by inc_day.
DROP TABLE IF EXISTS dwd_ids_test.dwd_order_base_info_dtl_di_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ids_test.dwd_order_base_info_dtl_di_parquet(
    order_id string,
    order_no string,
    express_flow_code string,
    source_zone_code string
    -- ... remaining columns elided in the original notes ...
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day)
select * from dwd_ids_test.dwd_order_base_info_dtl_di;

dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');
-- Order wide table: dwd_ids_test.dwd_pub_order_dtl_di
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dwd_ids_test.dwd_pub_order_dtl_di;
-- Managed (non-external) JSON staging table; inc_day is the last column so it
-- can later feed a dynamic-partition insert.
create table if not exists dwd_ids_test.dwd_pub_order_dtl_di(
    inner_order_no string,
    -- ... remaining columns elided in the original notes ...
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

-- Verify the table definition.
desc formatted dwd_ids_test.dwd_pub_order_dtl_di;
-- List current contents of the table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di/*;
-- Upload the sample JSON files into the table directory.
-- NOTE(review): `hdfs dfs -put` is the OS-shell form; inside the Hive CLI this
-- would be `dfs -put` — confirm where this line runs.
hdfs dfs -put /home/jasonapp/dwd_pub_order_dtl_di*.json hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di;
-- Smoke-test read.
select * from dwd_ids_test.dwd_pub_order_dtl_di limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of the JSON staging table, partitioned by inc_day.
DROP TABLE IF EXISTS dwd_ids_test.dwd_pub_order_dtl_di_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ids_test.dwd_pub_order_dtl_di_parquet(
    inner_order_no string,
    src_order_no string,
    -- ... columns elided in the original notes ...
    load_tm string,
    cx_order_entry string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day)
select * from dwd_ids_test.dwd_pub_order_dtl_di;

dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');

-- Spot-compare parquet vs. original JSON table contents.
select * from dwd_ids_test.dwd_pub_order_dtl_di_parquet limit 11;
select * from dwd_ids_test.dwd_pub_order_dtl_di limit 11;
-- Calendar dimension table: dim_ids_test.dim_calendar
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_calendar;
-- JSON staging table; inc_year is the last column for a later
-- dynamic-partition insert into the parquet table.
create table if not exists dim_ids_test.dim_calendar(
    day_wid string,
    -- ... columns elided in the original notes ...
    is_work string,
    is_business_peak string,
    public_holiday string,
    work_day double,
    inc_year string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_calendar;

-- Load the sample JSON file and smoke-test a read.
dfs -put /home/jasonapp/dim_calendar.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_calendar;
select * from dim_ids_test.dim_calendar limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of dim_calendar, partitioned by inc_year.
DROP TABLE IF EXISTS dim_ids_test.dim_calendar_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dim_ids_test.dim_calendar_parquet(
    day_wid string,
    day_name string,
    -- ... columns elided in the original notes ...
    -- NOTE(review): the source table declares `is_work`, this one declares
    -- `work`. The dynamic-partition INSERT below maps columns by position, so
    -- it still loads, but the name mismatch looks like a typo — confirm.
    work string,
    is_business_peak string,
    public_holiday string,
    work_day double
)
PARTITIONED BY (
    inc_year string COMMENT 'inc_year used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_year being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dim_ids_test.dim_calendar_parquet PARTITION (inc_year)
select * from dim_ids_test.dim_calendar;
-- City dimension table: dim_ids_test.dim_city
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_city;
-- JSON staging table; inc_day is the last column for a later
-- dynamic-partition insert into the parquet table.
create table if not exists dim_ids_test.dim_city(
    dist_id string,
    -- ... columns elided in the original notes ...
    parent_dist_code string,
    inc_day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_city;
-- List current contents of the table's HDFS directory.
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city;
-- Remove any stale files before re-loading sample data.
dfs -rm -f hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city/*;
-- Upload the sample JSON files.
-- NOTE(review): the glob `dim_city.*.json` (with a dot) differs from the
-- `<table>*.json` pattern used for the other tables — confirm the actual
-- local file names match.
hdfs dfs -put /home/jasonapp/dim_city.*.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city;
-- Smoke-test reads; the second one filters on the inc_day column.
select * from dim_ids_test.dim_city limit 1;
select * from dim_ids_test.dim_city where inc_day='20200103' limit 1;
-- Enable dynamic partitioning and Snappy-compressed output for the conversion job.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Parquet copy of dim_city, partitioned by inc_day.
DROP TABLE IF EXISTS dim_ids_test.dim_city_parquet;
CREATE EXTERNAL TABLE IF NOT EXISTS dim_ids_test.dim_city_parquet(
    dist_id string,
    -- ... columns elided in the original notes ...
    parent_dist_code string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Dynamic-partition load; relies on inc_day being the LAST column of the
-- source table so that SELECT * maps it onto the partition column.
INSERT OVERWRITE TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day)
select * from dim_ids_test.dim_city;

-- Shift the sample partitions to recent dates (metadata + directory rename).
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200101') RENAME TO PARTITION (inc_day='20201125');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200102') RENAME TO PARTITION (inc_day='20201126');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200103') RENAME TO PARTITION (inc_day='20201127');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200104') RENAME TO PARTITION (inc_day='20201128');
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='20200105') RENAME TO PARTITION (inc_day='20201129');
-- SAP company / branch dimension table: dim_ids_test.dim_company
add jar /home/jasonapp/json-serde-1.3.7-jar-with-dependencies.jar;

drop table if exists dim_ids_test.dim_company;
-- JSON staging table (no partition column — this dimension is loaded whole).
create table if not exists dim_ids_test.dim_company(
    comp_code string,
    -- ... columns elided in the original notes ...
    dw_datasource_code string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
stored as textfile;

desc formatted dim_ids_test.dim_company;

-- Load the sample JSON file and smoke-test a read.
dfs -put /home/jasonapp/dim_company.json hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_company;
select * from dim_ids_test.dim_company limit 1;
-- Non-partitioned parquet copy of dim_company.
drop table if exists dim_ids_test.dim_company_parquet;
create table if not exists dim_ids_test.dim_company_parquet(
    comp_code string,
    -- ... columns elided in the original notes ...
    dw_datasource_code string
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

-- Full reload from the JSON staging table.
INSERT OVERWRITE TABLE dim_ids_test.dim_company_parquet
select * from dim_ids_test.dim_company;
增加分区,并将数据复制(copy)到新分区;实际生产中由调度平台每日生成增量分区数据。
-- Daily incremental partition maintenance, driven by a scheduling platform
-- that substitutes ${dt1} (old partition value) and ${dt2} (new value).

-- Rename an existing inc_day partition on each parquet table.
ALTER TABLE dim_ids_test.dim_city_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet PARTITION (inc_day='${dt1}') RENAME TO PARTITION (inc_day='${dt2}');

-- Pre-create empty partitions for three new days on each table.
ALTER TABLE dim_ids_test.dim_city_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');
ALTER TABLE dwd_ids_test.dwd_pub_order_dtl_di_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');
ALTER TABLE dwd_ids_test.dwd_order_base_info_dtl_di_parquet ADD PARTITION (inc_day='20201122') PARTITION (inc_day='20201123') PARTITION (inc_day='20201124');

-- Copy existing parquet files into the new partition directories: dim_city_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dim_ids_test.db/dim_city_parquet;

-- Same for dwd_pub_order_dtl_di_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_pub_order_dtl_di_parquet/*;

-- Same for dwd_order_base_info_dtl_di_parquet.
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200101/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201122;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200102/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201123;
dfs -cp hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20200103/* hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/inc_day=20201124;
dfs -ls hdfs://clusterxxxxx/user/hive/warehouse/dwd_ids_test.db/dwd_order_base_info_dtl_di_parquet/*;

-- Inspect the partition layout of every converted table.
-- partitioned by year (inc_year)
show partitions dim_ids_test.dim_calendar_parquet;
-- not partitioned
-- NOTE(review): SHOW PARTITIONS on a non-partitioned table raises an error in
-- Hive — presumably kept here to demonstrate exactly that; confirm intent.
show partitions dim_ids_test.dim_company_parquet;
-- partitioned by day (inc_day)
show partitions dim_ids_test.dim_city_parquet;
-- partitioned by day (inc_day)
show partitions dwd_ids_test.dwd_pub_order_dtl_di_parquet;
-- partitioned by day (inc_day)
show partitions dwd_ids_test.dwd_order_base_info_dtl_di_parquet;