Common Iceberg Commands

I. Launching the spark-sql client

spark-sql --master yarn \
--deploy-mode client \
--queue default \
--name wang \
--driver-memory 12G \
--num-executors 10 \
--executor-cores 2 \
--executor-memory 10G
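
If the cluster does not already define the Iceberg catalog, the relevant settings can be added on the command line. A minimal sketch, assuming a Hive Metastore and that an iceberg-spark-runtime jar matching the Spark version is on the classpath:

spark-sql --master yarn \
--deploy-mode client \
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
--conf spark.sql.catalog.spark_catalog.type=hive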

II. Querying table information with SQL

1. Query table snapshot information

SELECT * FROM spark_catalog.data_lake_ods.test_table.snapshots ORDER BY committed_at DESC

SELECT count(1) FROM spark_catalog.data_lake_ods.test_table.snapshots
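
Each snapshot row also carries the operation type and a summary map describing the commit, which helps identify what each write did:

SELECT snapshot_id, committed_at, operation, summary
FROM spark_catalog.data_lake_ods.test_table.snapshots
ORDER BY committed_at DESC;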

2. Query table data files

-- Data files referenced by the current snapshot

SELECT count(1) FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.data_files

-- All data files across all snapshots

SELECT count(1) FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.all_data_files
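
To gauge whether compaction is needed, the same metadata table can list individual files, for example the smallest ones:

SELECT file_path, record_count, file_size_in_bytes
FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.data_files
ORDER BY file_size_in_bytes ASC
LIMIT 20;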

3. Query delete files

-- Delete files referenced by the current snapshot

SELECT count(1) FROM data_lake_ods.test_table.delete_files

-- All delete files across all snapshots

SELECT * FROM data_lake_ods.test_table.all_delete_files

SELECT count(1) FROM data_lake_ods.test_table.all_delete_files
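
The content column distinguishes the two kinds of delete files (1 = position deletes, 2 = equality deletes), so a breakdown by kind looks like:

SELECT content, count(1) AS file_cnt, sum(record_count) AS deleted_row_cnt
FROM data_lake_ods.test_table.delete_files
GROUP BY content;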

4. Query table snapshot history

select * from data_lake_ods.ods_bdg_dp_data_source.history
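
The history table records when each snapshot became current; rows with is_current_ancestor = false are snapshots that were abandoned by a rollback:

select made_current_at, snapshot_id, is_current_ancestor
from data_lake_ods.ods_bdg_dp_data_source.history
order by made_current_at desc;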

5. Query manifests

select * from iceberg_catalog.data_lake_ods.ods_bdg_dp_data_source.manifests
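
The per-manifest file counts are useful for spotting commits that touched many files:

select path, length, added_snapshot_id,
       added_data_files_count, existing_data_files_count, deleted_data_files_count
from iceberg_catalog.data_lake_ods.ods_bdg_dp_data_source.manifests;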

III. Iceberg table DDL operations

1. Create a non-partitioned table

-- Create the table. Bloom-filter columns should match the primary key; use no more than 3.
CREATE TABLE data_lake_ods.test123(
  changed_type int comment 'null',
  id bigint comment 'id',
  create_time string comment 'creation time',
  update_time string comment 'update time'
)USING iceberg
TBLPROPERTIES(
  'format-version'='2'
  ,'write.upsert.enabled'='true'
  ,'engine.hive.enabled'='true'
  ,'write.distribution-mode' = 'hash'
  ,'write.metadata.metrics.default'='full'
  ,'write.parquet.bloom-filter-enabled.column.id' = 'true'
  ,'table.drop.base-path.enabled'='true'
);
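
With format-version 2 tables, row-level upserts from Spark SQL are typically expressed with MERGE INTO. A minimal sketch; the staging table updates_src is an assumption:

MERGE INTO data_lake_ods.test123 t
USING updates_src s   -- hypothetical staging table with the same schema
ON t.id = s.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;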

2. Create a bucketed table

-- Bucketed table. Bloom-filter columns should match the primary key; use no more than 3.
CREATE TABLE spark_catalog.data_lake_ods_test.test_bucket_month (
  `order_number` BIGINT COMMENT 'order number',
  `user_number` BIGINT COMMENT 'user number',
  `pay_number` BIGINT COMMENT 'payment number',
  `spu_number` BIGINT COMMENT 'product SPU number',
  `sku_number` BIGINT COMMENT 'product SKU number',
  `kafka_timestamp` TIMESTAMP COMMENT 'Kafka write timestamp'
)USING iceberg
PARTITIONED BY (bucket(12,order_number))
TBLPROPERTIES(
  'format-version'='2'
  ,'write.upsert.enabled'='true'
  ,'engine.hive.enabled'='true'
  ,'write.distribution-mode' = 'hash'
  ,'write.metadata.metrics.default'='full'
  ,'write.parquet.bloom-filter-enabled.column.order_number' = 'true'  -- must reference an existing column; this table has no id column
  ,'table.drop.base-path.enabled'='true'
);
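
The resulting bucket layout can be checked through the partitions metadata table:

SELECT partition, record_count, file_count
FROM spark_catalog.data_lake_ods_test.test_bucket_month.partitions;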

Other tuning properties

TBLPROPERTIES (
  'write.merge.mode' = 'merge-on-read',
  'write.update.mode' = 'merge-on-read',
  'write.delete.mode' = 'merge-on-read',
  'write.metadata.delete-after-commit.enabled' = 'true',
  -- Number of metadata.json versions to keep: older metadata.json files are deleted,
  -- but snapshots are not cleaned up (manifest lists, manifests, data files, and delete files are all retained)
  'write.metadata.previous-versions-max' = '10'
);
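
The properties currently set on a table can be verified with:

SHOW TBLPROPERTIES data_lake_ods.test123;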

3. Modify table properties

alter table data_lake_ods.test123 SET TBLPROPERTIES('write.metadata.metrics.default'='full','write.distribution-mode' = 'hash');
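
A property can be removed again (reverting to the default) with UNSET:

alter table data_lake_ods.test123 UNSET TBLPROPERTIES('write.metadata.metrics.default');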

4. Drop a table

-- PURGE removes the table's underlying data and metadata files along with the table
drop table data_lake_ods.test123 purge;

IV. Table maintenance

1. Small-file compaction (non-partitioned tables)

CALL spark_catalog.system.rewrite_data_files(
  table => 'data_lake_ods.test123',
  options => map(
    'partial-progress.enabled', 'true',           -- commit rewritten file groups incrementally
    'rewrite-job-order', 'bytes-asc',             -- process the smallest file groups first
    'partial-progress.max-commits', '10000',
    'max-file-group-size-bytes', '1073741824',    -- 1 GiB per file group
    'target-file-size-bytes', '134217728',        -- 128 MiB target file size
    'rewrite-all', 'true'                         -- rewrite every file, not only small ones
  )
);

2. Small-file compaction (partitioned tables)

CALL spark_catalog.system.rewrite_data_files(
  table => 'data_lake_ods_test.test123',
  options => map(
    'partial-progress.enabled', 'true',
    'rewrite-job-order', 'bytes-asc',
    'partial-progress.max-commits', '10000',
    'max-file-group-size-bytes', '1073741824',
    'target-file-size-bytes', '134217728',
    'remove-dangling-deletes', 'true',            -- drop delete files that no longer apply to any live data file
    'rewrite-all', 'true'
  )
);
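
For large partitioned tables, the rewrite can be restricted to a subset of partitions with the procedure's where argument. A sketch assuming a date partition column named dt:

CALL spark_catalog.system.rewrite_data_files(
  table => 'data_lake_ods_test.test123',
  where => 'dt >= "2024-04-01"',   -- dt is a hypothetical partition column
  options => map('rewrite-all', 'true')
);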

3. Cleaning up a table with an extremely large number of delete files (first eliminate the delete files, then run the compaction commands above)

CALL spark_catalog.system.rewrite_data_files(
  table => 'data_lake_ods.test123',
  options => map(
    'max-concurrent-file-group-rewrites', '19',   -- rewrite up to 19 file groups in parallel
    'max-file-group-size-bytes', '1',             -- a 1-byte cap puts each data file in its own group, so deletes are applied file by file
    'partial-progress.enabled', 'true',
    'rewrite-all', 'true'
  )
);

4. Expire old snapshots

CALL spark_catalog.system.expire_snapshots(table => 'data_lake_ods.test_table', older_than => TIMESTAMP '2024-04-08 10:00:00.000');
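
expire_snapshots also accepts retain_last, which keeps at least the given number of recent snapshots regardless of age:

CALL spark_catalog.system.expire_snapshots(table => 'data_lake_ods.test_table', older_than => TIMESTAMP '2024-04-08 10:00:00.000', retain_last => 10);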

V. Table operations

1. Query snapshot data as of a given time (Spark 3.3)

SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF '2024-02-29 17:10:31';
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF 499162860;  -- numeric form: seconds since the epoch

2. Query data for a specified snapshot (Spark 3.3)

SELECT * FROM data_lake_ods.ods_bdg_dp_data_source VERSION AS OF 10963874102873;

3. Roll back to a snapshot

CALL hadoop_prod.system.rollback_to_timestamp('mydb.mytest', TIMESTAMP '2021-12-23 16:56:40.000')
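
When the target is identified by snapshot id rather than by time, rollback_to_snapshot does the same job (the id below is a placeholder):

CALL hadoop_prod.system.rollback_to_snapshot('mydb.mytest', 10963874102873)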

VI. Recommended reading

1. NetEase Shufan

https://www.6aiq.com/article/1686471152273
