iceberg文件详解
一、数据内容
t20
├── data
│   ├── 00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet
│   └── 00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet
└── metadata
    ├── 00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json
    ├── 00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
    ├── 00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json
    ├── 0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
    ├── f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
    ├── snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro
    └── snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro
二、文件详解
data是数据,metadata是元数据
建表时会生成metadata/00000-xx.metadata.json
每执行一次insert都会同时生成新的数据文件和元数据文件,其中包括新的00001-xx.metadata.json、00002-xx.metadata.json等
1、数据
xxx.parquet
$ parquet head ~/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet
{"id": 20}
$ parquet head ~/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet
{"id": 10}
2、元数据
(1)xxx.metadata.json
从hive metastore的mysql库的TABLE_PARAMS表中可以查到表的metadata_location,即当前xxx.metadata.json文件的位置。从该文件可以拿到当前表的快照 id(current-snapshot-id),以及这张表的所有快照信息,也就是 JSON 信息里面的 snapshots 数组对应的值
(2)清单列表(相当于snapshot):snap-xxx.avro
清单列表记录了每个快照所包含的一系列清单文件,每行存储了清单文件的路径、该清单文件所含数据文件的分区范围、增加了几个数据文件、删除了几个数据文件等信息。这些信息可以在查询时用于过滤
| manifest_path | manifest_length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partitions | added_rows_count | existing_rows_count | deleted_rows_count |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro | 5514 | 0 | 6190364701448940000 | 1 | 0 | 0 | [] | 1 | 0 | 0 |
| hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro | 5514 | 0 | 6190364701448940000 | 1 | 0 | 0 | [] | 1 | 0 | 0 |
(3)清单:xxx.avro
每行都是每个数据文件的详细描述,包括数据文件的状态、文件路径、分区信息、列级别的统计信息(比如每列的最大最小值、空值数等)、文件的大小以及文件里面数据的行数等信息。其中列级别的统计信息在 Scan 的时候可以为算子下推提供数据,以便可以过滤掉不必要的文件
{
"status": 1,
"snapshot_id": {"long": 6460256963744123000},
"data_file": {
"file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet",
"file_format": "PARQUET",
"partition": {},
"record_count": 1,
"file_size_in_bytes": 387,
"block_size_in_bytes": 67108864,
"column_sizes": {
"array": [{ "key": 1, "value": 51}]
},
"value_counts": {
"array": [{"key": 1,"value": 1}]
},
"null_value_counts": {
"array": [{"key": 1,"value": 0}]
},
"nan_value_counts": {"array": []},
"lower_bounds": {
"array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}]
},
"upper_bounds": {
"array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}]
},
"key_metadata": null,
"split_offsets": {
"array": [4]
}
}
}
以下是完整的metadata目录下的文件内容,有兴趣的可以再深究
metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619022202031, "last-column-id" : 1, "schema" : { "type" : "struct", "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : 6190364701448945732, "snapshots" : [ { "snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619022202031, "summary" : { "operation" : "append", "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "1", "total-data-files" : "1", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro" } ], "snapshot-log" : [ { "timestamp-ms" : 1619022202031, "snapshot-id" : 6190364701448945732 } ], "metadata-log" : [ { "timestamp-ms" : 1619020518215, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json" } ] }
metadata/00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619023435305, "last-column-id" : 1, "schema" : { "type" : "struct", "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : 6460256963744122971, "snapshots" : [ { "snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619022202031, "summary" : { "operation" : "append", "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "1", "total-data-files" : "1", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro" }, { "snapshot-id" : 6460256963744122971, "parent-snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619023435305, "summary" : { "operation" : "append", "flink.job-id" : "3be57424a6547f41f1df350f9667ae65", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "2", "total-data-files" : "2", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro" } ], "snapshot-log" : [ { "timestamp-ms" : 1619022202031, "snapshot-id" : 6190364701448945732 }, { 
"timestamp-ms" : 1619023435305, "snapshot-id" : 6460256963744122971 } ], "metadata-log" : [ { "timestamp-ms" : 1619020518215, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json" }, { "timestamp-ms" : 1619022202031, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json" } ] }
metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619020518215, "last-column-id" : 1, "schema" : { "type" : "struct", "schema-id" : 0, "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "current-schema-id" : 0, "schemas" : [ { "type" : "struct", "schema-id" : 0, "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] } ], "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "last-partition-id" : 999, "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : -1, "snapshots" : [ ], "snapshot-log" : [ ], "metadata-log" : [ ] }
metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
{ "status": 1, "snapshot_id": { "long": 6190364701448946000 }, "data_file": { "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet", "file_format": "PARQUET", "partition": {}, "record_count": 1, "file_size_in_bytes": 387, "block_size_in_bytes": 67108864, "column_sizes": { "array": [ { "key": 1, "value": 51 } ] }, "value_counts": { "array": [ { "key": 1, "value": 1 } ] }, "null_value_counts": { "array": [ { "key": 1, "value": 0 } ] }, "nan_value_counts": { "array": [] }, "lower_bounds": { "array": [ { "key": 1, "value": "\n\u0000\u0000\u0000" } ] }, "upper_bounds": { "array": [ { "key": 1, "value": "\n\u0000\u0000\u0000" } ] }, "key_metadata": null, "split_offsets": { "array": [ 4 ] } } }
metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
{ "status": 1, "snapshot_id": { "long": 6460256963744123000 }, "data_file": { "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet", "file_format": "PARQUET", "partition": {}, "record_count": 1, "file_size_in_bytes": 387, "block_size_in_bytes": 67108864, "column_sizes": { "array": [ { "key": 1, "value": 51 } ] }, "value_counts": { "array": [ { "key": 1, "value": 1 } ] }, "null_value_counts": { "array": [ { "key": 1, "value": 0 } ] }, "nan_value_counts": { "array": [] }, "lower_bounds": { "array": [ { "key": 1, "value": "\u0014\u0000\u0000\u0000" } ] }, "upper_bounds": { "array": [ { "key": 1, "value": "\u0014\u0000\u0000\u0000" } ] }, "key_metadata": null, "split_offsets": { "array": [ 4 ] } } }
metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro
{ "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6190364701448946000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } }
metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro
{ "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6460256963744123000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } } { "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6190364701448946000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } }