Differences between FlinkSQL and SparkSQL
Differences:
- FlinkSQL's INSERT statement can write just a subset of columns, while SparkSQL (at least in the version used here) requires a value for every column:
spark-sql> create table t11 (
> ds BIGINT,
> ts BIGINT,
> pk BIGINT,
> f0 BIGINT,
> f1 BIGINT,
> f2 BIGINT,
> f3 BIGINT,
> f4 BIGINT
> ) using hudi
> partitioned by (ds)
> tblproperties ( -- 'options' can be used here as well (https://hudi.apache.org/docs/table_management)
> type = 'mor',
> primaryKey = 'pk',
> preCombineField = 'ts',
> hoodie.bucket.index.num.buckets = '2',
> hoodie.index.type = 'BUCKET',
> hoodie.compaction.payload.class = 'org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload',
> hoodie.datasource.write.payload.class = 'org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload'
> );
Time taken: 1.382 seconds
spark-sql> insert into t11 (ds,ts,pk,f0) values (20230101,CAST(CURRENT_TIMESTAMP AS BIGINT),1006,1);
Error in query: Cannot write to 'default.t11', not enough data columns:
Table columns: 'ts', 'pk', 'f0', 'f1', 'f2', 'f3', 'f4', 'ds'
Data columns: 'col1', 'col2', 'col3', 'col4'
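With this Spark version, one workaround is to supply every column positionally, passing NULL for the fields you would otherwise omit. The sketch below uses illustrative values against the t11 table above; note that ds comes last, which the next difference explains:
spark-sql> insert into t11 values (CAST(CURRENT_TIMESTAMP AS BIGINT), 1006, 1, NULL, NULL, NULL, NULL, 20230101);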
- SparkSQL moves the partition column to the end of the schema, even when it is declared first at creation time (see the t11 DDL above, where ds is the first column):
spark-sql> show create table t11;
CREATE TABLE default.t11 (
_hoodie_commit_time STRING,
_hoodie_commit_seqno STRING,
_hoodie_record_key STRING,
_hoodie_partition_path STRING,
_hoodie_file_name STRING,
ts BIGINT,
pk BIGINT,
f0 BIGINT,
f1 BIGINT,
f2 BIGINT,
f3 BIGINT,
f4 BIGINT,
ds BIGINT)
USING hudi
PARTITIONED BY (ds)
TBLPROPERTIES (
'hoodie.bucket.index.num.buckets' = '2',
'hoodie.compaction.payload.class' = 'org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload',
'hoodie.datasource.write.payload.class' = 'org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload',
'hoodie.index.type' = 'BUCKET',
'preCombineField' = 'ts',
'primaryKey' = 'pk',
'type' = 'mor')
Time taken: 0.082 seconds, Fetched 1 row(s)
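The reordering also shows up on reads: a SELECT * against t11 returns the _hoodie_* metadata columns first, then ts through f4, and ds as the last column (a sketch, output elided):
spark-sql> select * from t11;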
Flink, by contrast, does not reorder the columns:
Flink SQL> CREATE TABLE t101(
> ds BIGINT,
> pk VARCHAR(20) PRIMARY KEY NOT ENFORCED,
> ut VARCHAR(10),
> f0 BIGINT,
> f1 BIGINT,
> f2 BIGINT
> )
> PARTITIONED BY (`ds`)
> WITH (
> 'connector' = 'hudi',
> 'path' = 'hdfs:///user/root/hudi/t101',
> 'table.type' = 'MERGE_ON_READ' -- this creates a MERGE_ON_READ table; the default is COPY_ON_WRITE
> );
[INFO] Execute statement succeed.
Flink SQL> show create table t101;
CREATE TABLE `default_catalog`.`default_database`.`t101` (
`ds` BIGINT,
`pk` VARCHAR(20) NOT NULL,
`ut` VARCHAR(10),
`f0` BIGINT,
`f1` BIGINT,
`f2` BIGINT,
CONSTRAINT `PK_3610` PRIMARY KEY (`pk`) NOT ENFORCED
) PARTITIONED BY (`ds`)
WITH (
'path' = 'hdfs:///user/root/hudi/t101',
'table.type' = 'MERGE_ON_READ',
'connector' = 'hudi'
)
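To close the loop on the first difference: a partial-column INSERT against t101 is accepted by Flink SQL, with the unspecified columns written as NULL (a sketch with illustrative values; client output omitted):
Flink SQL> insert into t101 (ds, pk, f0) values (20230101, '1006', 1);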