Upserting a subset of a Hudi table's columns fails with "Expected table's schema"

When an INSERT INTO statement against a Hudi table specifies only a subset of the table's columns, it fails at runtime with "Expected table's schema":

java.sql.SQLException: java.util.concurrent.ExecutionException: java.lang.RuntimeException: org.apache.hudi.exception.HoodieException: Expected table's schema:

at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.validate(InsertIntoHoodieTableCommand.scala:177)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.alignQueryOutput(InsertIntoHoodieTableCommand.scala:131)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand$.run(InsertIntoHoodieTableCommand.scala:99)
at org.apache.spark.sql.hudi.command.InsertIntoHoodieTableCommand.run(InsertIntoHoodieTableCommand.scala:60)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)
at org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:231)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3699)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:105)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:172)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:92)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:801)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3697)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:231)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:102)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:801)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:623)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:801)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:616)
at org.apache.livy.thriftserver.session.SqlJob.executeSql(SqlJob.java:93)
at org.apache.livy.thriftserver.session.SqlJob.call(SqlJob.java:73)
at org.apache.livy.thriftserver.session.SqlJob.call(SqlJob.java:40)
at org.apache.livy.rsc.driver.JobWrapper.call(JobWrapper.java:84)
at org.apache.livy.rsc.driver.JobWrapper.call(JobWrapper.java:34)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)

Test SQL (the target table's DDL is shown first for context):

CREATE TABLE t_test_001 (
    ds BIGINT COMMENT 'date stamp',
    ts BIGINT COMMENT 'ts',
    pk BIGINT COMMENT 'pk',
    a0 BIGINT COMMENT 'a0',
    a1 BIGINT COMMENT 'a1',
    a2 STRING COMMENT 'a2',
    a3 BIGINT COMMENT 'a3',
    a4 BIGINT COMMENT 'a4',
    a5 STRING COMMENT 'a5',
    a6 BIGINT COMMENT 'a6',
    a7 BIGINT COMMENT 'a7',
    a8 BIGINT COMMENT 'a8',
    a9 STRING COMMENT 'a9'
) USING hudi
TBLPROPERTIES (
  type = 'mor', -- cow/mor
  primaryKey = 'pk',
  preCombineField = 'ts',
  hoodie.clean.automatic = 'true',
  hoodie.cleaner.policy = 'KEEP_LATEST_BY_HOURS', -- KEEP_LATEST_FILE_VERSIONS/KEEP_LATEST_COMMITS
  hoodie.cleaner.hours.retained = '48',
  hoodie.clean.trigger.strategy = 'NUM_COMMITS',
  hoodie.clean.max.commits = '9',
  hoodie.archive.automatic = 'true',
  hoodie.keep.max.commits = '30',
  hoodie.keep.min.commits = '20'
);

SET hoodie.datasource.write.payload.class=org.apache.hudi.common.model.OverwriteNonDefaultsWithLatestAvroPayload;
INSERT INTO t_test_001 (ds, ts, pk, a0, a1, a2)
SELECT ds, CAST(current_timestamp AS BIGINT) AS ts, pk, f0, f1, f2 FROM t_test_002;
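
For comparison, validation passes when the query supplies a value for every table column. Since the test already sets the payload class to OverwriteNonDefaultsWithLatestAvroPayload, whose combine step keeps the stored value for any incoming field that holds its Avro default (null for nullable columns), one workaround is to pad the untouched columns with typed NULLs. The following is a minimal sketch of that idea, not verified against every Hudi version:

-- Sketch: project all 13 table columns, padding the columns that should
-- keep their current values with typed NULLs. With the payload class set
-- above, a NULL (the Avro default for nullable fields) is expected not to
-- overwrite the stored value when the upsert combines with an existing row.
INSERT INTO t_test_001
SELECT ds,
       CAST(current_timestamp AS BIGINT) AS ts,
       pk,
       f0 AS a0,
       f1 AS a1,
       f2 AS a2,
       CAST(NULL AS BIGINT) AS a3,
       CAST(NULL AS BIGINT) AS a4,
       CAST(NULL AS STRING) AS a5,
       CAST(NULL AS BIGINT) AS a6,
       CAST(NULL AS BIGINT) AS a7,
       CAST(NULL AS BIGINT) AS a8,
       CAST(NULL AS STRING) AS a9
FROM t_test_002;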

Relevant Hudi source code (https://github.com/apache/hudi/blob/master/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala):

  private def validate(queryOutputSchema: StructType, partitionsSpec: Map[String, Option[String]], catalogTable: HoodieCatalogTable): Unit = {
    // Validate that partition-spec has proper format (it could be empty if all of the partition values are dynamic,
    // ie there are no static partition-values specified)
    if (partitionsSpec.nonEmpty && partitionsSpec.size != catalogTable.partitionSchema.size) {
      throw new HoodieException(s"Required partition schema is: ${catalogTable.partitionSchema.fieldNames.mkString("[", ", ", "]")}, " +
        s"partition spec is: ${partitionsSpec.mkString("[", ", ", "]")}")
    }

    val staticPartitionValues = filterStaticPartitionValues(partitionsSpec)
    val fullQueryOutputSchema = StructType(queryOutputSchema.fields ++ staticPartitionValues.keys.map(StructField(_, StringType)))

    // Assert that query provides all the required columns
    if (!conforms(fullQueryOutputSchema, catalogTable.tableSchemaWithoutMetaFields)) {
      throw new HoodieException(s"Expected table's schema: ${catalogTable.tableSchemaWithoutMetaFields.fields.mkString("[", ", ", "]")}, " +
        s"query's output (including static partition values): ${fullQueryOutputSchema.fields.mkString("[", ", ", "]")}")
    }
  }
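
With the test SQL above, queryOutputSchema carries only the six projected columns and there are no static partition values, so fullQueryOutputSchema has 6 fields while catalogTable.tableSchemaWithoutMetaFields has all 13; conforms(...) returns false and validate throws the HoodieException seen in the stack trace. In other words, this INSERT INTO path requires the query to provide every table column, and a partial column list does not relax that check.

If the goal is to update only some columns of existing rows, another option worth trying is Spark SQL MERGE INTO, whose UPDATE clause touches only the listed columns. A sketch, assuming the Hudi version in use supports MERGE INTO with the ON clause on the primary key (pk) and permits updating a column subset:

MERGE INTO t_test_001 AS t
USING (
  SELECT ds, CAST(current_timestamp AS BIGINT) AS ts, pk, f0, f1, f2
  FROM t_test_002
) AS s
ON t.pk = s.pk
-- Only the listed columns are rewritten; a3..a9 keep their stored values.
-- ts is included so the preCombine field advances with the update.
WHEN MATCHED THEN UPDATE SET
  t.ds = s.ds, t.ts = s.ts, t.a0 = s.f0, t.a1 = s.f1, t.a2 = s.f2;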
