dremio CTAS STORE AS && WITH SINGLE WRITER 简单说明

dremio CTAS 支持指定存储格式以及写入的文件数量(相对分区来说)

参考CTAS格式

CREATE TABLE "s3"."91733d30-d1d2-46bf-8f2b-3c34d587a96c" STORE AS (type => 'text', fieldDelimiter => ',', lineDelimiter => '
') WITH SINGLE WRITER AS SELECT * FROM (
SELECT * FROM "ops-action"
) LIMIT 1000000

store as 配置参考处理

  • CreateTableHandler.java
createStorageOptionsMap(sqlCreateTable.getFormatOptions());
if (CatalogUtil.requestedPluginSupportsVersionedTables(path, catalog)) {
  return doVersionedCtas(config, path, catalog, sql, sqlCreateTable);
}
 
public void createStorageOptionsMap(final SqlNodeList args) {
    if (args == null || args.size() == 0) {
      return;
    }
 
    final ImmutableMap.Builder<String, Object> storageOptions = ImmutableMap.builder();
    for (SqlNode operand : args) {
      if (operand.getKind() != SqlKind.ARGUMENT_ASSIGNMENT) {
        throw UserException.unsupportedError()
          .message("Unsupported argument type. Only assignment arguments (param => value) are supported.")
          .build(logger);
      }
      final List<SqlNode> operandList = ((SqlCall) operand).getOperandList();
 
      final String name = ((SqlIdentifier) operandList.get(1)).getSimple();
      SqlNode literal = operandList.get(0);
      if (!(literal instanceof SqlLiteral)) {
        throw UserException.unsupportedError()
          .message("Only literals are accepted for storage option values")
          .build(logger);
      }
 
      Object value = ((SqlLiteral)literal).getValue();
      if (value instanceof NlsString) {
        value = ((NlsString)value).getValue();
      }
      storageOptions.put(name, value);
    }
   // 存储为一个map
    this.storageOptionsMap = storageOptions.build();
  }

参数使用
DataAdditionCmdHandler.java

 
 内部实际会调用到对应的存储扩展
tableEntry = datasetCatalog.createNewTable(
  key,
  icebergTableProps,
  options,
  storageOptions);

s3 存储扩展的处理

  public CreateTableEntry createNewTable(
    NamespaceKey tableSchemaPath, SchemaConfig config,
    IcebergTableProps icebergTableProps,
    WriterOptions writerOptions,
    Map<String, Object> storageOptions,
    boolean isResultsTable
  ) {
    Preconditions.checkArgument(tableSchemaPath.size() >= 2, "key must be at least two parts");
    final List<String> resolvedPath = resolveTableNameToValidPath(tableSchemaPath.getPathComponents()); // strips source name
    final String containerName = resolvedPath.get(0);
    if (resolvedPath.size() == 1) {
      throw UserException.validationError()
        .message("Creating buckets is not supported (name: %s)", containerName)
        .build(logger);
    }
   // 调用父类文件系统插件的createNewTable
    final CreateTableEntry entry = super.createNewTable(tableSchemaPath, config,
      icebergTableProps, writerOptions, storageOptions, isResultsTable);
 
    final S3FileSystem fs = getSystemUserFS().unwrap(S3FileSystem.class);
 
    if (!fs.containerExists(containerName)) {
      throw UserException.validationError()
          .message("Cannot create the table because '%s' bucket does not exist", containerName)
          .build(logger);
    }
 
    return entry;
  }

文件系统插件的createNewTable的处理(此处是使用到定义的选项)

  /**
   * File-system plugin implementation of table creation (the excerpt shown here stops after
   * the format plugin has been selected).
   *
   * <p>Visible steps: reject the request when the schema is not mutable for this user, derive
   * the table name, then choose the format plugin either from the session's output-format
   * option or from the supplied storage options.
   */
  public CreateTableEntry createNewTable(NamespaceKey tableSchemaPath, SchemaConfig config, IcebergTableProps icebergTableProps,
                                         WriterOptions writerOptions, Map<String, Object> storageOptions,
                                         boolean isResultsTable) {
    // Fail early when TABLE mutations are not permitted for this (system or regular) user.
    if(!getMutability().hasMutationCapability(MutationType.TABLE, config.isSystemUser())) {
      throw UserException.parseError()
        .message("Unable to create table. Schema [%s] is immutable for this user.", tableSchemaPath.getParent())
        .build(logger);
    }
 
    final String tableName = getTableName(tableSchemaPath);
 
    final FormatPlugin formatPlugin;
    // Default path: no "type" storage option was given, so the format name comes from the
    // OUTPUT_FORMAT_VALIDATOR option (the original note says this is parquet — TODO confirm).
    if (storageOptions == null || storageOptions.isEmpty() || !storageOptions.containsKey("type")) {
      final String storage = config.getOptions().getOption(ExecConstants.OUTPUT_FORMAT_VALIDATOR);
      formatPlugin = getFormatPlugin(storage);
      if (formatPlugin == null) {
        throw new UnsupportedOperationException(String.format("Unsupported format '%s' in '%s'", storage, tableSchemaPath));
      }
    } else {
     // Explicit path: build a format config from the supplied storage options and look up its plugin.
      final FormatPluginConfig formatConfig = createConfigForTable(tableName, storageOptions);
      formatPlugin = getFormatPlugin(formatConfig);
    }

后续执行计划处理(逻辑计划)

  if (!isCreate()) {
      BatchSchema partSchemaWithSelectedFields = tableSchemaFromKVStore.subset(fieldNames).orElse(tableSchemaFromKVStore);
      queryRowType = CalciteArrowHelper.wrap(partSchemaWithSelectedFields)
          .toCalciteRecordType(convertedRelNode.getCluster().getTypeFactory(), PrelUtil.getPlannerSettings(convertedRelNode.getCluster()).isFullNestedSchemaSupport());
      logger.debug("Inserting into table with schema : '{}' ", tableSchemaFromKVStore.toString());
    }
 
    // DX-54255: Don't add cast projection, if inserting values from another table
    if (RelOptUtil.findTables(convertedRelNode).isEmpty() && !(sqlCmd instanceof SqlCopyIntoTable)) {
      convertedRelNode = addCastProject(convertedRelNode, queryRowType);
    }
 
    // skip writer and display DML results on UI only
    if (!config.getContext().getOptions().getOption(ExecConstants.ENABLE_DML_DISPLAY_RESULT_ONLY) || !(sqlCmd instanceof SqlCopyIntoTable)) {
      convertedRelNode = new WriterRel(convertedRelNode.getCluster(),
        convertedRelNode.getCluster().traitSet().plus(Rel.LOGICAL),
        convertedRelNode, tableEntry, queryRowType);
    }
 
    convertedRelNode = SqlHandlerUtil.storeQueryResultsIfNeeded(config.getConverter().getParserConfig(),
      config.getContext(), convertedRelNode);
 
    return new ScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);

后面就是物理计划的处理了

WITH SINGLE WRITER

这个实际上就是一个sql 字面量,转化为一个true 或者false 的标记,之后对于不同的writer 作为参数传递,对于实际执行会使用
dremio 的不同Writer 实现(比如parquet的)

  • parquet writer 的operator 处理
 
/**
 * Operator creator for {@link ParquetWriter}: builds a {@link ParquetRecordWriter} and wraps
 * it in a {@link WriterOperator} configured with the writer's options.
 */
public class ParquetWriterBatchCreator implements SingleInputOperator.Creator<ParquetWriter> {

  /**
   * Creates the writer operator for the given Parquet writer configuration.
   *
   * @param context operator context handed to both the record writer and the operator
   * @param config Parquet writer configuration; its options are passed to the operator
   * @return a {@link WriterOperator} backed by a new {@link ParquetRecordWriter}
   * @throws ExecutionSetupException declared by the creator contract
   */
  @Override
  public SingleInputOperator create(OperatorContext context, ParquetWriter config) throws ExecutionSetupException {
    final ParquetFormatConfig formatConfig = new ParquetFormatConfig();
    final ParquetRecordWriter recordWriter = new ParquetRecordWriter(context, config, formatConfig);
    return new WriterOperator(context, config.getOptions(), recordWriter);
  }
}

WriterOperator 处理

/**
 * Prepares the writer operator: sets up the record writer (optionally through a partition
 * write manager), registers the output vectors, builds the output schema and moves the
 * operator to {@code CAN_CONSUME}.
 *
 * @param incoming the incoming batch the record writer will read from
 * @return the {@code output} container holding the writer's result vectors
 * @throws Exception if record-writer setup fails
 */
public VectorAccessible setup(VectorAccessible incoming) throws Exception {
    // NOTE(review): presumably asserts the operator is still in NEEDS_SETUP — confirm.
    state.is(State.NEEDS_SETUP);
     // Branch on the writer options: does this write use partitions or distributions?
    if(options.hasPartitions() || options.hasDistributions()){
      partitionManager = new PartitionWriteManager(options, incoming, options.getTableFormatOptions().isTableFormatWriter());
      // The record writer is given the partition manager's masked container instead of the raw input.
      this.maskedContainer = partitionManager.getMaskedContainer();
      recordWriter.setup(maskedContainer, listener, statsListener);
    } else {
      // No partitions or distributions: the record writer reads the incoming batch directly
      // (the original note calls this the single-file case).
      recordWriter.setup(incoming, listener, statsListener);
    }
    // Create the RecordWriter.SCHEMA vectors.
    fragmentIdVector = output.addOrGet(RecordWriter.FRAGMENT);
    pathVector = output.addOrGet(RecordWriter.PATH);
    summaryVector = output.addOrGet(RecordWriter.RECORDS);
    fileSizeVector = output.addOrGet(RecordWriter.FILESIZE);
    metadataVector = output.addOrGet(RecordWriter.METADATA);
    partitionNumberVector = output.addOrGet(RecordWriter.PARTITION);
    icebergMetadataVector = output.addOrGet(RecordWriter.ICEBERG_METADATA);
    schemaVector = output.addOrGet(RecordWriter.FILE_SCHEMA);
    partitionDataVector  = output.addOrGet(RecordWriter.PARTITION_DATA);
    operationTypeVector = output.addOrGet(RecordWriter.OPERATION_TYPE);
    partitionValueVector = output.addOrGet(RecordWriter.PARTITION_VALUE);
    rejectedRecordVector = output.addOrGet(RecordWriter.REJECTED_RECORDS);
    // Finalize the output schema and size the output for the target batch size.
    output.buildSchema();
    output.setInitialCapacity(context.getTargetBatchSize());
    state = State.CAN_CONSUME;
    return output;
  }

说明

以上是对于dremio CTAS 几个配置参数的一个简单说明,实际上用好这些配置可以简化一些处理,比如dremio 的下载功能,就是结合了上边说的东西

参考资料

sabot/grammar/src/main/codegen/includes/parserImpls.ftl
sabot/kernel/src/main/java/com/dremio/exec/planner/sql/handlers/query/CreateTableHandler.java
sabot/kernel/src/main/java/com/dremio/exec/planner/sql/handlers/query/DataAdditionCmdHandler.java
sabot/kernel/src/main/java/com/dremio/exec/store/dfs/FileSystemPlugin.java
plugins/s3/src/main/java/com/dremio/plugins/s3/store/S3StoragePlugin.java
sabot/kernel/src/main/java/com/dremio/exec/store/dfs/FormatPluginOptionExtractor.java
common/legacy/src/main/java/com/dremio/common/logical/FormatPluginConfig.java
sabot/kernel/src/main/java/com/dremio/sabot/op/writer/WriterOperator.java
sabot/kernel/src/main/java/com/dremio/exec/store/parquet/ParquetWriterBatchCreator.java
https://docs.dremio.com/current/reference/sql/commands/tables#create-table-as
https://www.cnblogs.com/rongfengliang/p/15954623.html
https://www.cnblogs.com/rongfengliang/p/17039839.html
https://www.cnblogs.com/rongfengliang/p/18008209

posted on 2024-02-09 08:01  荣锋亮  阅读(3)  评论(0)  编辑  收藏  举报

导航