dbt snapshot 处理简单说明

dbt 的snapshot 实际上也是一种物化处理,支持与test,docs,稍有不同就是dbt 没定义独立的block 扩展,以下是一个简单说明
dbt 目前默认的snapshot是基于了scd2 模型

使用

包含了配置以及snapshot 定义,配置支持dbt_project 项目级以及独立snapshot 定义,对于snapshot 是需要指定策略的

  • 参考配置
{% snapshot orders_snapshot %}
    {{
        config(
          unique_key='id',
          strategy='timestamp',
          updated_at='updated_at'
        )
    }}
    -- Pro-Tip: Use sources in snapshots!
    select * from {{ source('jaffle_shop', 'orders') }}
{% endsnapshot %}

处理机制

dbt snapshot 实际上也是物化,所以dbt 处理的时候只是进行了snapshot block 的parse,实际编译的元数据也是materialization

  • 参考解析

ManifestLoader 的load方法中调用的解析

parser_types: List[Type[Parser]] = [
                ModelParser,
                SnapshotParser,
                AnalysisParser,
                SingularTestParser,
                SeedParser,
                DocumentationParser,
                HookParser,
            ]
            for project in self.all_projects.values():
                if project.project_name not in project_parser_files:
                    continue
                self.parse_project(
                    project, project_parser_files[project.project_name], parser_types
                )

SnapshotParser 解析处理

class SnapshotParser(SQLParser[IntermediateSnapshotNode, SnapshotNode]):
    def parse_from_dict(self, dct, validate=True) -> IntermediateSnapshotNode:
        if validate:
            IntermediateSnapshotNode.validate(dct)
        return IntermediateSnapshotNode.from_dict(dct)
 
    @property
    def resource_type(self) -> NodeType:
        return NodeType.Snapshot
 
    @classmethod
    def get_compiled_path(cls, block: FileBlock):
        return block.path.relative_path
 
    def set_snapshot_attributes(self, node):
        # use the target_database setting if we got it, otherwise the
        # `database` value of the node (ultimately sourced from the `database`
        # config value), and if that is not set, use the database defined in
        # the adapter's credentials.
        if node.config.target_database:
            node.database = node.config.target_database
        elif not node.database:
            node.database = self.root_project.credentials.database
 
        # the target schema must be set if we got here, so overwrite the node's
        # schema
        node.schema = node.config.target_schema
        # We need to set relation_name again, since database/schema might have changed
        self._update_node_relation_name(node)
 
        return node
 
    def get_fqn(self, path: str, name: str) -> List[str]:
        """Get the FQN for the node. This impacts node selection and config
        application.
 
        On snapshots, the fqn includes the filename.
        """
        no_ext = os.path.splitext(path)[0]
        fqn = [self.project.project_name]
        fqn.extend(split_path(no_ext))
        fqn.append(name)
        return fqn
 
    def transform(self, node: IntermediateSnapshotNode) -> SnapshotNode:
        try:
            # The config_call_dict is not serialized, because normally
            # it is not needed after parsing. But since the snapshot node
            # does this extra to_dict, save and restore it, to keep
            # the model config when there is also schema config.
            config_call_dict = node.config_call_dict
            dct = node.to_dict(omit_none=True)
            parsed_node = SnapshotNode.from_dict(dct)
            parsed_node.config_call_dict = config_call_dict
            self.set_snapshot_attributes(parsed_node)
            return parsed_node
        except ValidationError as exc:
            raise SnapshopConfigError(exc, node)
 
    def parse_file(self, file_block: FileBlock) -> None:
       # 此处文件处理只支持snapshot 的,BlockSearcher 处理部分会使用自己的一套blocktag 解析处理
        blocks = BlockSearcher(
            source=[file_block],
            allowed_blocks={"snapshot"},
            source_tag_factory=BlockContents,
        )
        for block in blocks:
            self.parse_node(block)
  • 生成格式参考

如下图

 

  • 执行

通过了解我们知道也是物化,那么就会使用物化相关定义处理
参考处理snapshot.sql ,里边东西还是比较多的, 同时也调用了不少其他macro 定义,详细的后边解决物化完整介绍下,默认的实现是基于了merge into

{% materialization snapshot, default %}
  {%- set config = model['config'] -%}
 
  {%- set target_table = model.get('alias', model.get('name')) -%}
 
  {%- set strategy_name = config.get('strategy') -%}
  {%- set unique_key = config.get('unique_key') %}
  -- grab current tables grants config for comparision later on
  {%- set grant_config = config.get('grants') -%}
 
  {% set target_relation_exists, target_relation = get_or_create_relation(
          database=model.database,
          schema=model.schema,
          identifier=target_table,
          type='table') -%}
 
  {%- if not target_relation.is_table -%}
    {% do exceptions.relation_wrong_type(target_relation, 'table') %}
  {%- endif -%}
 
 
  {{ run_hooks(pre_hooks, inside_transaction=False) }}
 
  {{ run_hooks(pre_hooks, inside_transaction=True) }}
 
  {% set strategy_macro = strategy_dispatch(strategy_name) %}
  {% set strategy = strategy_macro(model, "snapshotted_data", "source_data", config, target_relation_exists) %}
 
  {% if not target_relation_exists %}
 
      {% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %}
      {% set final_sql = create_table_as(False, target_relation, build_sql) %}
 
  {% else %}
 
      {{ adapter.valid_snapshot_target(target_relation) }}
 
      {% set staging_table = build_snapshot_staging_table(strategy, sql, target_relation) %}
 
      -- this may no-op if the database does not require column expansion
      {% do adapter.expand_target_column_types(from_relation=staging_table,
                                               to_relation=target_relation) %}
 
      {% set missing_columns = adapter.get_missing_columns(staging_table, target_relation)
                                   | rejectattr('name', 'equalto', 'dbt_change_type')
                                   | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')
                                   | rejectattr('name', 'equalto', 'dbt_unique_key')
                                   | rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')
                                   | list %}
 
      {% do create_columns(target_relation, missing_columns) %}
 
      {% set source_columns = adapter.get_columns_in_relation(staging_table)
                                   | rejectattr('name', 'equalto', 'dbt_change_type')
                                   | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')
                                   | rejectattr('name', 'equalto', 'dbt_unique_key')
                                   | rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')
                                   | list %}
 
      {% set quoted_source_columns = [] %}
      {% for column in source_columns %}
        {% do quoted_source_columns.append(adapter.quote(column.name)) %}
      {% endfor %}
 
      {% set final_sql = snapshot_merge_sql(
            target = target_relation,
            source = staging_table,
            insert_cols = quoted_source_columns
         )
      %}
 
  {% endif %}
 
  {% call statement('main') %}
      {{ final_sql }}
  {% endcall %}
 
  {% set should_revoke = should_revoke(target_relation_exists, full_refresh_mode=False) %}
  {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}
 
  {% do persist_docs(target_relation, model) %}
 
  {% if not target_relation_exists %}
    {% do create_indexes(target_relation) %}
  {% endif %}
 
  {{ run_hooks(post_hooks, inside_transaction=True) }}
 
  {{ adapter.commit() }}
 
  {% if staging_table is defined %}
      {% do post_snapshot(staging_table) %}
  {% endif %}
 
  {{ run_hooks(post_hooks, inside_transaction=False) }}
 
  {{ return({'relations': [target_relation]}) }}
 
{% endmaterialization %}

说明

以上只是一个简单说明,dbt 不少功能都是基于物化block 处理的,后边说明下物化的处理

参考资料

core/dbt/parser/snapshots.py
core/dbt/parser/manifest.py
core/dbt/parser/search.py
core/dbt/clients/_jinja_blocks.py
core/dbt/contracts/graph/nodes.py
core/dbt/include/global_project/macros/materializations/snapshots/snapshot.sql
core/dbt/include/global_project/macros/materializations/snapshots/strategies.sql
https://docs.getdbt.com/reference/snapshot-properties

posted on 2024-05-11 08:00  荣锋亮  阅读(33)  评论(0编辑  收藏  举报

导航