dbt snapshot 处理简单说明
dbt 的snapshot 实际上也是一种物化处理,支持与test,docs,稍有不同就是dbt 没定义独立的block 扩展,以下是一个简单说明
dbt 目前默认的snapshot是基于了scd2 模型
使用
包含了配置以及snapshot 定义,配置支持dbt_project 项目级以及独立snapshot 定义,对于snapshot 是需要指定策略的
- 参考配置
{% snapshot orders_snapshot %}
{{
config(
unique_key='id',
strategy='timestamp',
updated_at='updated_at'
)
}}
-- Pro-Tip: Use sources in snapshots!
select * from {{ source('jaffle_shop', 'orders') }}
{% endsnapshot %}
处理机制
dbt snapshot 实际上也是物化,所以dbt 处理的时候只是进行了snapshot block 的parse,实际编译的元数据也是materialization
- 参考解析
ManifestLoader 的load方法中调用的解析
parser_types: List[Type[Parser]] = [
ModelParser,
SnapshotParser,
AnalysisParser,
SingularTestParser,
SeedParser,
DocumentationParser,
HookParser,
]
for project in self.all_projects.values():
if project.project_name not in project_parser_files:
continue
self.parse_project(
project, project_parser_files[project.project_name], parser_types
)
SnapshotParser 解析处理
class SnapshotParser(SQLParser[IntermediateSnapshotNode, SnapshotNode]):
def parse_from_dict(self, dct, validate=True) -> IntermediateSnapshotNode:
if validate:
IntermediateSnapshotNode.validate(dct)
return IntermediateSnapshotNode.from_dict(dct)
@property
def resource_type(self) -> NodeType:
return NodeType.Snapshot
@classmethod
def get_compiled_path(cls, block: FileBlock):
return block.path.relative_path
def set_snapshot_attributes(self, node):
# use the target_database setting if we got it, otherwise the
# `database` value of the node (ultimately sourced from the `database`
# config value), and if that is not set, use the database defined in
# the adapter's credentials.
if node.config.target_database:
node.database = node.config.target_database
elif not node.database:
node.database = self.root_project.credentials.database
# the target schema must be set if we got here, so overwrite the node's
# schema
node.schema = node.config.target_schema
# We need to set relation_name again, since database/schema might have changed
self._update_node_relation_name(node)
return node
def get_fqn(self, path: str, name: str) -> List[str]:
"""Get the FQN for the node. This impacts node selection and config
application.
On snapshots, the fqn includes the filename.
"""
no_ext = os.path.splitext(path)[0]
fqn = [self.project.project_name]
fqn.extend(split_path(no_ext))
fqn.append(name)
return fqn
def transform(self, node: IntermediateSnapshotNode) -> SnapshotNode:
try:
# The config_call_dict is not serialized, because normally
# it is not needed after parsing. But since the snapshot node
# does this extra to_dict, save and restore it, to keep
# the model config when there is also schema config.
config_call_dict = node.config_call_dict
dct = node.to_dict(omit_none=True)
parsed_node = SnapshotNode.from_dict(dct)
parsed_node.config_call_dict = config_call_dict
self.set_snapshot_attributes(parsed_node)
return parsed_node
except ValidationError as exc:
raise SnapshopConfigError(exc, node)
def parse_file(self, file_block: FileBlock) -> None:
# 此处文件处理只支持snapshot 的,BlockSearcher 处理部分会使用自己的一套blocktag 解析处理
blocks = BlockSearcher(
source=[file_block],
allowed_blocks={"snapshot"},
source_tag_factory=BlockContents,
)
for block in blocks:
self.parse_node(block)
- 生成格式参考
如下图
- 执行
通过了解我们知道也是物化,那么就会使用物化相关定义处理
参考处理snapshot.sql ,里边东西还是比较多的, 同时也调用了不少其他macro 定义,详细的后边解决物化完整介绍下,默认的实现是基于了merge into
{% materialization snapshot, default %}
{%- set config = model['config'] -%}
{%- set target_table = model.get('alias', model.get('name')) -%}
{%- set strategy_name = config.get('strategy') -%}
{%- set unique_key = config.get('unique_key') %}
-- grab current tables grants config for comparision later on
{%- set grant_config = config.get('grants') -%}
{% set target_relation_exists, target_relation = get_or_create_relation(
database=model.database,
schema=model.schema,
identifier=target_table,
type='table') -%}
{%- if not target_relation.is_table -%}
{% do exceptions.relation_wrong_type(target_relation, 'table') %}
{%- endif -%}
{{ run_hooks(pre_hooks, inside_transaction=False) }}
{{ run_hooks(pre_hooks, inside_transaction=True) }}
{% set strategy_macro = strategy_dispatch(strategy_name) %}
{% set strategy = strategy_macro(model, "snapshotted_data", "source_data", config, target_relation_exists) %}
{% if not target_relation_exists %}
{% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %}
{% set final_sql = create_table_as(False, target_relation, build_sql) %}
{% else %}
{{ adapter.valid_snapshot_target(target_relation) }}
{% set staging_table = build_snapshot_staging_table(strategy, sql, target_relation) %}
-- this may no-op if the database does not require column expansion
{% do adapter.expand_target_column_types(from_relation=staging_table,
to_relation=target_relation) %}
{% set missing_columns = adapter.get_missing_columns(staging_table, target_relation)
| rejectattr('name', 'equalto', 'dbt_change_type')
| rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')
| rejectattr('name', 'equalto', 'dbt_unique_key')
| rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')
| list %}
{% do create_columns(target_relation, missing_columns) %}
{% set source_columns = adapter.get_columns_in_relation(staging_table)
| rejectattr('name', 'equalto', 'dbt_change_type')
| rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')
| rejectattr('name', 'equalto', 'dbt_unique_key')
| rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')
| list %}
{% set quoted_source_columns = [] %}
{% for column in source_columns %}
{% do quoted_source_columns.append(adapter.quote(column.name)) %}
{% endfor %}
{% set final_sql = snapshot_merge_sql(
target = target_relation,
source = staging_table,
insert_cols = quoted_source_columns
)
%}
{% endif %}
{% call statement('main') %}
{{ final_sql }}
{% endcall %}
{% set should_revoke = should_revoke(target_relation_exists, full_refresh_mode=False) %}
{% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}
{% do persist_docs(target_relation, model) %}
{% if not target_relation_exists %}
{% do create_indexes(target_relation) %}
{% endif %}
{{ run_hooks(post_hooks, inside_transaction=True) }}
{{ adapter.commit() }}
{% if staging_table is defined %}
{% do post_snapshot(staging_table) %}
{% endif %}
{{ run_hooks(post_hooks, inside_transaction=False) }}
{{ return({'relations': [target_relation]}) }}
{% endmaterialization %}
说明
以上只是一个简单说明,dbt 不少功能都是基于物化block 处理的,后边说明下物化的处理
参考资料
core/dbt/parser/snapshots.py
core/dbt/parser/manifest.py
core/dbt/parser/search.py
core/dbt/clients/_jinja_blocks.py
core/dbt/contracts/graph/nodes.py
core/dbt/include/global_project/macros/materializations/snapshots/snapshot.sql
core/dbt/include/global_project/macros/materializations/snapshots/strategies.sql
https://docs.getdbt.com/reference/snapshot-properties