A brief introduction to the dbt_external_tables package
dbt_external_tables adds the ability to stage external tables as dbt sources, and it mainly targets data warehouses that support external tables.
Since dbt already parses the external property on source definitions, the core of the dbt-external-tables package is processing the sources that carry this property:
creating the table and schema, and managing refreshes. For the full internals, read the actual source code; what follows is only a brief walkthrough.
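The package is installed like any other dbt package: add it to packages.yml and run dbt deps. The version range below is only an illustrative pin; check dbt Hub for the release that matches your dbt-core version.

packages:
  - package: dbt-labs/dbt_external_tables
    # illustrative pin only; pick the range that matches your dbt-core version
    version: [">=0.8.0", "<0.9.0"]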
Usage
- Configure the source
version: 2

sources:
  - name: snowplow
    tables:
      - name: event
        description: >
          This source table is actually a set of files in external storage.
          The dbt-external-tables package provides handy macros for getting
          those files queryable, just in time for modeling.
        external:
          location: # required: S3 file path, GCS file path, Snowflake stage, Synapse data source
          partitions: # optional
            - name: collector_date
              data_type: date

        # Specify ALL column names + datatypes.
        # Column order must match for CSVs, column names must match for other formats.
        # Some databases support schema inference.
        columns:
          - name: app_id
            data_type: varchar(255)
            description: "Application ID"
          - name: platform
            data_type: varchar(255)
            description: "Platform"
- Run the staging operation
Invoke the stage_external_sources macro via run-operation:
dbt run-operation stage_external_sources
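The macro also accepts a select argument (a space-separated list of source names or source.table pairs, matching the selection logic in the macro below), and the package README describes an ext_full_refresh var that forces tables to be re-created instead of refreshed. For example:

# stage only the snowplow source, or a single table within it
dbt run-operation stage_external_sources --args "select: snowplow"
dbt run-operation stage_external_sources --args "select: snowplow.event"

# drop and re-create external tables instead of refreshing them
dbt run-operation stage_external_sources --vars "ext_full_refresh: true"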
Implementation of the stage_external_sources macro
- Reference code
The macro walks graph.sources, keeps only the sources that define an external property, and then handles table and schema creation plus refreshes for each of them.
{% macro stage_external_sources(select=none) %}

    {% set sources_to_stage = [] %}
    {% set source_nodes = graph.sources.values() if graph.sources else [] %}

    {% for node in source_nodes %}
        {% if node.external %}
            {% if select %}
                {% for src in select.split(' ') %}
                    {% if '.' in src %}
                        {% set src_s = src.split('.') %}
                        {% if src_s[0] == node.source_name and src_s[1] == node.name %}
                            {% do sources_to_stage.append(node) %}
                        {% endif %}
                    {% else %}
                        {% if src == node.source_name %}
                            {% do sources_to_stage.append(node) %}
                        {% endif %}
                    {% endif %}
                {% endfor %}
            {% else %}
                {% do sources_to_stage.append(node) %}
            {% endif %}
        {% endif %}
    {% endfor %}

    {% if sources_to_stage|length == 0 %}
        {% do log('No external sources selected', info = true) %}
    {% endif %}

    {% for node in sources_to_stage %}

        {% set loop_label = loop.index ~ ' of ' ~ loop.length %}
        {% do log(loop_label ~ ' START external source ' ~ node.schema ~ '.' ~ node.identifier, info = true) -%}

        {# get_external_build_plan is dispatched per adapter, because each warehouse's build flow differs #}
        {% set run_queue = dbt_external_tables.get_external_build_plan(node) %}

        {% do log(loop_label ~ ' SKIP', info = true) if run_queue == [] %}
        {% set width = flags.PRINTER_WIDTH %}

        {% for q in run_queue %}
            {% set q_msg = q|replace('\n','')|replace('begin;','')|trim %}
            {% set q_log = q_msg[:width] ~ '... ' if q_msg|length > width else q_msg %}
            {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ q_log, info = true) %}

            {% set exit_txn = dbt_external_tables.exit_transaction() %}
            {% call statement('runner', fetch_result = True, auto_begin = False) %}
                {{ exit_txn }} {{ q }}
            {% endcall %}

            {% set runner = load_result('runner') %}
            {% set log_msg = runner['response'] if 'response' in runner.keys() else runner['status'] %}
            {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ log_msg, info = true) %}
        {% endfor %}

        {% set update_columns = dbt_external_tables.update_external_table_columns(node) %}
        {{ update_columns }}

    {% endfor %}

{% endmacro %}
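As the comment above notes, get_external_build_plan is dispatched per adapter because each warehouse builds and refreshes external tables differently. Here is a simplified sketch of the shape a plugin implementation takes (loosely modeled on the Snowflake plugin, with snowpipe handling and other details omitted): if the relation does not exist yet, or ext_full_refresh is set, it queues schema and table creation; otherwise it queues a metadata refresh.

{# simplified sketch, not the exact package code #}
{% macro snowflake__get_external_build_plan(source_node) %}

    {% set build_plan = [] %}

    {# does the external table already exist in the warehouse? #}
    {% set old_relation = adapter.get_relation(
        database = source_node.database,
        schema = source_node.schema,
        identifier = source_node.identifier
    ) %}

    {% set create_or_replace = (old_relation is none or var('ext_full_refresh', false)) %}

    {% if create_or_replace %}
        {# first run (or forced full refresh): create schema + external table #}
        {% set build_plan = build_plan + [
            dbt_external_tables.create_external_schema(source_node),
            dbt_external_tables.create_external_table(source_node)
        ] %}
    {% else %}
        {# table already exists: just refresh its metadata #}
        {% set build_plan = build_plan + dbt_external_tables.refresh_external_table(source_node) %}
    {% endif %}

    {% do return(build_plan) %}

{% endmacro %}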
What the package's code layout does well
dbt_external_tables keeps the implementation for each data warehouse in its own folder and ties them together with dbt's dispatch mechanism; the layering is well worth borrowing.
- Code structure
You can see a common layer plus an independent implementation per warehouse:
macros
├── common
│   ├── create_external_schema.sql
│   ├── create_external_table.sql
│   ├── get_external_build_plan.sql
│   ├── helpers
│   │   ├── dropif.sql
│   │   └── transaction.sql
│   ├── refresh_external_table.sql
│   ├── stage_external_sources.sql
│   └── update_external_table_columns.sql
└── plugins
    ├── bigquery
    │   ├── create_external_schema.sql
    │   ├── create_external_table.sql
    │   ├── get_external_build_plan.sql
    │   └── update_external_table_columns.sql
    ├── redshift
    │   ├── create_external_table.sql
    │   ├── get_external_build_plan.sql
    │   ├── helpers
    │   │   ├── add_partitions.sql
    │   │   ├── dropif.sql
    │   │   ├── is_ext_tbl.sql
    │   │   ├── paths.sql
    │   │   ├── render_macro.sql
    │   │   └── transaction.sql
    │   └── refresh_external_table.sql
    ├── snowflake
    │   ├── create_external_schema.sql
    │   ├── create_external_table.sql
    │   ├── get_external_build_plan.sql
    │   ├── helpers
    │   │   └── is_csv.sql
    │   ├── refresh_external_table.sql
    │   └── snowpipe
    │       ├── create_empty_table.sql
    │       ├── create_snowpipe.sql
    │       ├── get_copy_sql.sql
    │       └── refresh_snowpipe.sql
    ├── spark
    │   ├── create_external_table.sql
    │   ├── get_external_build_plan.sql
    │   ├── helpers
    │   │   ├── dropif.sql
    │   │   └── recover_partitions.sql
    │   └── refresh_external_table.sql
    └── sqlserver
        ├── create_external_schema.sql
        ├── create_external_table.sql
        ├── get_external_build_plan.sql
        └── helpers
            └── dropif.sql
- The common layer
Take the create_external_table definition as one example: the entry macro dispatches to the adapter-specific implementation, and the default raises a compiler error:
{% macro create_external_table(source_node) %}
    {{ adapter.dispatch('create_external_table', 'dbt_external_tables')(source_node) }}
{% endmacro %}

{% macro default__create_external_table(source_node) %}
    {{ exceptions.raise_compiler_error("External table creation is not implemented for the default adapter") }}
{% endmacro %}
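Since every entry point dispatches into the dbt_external_tables namespace, a project can also shadow any of these macros with its own implementation by configuring the dispatch search order in dbt_project.yml (my_project below is a placeholder for your project name):

dispatch:
  - macro_namespace: dbt_external_tables
    search_order: ['my_project', 'dbt_external_tables']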
The BigQuery implementation:
{% macro bigquery__create_external_table(source_node) %}

    {%- set columns = source_node.columns.values() -%}
    {%- set external = source_node.external -%}
    {%- set partitions = external.partitions -%}
    {%- set options = external.options -%}
    {%- set non_string_options = ['max_staleness'] %}

    {% if options is mapping and options.get('connection_name', none) %}
        {% set connection_name = options.pop('connection_name') %}
    {% endif %}

    {%- set uris = [] -%}
    {%- if options is mapping and options.get('uris', none) -%}
        {%- set uris = external.options.get('uris') -%}
    {%- else -%}
        {%- set uris = [external.location] -%}
    {%- endif -%}

    create or replace external table {{source(source_node.source_name, source_node.name)}}
    {%- if columns -%}(
        {% for column in columns %}
            {%- set column_quoted = adapter.quote(column.name) if column.quote else column.name %}
            {{column_quoted}} {{column.data_type}} {{- ',' if not loop.last -}}
        {%- endfor -%}
    )
    {% endif %}
    {% if options and options.get('hive_partition_uri_prefix', none) %}
        with partition columns {%- if partitions %} (
            {%- for partition in partitions %}
                {{partition.name}} {{partition.data_type}}{{',' if not loop.last}}
            {%- endfor -%}
        ) {% endif -%}
    {% endif %}
    {% if connection_name %}
        with connection `{{ connection_name }}`
    {% endif %}
    options (
        uris = [{%- for uri in uris -%} '{{uri}}' {{- "," if not loop.last}} {%- endfor -%}]
        {%- if options is mapping -%}
            {%- for key, value in options.items() if key != 'uris' %}
                {%- if value is string and key not in non_string_options -%}
                    , {{key}} = '{{value}}'
                {%- else -%}
                    , {{key}} = {{value}}
                {%- endif -%}
            {%- endfor -%}
        {%- endif -%}
    )
{% endmacro %}
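For context, a hypothetical BigQuery source that would exercise this macro could look like the following; the source name, table name, and GCS paths are made up, and the options map directly onto the options (...) clause rendered above:

version: 2

sources:
  - name: events            # hypothetical source name
    tables:
      - name: clickstream   # hypothetical table name
        external:
          location: 'gs://my-bucket/clickstream/*'   # hypothetical GCS wildcard path
          options:
            format: PARQUET
            hive_partition_uri_prefix: 'gs://my-bucket/clickstream'
          partitions:
            - name: dt
              data_type: date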
Notes
This has been a quick overview; the package's code layout is well worth studying.
References
https://github.com/dbt-labs/dbt-external-tables
https://docs.getdbt.com/reference/resource-properties/external