dbt dbt_external_tables 包简单说明

dbt dbt_external_tables 提供了在source 中使用外部table 的能力,主要面向的一些支持外表能力的数据仓库
因为dbt 已经支持了对于external配置属性的解析,dbt-external-tables 包核心就是对于包含属性的进行处理
比如创建table,schema,管理刷新,详细内部实现大家可以参考实际源码,我只简单说明下

使用

  • 配置source
version: 2
 
sources:
  - name: snowplow
    tables:
      - name: event
        description: >
            This source table is actually a set of files in external storage.
            The dbt-external-tables package provides handy macros for getting
            those files queryable, just in time for modeling.
 
        external:
          location:         # required: S3 file path, GCS file path, Snowflake stage, Synapse data source
 
          ...               # database-specific properties of external table
 
          partitions:       # optional
            - name: collector_date
              data_type: date
              ...           # database-specific properties
 
        # Specify ALL column names + datatypes.
        # Column order must match for CSVs, column names must match for other formats.
        # Some databases support schema inference.
 
        columns:
          - name: app_id
            data_type: varchar(255)
            description: "Application ID"
          - name: platform
            data_type: varchar(255)
            description: "Platform"
  • 执行
    通过run-operation 执行stage_external_sources macro
dbt run-operation stage_external_sources

stage_external_sources macro 实现

  • 参考代码
    实际上就是通过graph.sources 获取source, 然后判断是否支持external,对于支持external 的进行table,schema,以及更新处理
{% macro stage_external_sources(select=none) %}
 
    {% set sources_to_stage = [] %}
 
    {% set source_nodes = graph.sources.values() if graph.sources else [] %}
 
    {% for node in source_nodes %}
        {% if node.external %}
 
            {% if select %}
 
                {% for src in select.split(' ') %}
 
                    {% if '.' in src %}
                        {% set src_s = src.split('.') %}
                        {% if src_s[0] == node.source_name and src_s[1] == node.name %}
                            {% do sources_to_stage.append(node) %}
                        {% endif %}
                    {% else %}
                        {% if src == node.source_name %}
                            {% do sources_to_stage.append(node) %}
                        {% endif %}
                    {% endif %}
 
                {% endfor %}
 
            {% else %}
 
                {% do sources_to_stage.append(node) %}
 
            {% endif %}
        {% endif %}
 
    {% endfor %}
 
    {% if sources_to_stage|length == 0 %}
        {% do log('No external sources selected', info = true) %}
    {% endif %}
 
    {% for node in sources_to_stage %}
 
        {% set loop_label = loop.index ~ ' of ' ~ loop.length %}
 
        {% do log(loop_label ~ ' START external source ' ~ node.schema ~ '.' ~ node.identifier, info = true) -%}
        # 为了方便处理,自己定义了一个get_external_build_plan,因为每个具体的数仓内部处理流程可能不太一样
        {% set run_queue = dbt_external_tables.get_external_build_plan(node) %}
 
        {% do log(loop_label ~ ' SKIP', info = true) if run_queue == [] %}
        {% set width = flags.PRINTER_WIDTH %}
 
        {% for q in run_queue %}
 
            {% set q_msg = q|replace('\n','')|replace('begin;','')|trim %}
            {% set q_log = q_msg[:width] ~ '...  ' if q_msg|length > width else q_msg %}
 
            {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ q_log, info = true) %}
            {% set exit_txn = dbt_external_tables.exit_transaction() %}
 
            {% call statement('runner', fetch_result = True, auto_begin = False) %}
                {{ exit_txn }} {{ q }}
            {% endcall %}
 
            {% set runner = load_result('runner') %}
            {% set log_msg = runner['response'] if 'response' in runner.keys() else runner['status'] %}
            {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ log_msg, info = true) %}
 
        {% endfor %}
 
        {% set update_columns = dbt_external_tables.update_external_table_columns(node) %}
        {{ update_columns }}
 
    {% endfor %}
 
{% endmacro %}

包代码结构值得学习的地方

dbt_external_tables 对于不同数据仓库的实现处理通过文件夹区分了,当然还是会结合dbt 的dispatch 玩法,分层上很值得学习

  • 代码结构

可以看到包含了common 以及每个数仓独立的实现

macros
├── common
├── create_external_schema.sql
├── create_external_table.sql
├── get_external_build_plan.sql
├── helpers
├── dropif.sql
└── transaction.sql
├── refresh_external_table.sql
├── stage_external_sources.sql
└── update_external_table_columns.sql
└── plugins
    ├── bigquery
    ├── create_external_schema.sql
    ├── create_external_table.sql
    ├── get_external_build_plan.sql
    └── update_external_table_columns.sql
    ├── redshift
    ├── create_external_table.sql
    ├── get_external_build_plan.sql
    ├── helpers
    ├── add_partitions.sql
    ├── dropif.sql
    ├── is_ext_tbl.sql
    ├── paths.sql
    ├── render_macro.sql
    └── transaction.sql
    └── refresh_external_table.sql
    ├── snowflake
    ├── create_external_schema.sql
    ├── create_external_table.sql
    ├── get_external_build_plan.sql
    ├── helpers
    └── is_csv.sql
    ├── refresh_external_table.sql
    └── snowpipe
    ├── create_empty_table.sql
    ├── create_snowpipe.sql
    ├── get_copy_sql.sql
    └── refresh_snowpipe.sql
    ├── spark
    ├── create_external_table.sql
    ├── get_external_build_plan.sql
    ├── helpers
    ├── dropif.sql
    └── recover_partitions.sql
    └── refresh_external_table.sql
    └── sqlserver
        ├── create_external_schema.sql
        ├── create_external_table.sql
        ├── get_external_build_plan.sql
        └── helpers
            └── dropif.sql
  • 参考common
    我只简单说明一个create_external_table 定义,default 会提示异常
{% macro create_external_table(source_node) %}
    {{ adapter.dispatch('create_external_table', 'dbt_external_tables')(source_node) }}
{% endmacro %}
 
{% macro default__create_external_table(source_node) %}
    {{ exceptions.raise_compiler_error("External table creation is not implemented for the default adapter") }}
{% endmacro %}

bigquery 实现

{% macro bigquery__create_external_table(source_node) %}
    {%- set columns = source_node.columns.values() -%}
    {%- set external = source_node.external -%}
    {%- set partitions = external.partitions -%}
    {%- set options = external.options -%}
    {%- set non_string_options = ['max_staleness'] %}
 
    {% if options is mapping and options.get('connection_name', none) %}
        {% set connection_name = options.pop('connection_name') %}
    {% endif %}
 
    {%- set uris = [] -%}
    {%- if options is mapping and options.get('uris', none) -%}
        {%- set uris = external.options.get('uris') -%}
    {%- else -%}
        {%- set uris = [external.location] -%}
    {%- endif -%}
 
    create or replace external table {{source(source_node.source_name, source_node.name)}}
        {%- if columns -%}(
            {% for column in columns %}
                {%- set column_quoted = adapter.quote(column.name) if column.quote else column.name %}
                {{column_quoted}} {{column.data_type}} {{- ',' if not loop.last -}}
            {%- endfor -%}
        )
        {% endif %}
        {% if options and options.get('hive_partition_uri_prefix', none) %}
        with partition columns {%- if partitions %} (
            {%- for partition in partitions %}
                {{partition.name}} {{partition.data_type}}{{',' if not loop.last}}
            {%- endfor -%}
        ) {% endif -%}
        {% endif %}
        {% if connection_name %}
            with connection `{{ connection_name }}`
        {% endif %}
        options (
            uris = [{%- for uri in uris -%} '{{uri}}' {{- "," if not loop.last}} {%- endfor -%}]
            {%- if options is mapping -%}
            {%- for key, value in options.items() if key != 'uris' %}
                {%- if value is string and key not in non_string_options -%}
                , {{key}} = '{{value}}'
                {%- else -%}
                , {{key}} = {{value}}
                {%- endif -%}
            {%- endfor -%}
            {%- endif -%}
        )
{% endmacro %}

说明

以上是一个简单的说明,代码结构局布局上很值得学习

参考资料

https://github.com/dbt-labs/dbt-external-tables
https://docs.getdbt.com/reference/resource-properties/external

posted on 2024-05-10 19:37  荣锋亮  阅读(6)  评论(0编辑  收藏  举报

导航