dbt show 内部实现简单说明

以前简单介绍过dbt show 的使用，以下简单说明一下其内部处理

参考实现

核心是show.py 中的ShowTask

参考代码

class ShowTask(CompileTask):
    """Compile-based task that previews the result rows of selected nodes or an inline query."""

    def _runtime_initialize(self):
        """Validate the arguments, then run the parent initialization.

        Raises:
            DbtRuntimeError: when neither ``select`` nor ``inline`` is set on the args.
        """
        # The arguments must include either a selection or an inline query.
        has_target = self.args.select or getattr(self.args, "inline", None)
        if not has_target:
            raise DbtRuntimeError("Either --select or --inline must be passed to show")
        super()._runtime_initialize()

    def get_runner_type(self, node):
        """Return the runner class for ``node``: ``SeedRunner`` for seeds, else ``ShowRunner``.

        Per the article's note, seeds are special (they are effectively files)
        and are handled differently from other nodes when shown.
        """
        return SeedRunner if isinstance(node, SeedNode) else ShowRunner

    def task_end_messages(self, results):
        """Fire one ``ShowNode`` event, carrying the rendered preview, per matching result.

        For an inline query only the node named ``inline_query`` matches.
        Otherwise a result matches when its node name is contained in
        ``self.selection_arg[0]``; every other result is reported through a
        DEBUG-level ``Note`` event and skipped.
        """
        is_inline = bool(getattr(self.args, "inline", None))

        if is_inline:
            matched_results = [entry for entry in results if entry.node.name == "inline_query"]
        else:
            matched_results = []
            for entry in results:
                # NOTE(review): if selection_arg[0] is a plain string this is a
                # substring test rather than an exact-name match -- confirm.
                if entry.node.name in self.selection_arg[0]:
                    matched_results.append(entry)
                    continue
                fire_event(
                    Note(msg=f"Excluded node '{entry.node.name}' from results"),
                    EventLevel.DEBUG,
                )

        for entry in matched_results:
            # Allow passing in -1 (or any negative number) to get all rows
            table = entry.agate_table
            if self.args.limit >= 0:
                table = table.limit(self.args.limit)
                entry.agate_table = table

            # Hack to get Agate table output as string
            buffer = io.StringIO()
            wants_json = self.args.output == "json"
            if wants_json:
                table.to_json(path=buffer)
            else:
                table.print_table(output=buffer, max_rows=None)

            # Append a ".v<version>" suffix when the node carries a truthy version.
            display_name = entry.node.name
            version = getattr(entry.node, "version", None)
            if version:
                display_name = f"{display_name}.v{version}"

            preview_event = ShowNode(
                node_name=display_name,
                preview=buffer.getvalue(),
                is_inline=is_inline,
                output_format=self.args.output,
                unique_id=entry.node.unique_id,
            )
            fire_event(preview_event)

    def _handle_result(self, result):
        """Run the parent handling, then also record ephemeral-model results.

        The extra append happens only for exact ``ShowTask`` instances (not
        subclasses) and only when a selection or an inline query was given.
        """
        super()._handle_result(result)

        if not result.node.is_ephemeral_model:
            return
        if type(self) is not ShowTask:
            return
        if self.args.select or getattr(self.args, "inline", None):
            self.node_results.append(result)

ShowRunner 处理
从实际来说，就是先执行macro 生成编译后的sql，然后调用adapter 执行该sql 查询获取数据，并对结果进行处理

class ShowRunner(CompileRunner):
    """Runner that renders a node's preview SQL, executes it and returns the fetched rows."""

    def __init__(self, config, adapter, node, node_index, num_nodes):
        """Initialize the parent runner and set ``run_ephemeral_models`` to True."""
        super().__init__(config, adapter, node, node_index, num_nodes)
        self.run_ephemeral_models = True

    def execute(self, compiled_node, manifest):
        """Render the show SQL for ``compiled_node``, run it and wrap the outcome.

        Returns:
            A ``RunResult`` with success status whose ``agate_table`` holds the
            fetched rows and whose ``execution_time`` is the elapsed wall-clock
            time in seconds.
        """
        started_at = time.time()

        # Allow passing in -1 (or any negative number) to get all rows
        limit = self.config.args.limit
        if limit < 0:
            limit = None

        # Build the runtime context for the model.
        model_context = generate_runtime_model_context(compiled_node, self.config, manifest)

        # Run the get_show_sql macro to produce the (compiled) SQL for the model.
        macro_kwargs = {
            "compiled_code": model_context["compiled_code"],
            "sql_header": model_context["config"].get("sql_header"),
            "limit": limit,
        }
        compiled_node.compiled_code = self.adapter.execute_macro(
            macro_name="get_show_sql",
            manifest=manifest,
            context_override=model_context,
            kwargs=macro_kwargs,
        )

        # Execute the SQL produced by the macro through the adapter.
        adapter_response, fetched_table = self.adapter.execute(
            compiled_node.compiled_code, fetch=True
        )

        finished_at = time.time()

        # Wrap the returned data.
        return RunResult(
            node=compiled_node,
            status=RunStatus.Success,
            timing=[],
            thread_id=threading.current_thread().name,
            execution_time=finished_at - started_at,
            message=None,
            adapter_response=adapter_response.to_dict(),
            agate_table=fetched_table,
            failures=None,
        )

get_show_sql macro实现
实际上就是基于jinja2 包装了一个macro，方便生成用于show 的sql（当前实现主要依赖get_limit_subquery_sql）

{# Builds the SQL text for a preview: emits sql_header when one is given, then either compiled_code wrapped by get_limit_subquery_sql (when limit is not none) or compiled_code unchanged. #}
{% macro get_show_sql(compiled_code, sql_header, limit) -%}

  {%- if sql_header -%}

  {{ sql_header }}

  {%- endif -%}

  {%- if limit is not none -%}

  {{ get_limit_subquery_sql(compiled_code, limit) }}

  {%- else -%}

  {{ compiled_code }}

  {%- endif -%}

{% endmacro %}
 
{# Looks up the implementation of get_limit_subquery_sql through adapter.dispatch (with 'dbt' as the second dispatch argument) and calls it with sql and limit. #}
{% macro get_limit_subquery_sql(sql, limit) %}

  {{ adapter.dispatch('get_limit_subquery_sql', 'dbt')(sql, limit) }}

{% endmacro %}
 
{# Default implementation: selects every column from sql wrapped as a subquery aliased model_limit_subq, followed by a "limit" clause with the given limit. #}
{% macro default__get_limit_subquery_sql(sql, limit) %}

    select *

    from (

        {{ sql }}

    ) as model_limit_subq

    limit {{ limit }}

{% endmacro %}

说明

以上是关于dbt show 内部处理的说明，结合源码以及官方文档学习我们就会发现整体实现上还是比较简单的，而且也比较清晰

参考资料

core/dbt/task/show.py
core/dbt/include/global_project/macros/adapters/show.sql
https://docs.getdbt.com/reference/commands/show

posted on 2024-05-04 00:24 荣锋亮阅读(12) 评论(0) 编辑收藏举报

刷新页面返回顶部

rongfengliang-荣锋亮

dbt show 内部实现简单说明

参考实现

说明

参考资料

导航

公告