dbt show 内部实现简单说明
以前简单介绍过关于dbt show 的使用,以下简单说明下关于内部处理
参考实现
核心是show.py 中的ShowTask
- 参考代码
class ShowTask(CompileTask):
    """Task behind ``dbt show``.

    Extends ``CompileTask``: it validates the show-specific arguments, picks
    the runner for each node, and emits one ``ShowNode`` preview event for
    every result that matches the selection.
    """

    def _runtime_initialize(self):
        """Check the arguments, then run the parent initialisation.

        Raises:
            DbtRuntimeError: if neither ``--select`` nor ``--inline`` was passed.
        """
        # The command needs something to show: either a selection or inline SQL.
        has_target = self.args.select or getattr(self.args, "inline", None)
        if not has_target:
            raise DbtRuntimeError("Either --select or --inline must be passed to show")
        super()._runtime_initialize()

    def get_runner_type(self, node):
        """Return the runner class used to execute ``node``.

        Seeds are special (they are really just files) and are handled
        differently from other nodes, so they get ``SeedRunner``; everything
        else gets ``ShowRunner``.
        """
        return SeedRunner if isinstance(node, SeedNode) else ShowRunner

    def task_end_messages(self, results):
        """Fire a ``ShowNode`` event with a rendered preview for each matched result.

        In inline mode only the node named ``inline_query`` is previewed.
        Otherwise a result is previewed when its node name is contained in
        ``self.selection_arg[0]``; every other result is reported with a
        DEBUG-level ``Note`` event and skipped.
        """
        inline_mode = bool(getattr(self.args, "inline", None))

        to_preview = []
        if inline_mode:
            to_preview.extend(res for res in results if res.node.name == "inline_query")
        else:
            for res in results:
                if res.node.name not in self.selection_arg[0]:
                    fire_event(
                        Note(msg=f"Excluded node '{res.node.name}' from results"),
                        EventLevel.DEBUG,
                    )
                    continue
                to_preview.append(res)

        for res in to_preview:
            preview_table = res.agate_table
            # A negative limit (e.g. -1) means "return all rows", so only
            # truncate for limits >= 0.
            if self.args.limit >= 0:
                preview_table = preview_table.limit(self.args.limit)
                res.agate_table = preview_table

            # Hack: render into an in-memory buffer to get the agate table
            # output as a string.
            buffer = io.StringIO()
            output_format = self.args.output
            if output_format == "json":
                preview_table.to_json(path=buffer)
            else:
                preview_table.print_table(output=buffer, max_rows=None)

            # Versioned nodes are displayed as "<name>.v<version>".
            version = getattr(res.node, "version", None)
            display_name = f"{res.node.name}.v{version}" if version else res.node.name

            fire_event(
                ShowNode(
                    node_name=display_name,
                    preview=buffer.getvalue(),
                    is_inline=inline_mode,
                    output_format=output_format,
                    unique_id=res.node.unique_id,
                )
            )

    def _handle_result(self, result):
        """Record the result via the parent, then also keep ephemeral-model results.

        The extra append to ``self.node_results`` happens only when the node is
        an ephemeral model, the task is exactly ``ShowTask`` (not a subclass),
        and ``--select`` or ``--inline`` was supplied.
        """
        super()._handle_result(result)

        if not result.node.is_ephemeral_model:
            return
        if type(self) is not ShowTask:
            return
        if not (self.args.select or getattr(self.args, "inline", None)):
            return
        self.node_results.append(result)
- ShowRunner 处理
从实际来说,就是通过执行macro 生成编译后的sql,然后调用adapter 的sql 查询获取数据,并对结果进行处理
class ShowRunner(CompileRunner):
    """Runner that builds the "show" SQL for a node and fetches its rows."""

    def __init__(self, config, adapter, node, node_index, num_nodes):
        """Initialise the parent runner and enable running ephemeral models."""
        super().__init__(config, adapter, node, node_index, num_nodes)
        self.run_ephemeral_models = True

    def execute(self, compiled_node, manifest):
        """Render the show SQL through the ``get_show_sql`` macro and run it.

        Args:
            compiled_node: the compiled node; its ``compiled_code`` is
                overwritten with the SQL produced by the macro.
            manifest: the manifest handed to the context builder and the macro.

        Returns:
            A ``RunResult`` with ``RunStatus.Success`` that carries the adapter
            response (as a dict) and the fetched table in ``agate_table``.
        """
        started_at = time.time()

        # A negative limit (e.g. -1) means "return all rows"; the macro
        # receives None in that case.
        requested_limit = self.config.args.limit
        row_limit = None if requested_limit < 0 else requested_limit

        # Build the runtime model context for this node.
        context = generate_runtime_model_context(compiled_node, self.config, manifest)

        # Run get_show_sql to produce the final (compiled) SQL for the model.
        macro_kwargs = {
            "compiled_code": context["compiled_code"],
            "sql_header": context["config"].get("sql_header"),
            "limit": row_limit,
        }
        compiled_node.compiled_code = self.adapter.execute_macro(
            macro_name="get_show_sql",
            manifest=manifest,
            context_override=context,
            kwargs=macro_kwargs,
        )

        # Execute the generated SQL through the adapter and fetch the rows.
        response, fetched_table = self.adapter.execute(
            compiled_node.compiled_code, fetch=True
        )
        elapsed = time.time() - started_at

        # Wrap everything up as the run result.
        return RunResult(
            node=compiled_node,
            status=RunStatus.Success,
            timing=[],
            thread_id=threading.current_thread().name,
            execution_time=elapsed,
            message=None,
            adapter_response=response.to_dict(),
            agate_table=fetched_table,
            failures=None,
        )
- get_show_sql macro实现
实际上就是为了方便处理基于jinja2 的macro 包装了一个macro 方便sql 生成(当前实现主要是get_limit_subquery_sql)
{# get_show_sql: emits sql_header when it is set, followed by either the
   limited subquery (when limit is not none) or the compiled code unchanged. #}
{% macro get_show_sql(compiled_code, sql_header, limit) -%}
{%- if sql_header -%}
{{ sql_header }}
{%- endif -%}
{%- if limit is not none -%}
{{ get_limit_subquery_sql(compiled_code, limit) }}
{%- else -%}
{{ compiled_code }}
{%- endif -%}
{% endmacro %}
{# get_limit_subquery_sql: resolves the implementation through adapter.dispatch
   (namespace 'dbt') and calls it with the same sql and limit. #}
{% macro get_limit_subquery_sql(sql, limit) %}
{{ adapter.dispatch('get_limit_subquery_sql', 'dbt')(sql, limit) }}
{% endmacro %}
{# default__get_limit_subquery_sql: wraps the given sql in a subquery aliased
   model_limit_subq and applies "limit <limit>" to the outer select. #}
{% macro default__get_limit_subquery_sql(sql, limit) %}
select *
from (
{{ sql }}
) as model_limit_subq
limit {{ limit }}
{% endmacro %}
说明
以上是关于dbt show 内部处理的说明,结合源码以及官方文档学习,我们就会发现整体实现还是比较简单的,而且也比较清晰
参考资料
core/dbt/task/show.py
core/dbt/include/global_project/macros/adapters/show.sql
https://docs.getdbt.com/reference/commands/show