dbt 项目依赖文件加载处理简单说明
核心是 ManifestLoader 的 load 方法通过调用 ReadFilesFromFileSystem 进行处理。之前简单说明过,dbt 的一些任务(比如 run)在执行前需要先生成 manifest 文件,
此类任务所需数据的预处理是通过装饰器完成的
ReadFilesFromFileSystem 类
@dataclass
class ReadFilesFromFileSystem:
    """Collect the source files of every project, grouped by project and parser.

    ``read_files`` fills two structures:

    * ``files``: maps a file id to the source-file object that was read.
    * ``project_parser_files``: maps project name -> parser name -> list of
      file ids (see the example in the comment on that field).
    """

    all_projects: Mapping[str, Project]
    files: MutableMapping[str, AnySourceFile] = field(default_factory=dict)
    # saved_files is only used to compare schema files
    saved_files: MutableMapping[str, AnySourceFile] = field(default_factory=dict)
    # project_parser_files = {
    #   "my_project": {
    #     "ModelParser": ["my_project://models/my_model.sql"]
    #   }
    # }
    #
    project_parser_files: Dict = field(default_factory=dict)

    # Per the surrounding article, this is the method the manifest `load` step
    # calls to read the files of the root project and of its dependencies.
    def read_files(self):
        """Read the files of every project in ``all_projects``."""
        for project in self.all_projects.values():
            file_types = get_file_types_for_project(project)
            self.read_files_for_project(project, file_types)

    def read_files_for_project(self, project, file_types):
        """Read one project's files and record their ids per parser.

        Args:
            project: project whose ``project_root`` and ``project_name`` are used.
            file_types: mapping of parse file type -> info dict; the info dict's
                ``"parser"`` entry is used as the key of the per-project result.
        """
        dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root)
        # Start from an empty per-project mapping (replaces any earlier entry).
        project_files = self.project_parser_files[project.project_name] = {}

        for parse_ft, file_type_info in file_types.items():
            project_files[file_type_info["parser"]] = read_files_for_parser(
                project,
                self.files,
                parse_ft,
                file_type_info,
                self.saved_files,
                dbt_ignore_spec,
            )
流程参考处理
首先是基于 manifest 解析关联的,parse_manifest 通过 ManifestLoader.get_full_manifest 加载解析,获取项目相关的依赖(比如 deps)
之后基于依赖项目的路径遍历文件夹加载不同类型的资源(macro,model,source,seed,snapshot,analysis,schema,docs.... 基本就是dbt 的一些核心概念)
内部处理
- manifest 装饰器部分处理
# Build the manifest with parse_manifest, but only when the click context
# does not already carry one under the "manifest" key.
if ctx.obj.get("manifest") is None:
    ctx.obj["manifest"] = parse_manifest(
        runtime_config, write_perf_info, write, ctx.obj["flags"].write_json
    )
- parse_manifest 处理
def parse_manifest(runtime_config, write_perf_info, write, write_json):
    """Register the adapter, build the full manifest and optionally persist it.

    Args:
        runtime_config: runtime configuration used to register/fetch the adapter
            and passed to ``ManifestLoader.get_full_manifest``.
        write_perf_info: forwarded to ``ManifestLoader.get_full_manifest``.
        write: together with ``write_json``, gates writing the manifest.
        write_json: see ``write``.

    Returns:
        The manifest produced by ``ManifestLoader.get_full_manifest``.
    """
    register_adapter(runtime_config, get_mp_context())
    adapter = get_adapter(runtime_config)
    # Install the macro context generator; it is needed wherever macros execute.
    adapter.set_macro_context_generator(generate_runtime_macro_context)
    manifest = ManifestLoader.get_full_manifest(
        runtime_config,
        write_perf_info=write_perf_info,
    )

    # NOTE(review): the excerpt lost its indentation; the plugin-artifact lines
    # are nested under this condition here -- confirm against upstream.
    if write and write_json:
        write_manifest(manifest, runtime_config.project_target_path)
        pm = plugins.get_plugin_manager(runtime_config.project_name)
        plugin_artifacts = pm.get_manifest_artifacts(manifest)
        for path, plugin_artifact in plugin_artifacts.items():
            plugin_artifact.write(path)
    return manifest
- generate_runtime_macro_context 处理
实际上就是dbt 对于macro 执行需要的context信息(方便jinja2 渲染处理)
def generate_runtime_macro_context(
    macro: MacroProtocol,
    config: RuntimeConfig,
    manifest: Manifest,
    package_name: Optional[str],
) -> Dict[str, Any]:
    """Build the context dictionary used when a macro is executed.

    Returns:
        The ``to_dict()`` form of a ``MacroContext`` created from the arguments
        and a fresh ``OperationProvider``.
    """
    # Per the article, the context classes all derive from ProviderContext.
    ctx = MacroContext(macro, config, manifest, OperationProvider(), package_name)
    return ctx.to_dict()
- ManifestLoader.get_full_manifest
会通过runtime_config的配置信息进行进一步读取,runtime_config 也是通过装饰器预处理的(runtime_config 依赖的project 也是)
runtime_config 处理
def wrapper(*args, **kwargs):
    # Inner function of the runtime_config decorator: builds a RuntimeConfig
    # from the project, profile and flags already stored on the click context.
    ctx = args[0]
    assert isinstance(ctx, Context)

    req_strs = ["profile", "project"]
    reqs = [ctx.obj.get(req_str) for req_str in req_strs]

    # Both prerequisites must have been placed on the context beforehand.
    if None in reqs:
        raise DbtProjectError("profile and project required for runtime_config")

    config = RuntimeConfig.from_parts(
        ctx.obj["project"],
        ctx.obj["profile"],
        ctx.obj["flags"],
    )

    ctx.obj["runtime_config"] = config
    # NOTE(review): the excerpt ends here; whatever follows in the real
    # wrapper (e.g. invoking the wrapped function) is not shown.
project 处理
def load_project(
    project_root: str,
    version_check: bool,
    profile: HasCredentials,
    cli_vars: Optional[Dict[str, Any]] = None,
) -> Project:
    """Load the project found at ``project_root``.

    Args:
        project_root: directory of the project to load.
        version_check: passed to ``Project.from_project_root`` as ``verify_version``.
        profile: profile handed to the project YAML renderer.
        cli_vars: optional CLI variables handed to the project YAML renderer.

    Returns:
        The loaded ``Project``, with ``project_env_vars`` set from the renderer.
    """
    # get the project with all of the provided information
    project_renderer = DbtProjectYamlRenderer(profile, cli_vars)
    # Per the article: this step also handles project dependencies such as
    # packages. A lot happens inside; a short README on GitHub describes it.
    # In essence it follows dbt conventions: settings in the project
    # configuration file determine which models, sources and macros get loaded.
    project = Project.from_project_root(
        project_root, project_renderer, verify_version=version_check
    )

    # Save env_vars encountered in rendering for partial parsing
    project.project_env_vars = project_renderer.ctx_obj.env_vars
    return project
get_full_manifest 核心处理
runtime 配置获取依赖信息,参考load_dependencies
# Excerpt from ManifestLoader.get_full_manifest; `cls`, `config`, `macro_hook`,
# `file_diff` and `adapter` are bound outside the lines shown here.
# Resolve the projects to load (the root project plus its dependencies).
projects = config.load_dependencies()
loader = cls(
config,
projects,
macro_hook=macro_hook,
file_diff=file_diff,
)
# Load the project files and build the manifest
manifest = loader.load()
_check_manifest(manifest, config)
manifest.build_flat_graph()
# This needs to happen after loading from a partial parse,
# so that the adapter has the query headers from the macro_hook.
loader.save_macros_to_adapter(adapter)
依赖处理
def load_dependencies(self, base_only=False) -> Mapping[str, "RuntimeConfig"]:
    """Return the mapping of project name -> project, computing it only once.

    The result is cached on ``self.dependencies``; later calls return the
    cached mapping regardless of ``base_only``.

    Args:
        base_only: when true, only the internal include paths are loaded and
            installed packages are skipped.

    Raises:
        UninstalledPackagesFoundError: fewer packages are installed than are
            specified.
        NonUniquePackageNameError: two loaded projects share a name.
    """
    if self.dependencies is None:
        all_projects = {self.project_name: self}
        # Per the article: the result ends up containing the global project,
        # the dependency (packages) projects and the dbt core ones.
        internal_packages = get_include_paths(self.credentials.type)
        if base_only:
            # Test setup -- we want to load macros without dependencies
            project_paths = itertools.chain(internal_packages)
        else:
            # raise exception if fewer installed packages than in packages.yml
            count_packages_specified = len(self.packages.packages)  # type: ignore
            count_packages_installed = len(tuple(self._get_project_directories()))
            if count_packages_specified > count_packages_installed:
                raise UninstalledPackagesFoundError(
                    count_packages_specified,
                    count_packages_installed,
                    self.packages_specified_path,
                    self.packages_install_path,
                )
            project_paths = itertools.chain(internal_packages, self._get_project_directories())
        for project_name, project in self.load_projects(project_paths):
            if project_name in all_projects:
                raise NonUniquePackageNameError(project_name)
            all_projects[project_name] = project
        self.dependencies = all_projects
    return self.dependencies
load 方法处理
里边代码比较多,核心是基于 ReadFilesFromFileSystem 或者 ReadFilesFromDiff 这两个文件 reader 读取项目以及依赖相关的文件(dbt 相关的,比如 macro,model,schema)
项目文件读取
def read_files(self):
    """Read the files of every project in ``self.all_projects``."""
    for project in self.all_projects.values():
        file_types = get_file_types_for_project(project)
        self.read_files_for_project(project, file_types)
def read_files_for_project(self, project, file_types):
    """Read one project's files and record their ids per parser.

    Args:
        project: project whose ``project_root`` and ``project_name`` are used.
        file_types: mapping of parse file type -> info dict; the info dict's
            ``"parser"`` entry is used as the key of the per-project result.
    """
    dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root)
    # Start from an empty per-project mapping (replaces any earlier entry).
    project_files = self.project_parser_files[project.project_name] = {}

    for parse_ft, file_type_info in file_types.items():
        # read_files_for_parser looks files up per project and file type.
        project_files[file_type_info["parser"]] = read_files_for_parser(
            project,
            self.files,
            parse_ft,
            file_type_info,
            self.saved_files,
            dbt_ignore_spec,
        )
read_files_for_parser 处理
def read_files_for_parser(project, files, parse_ft, file_type_info, saved_files, ignore_spec):
    """Read the files of one parse file type and return their file ids.

    Args:
        project: project being read; forwarded to ``get_source_files``.
        files: mapping updated in place with ``file_id -> source file``.
        parse_ft: parse file type; forwarded to ``get_source_files``.
        file_type_info: dict providing the ``"paths"`` to search and the
            ``"extensions"`` to look for.
        saved_files: forwarded to ``get_source_files``.
        ignore_spec: forwarded to ``get_source_files``.

    Returns:
        List of the file ids found, in the order they were read.
    """
    dirs = file_type_info["paths"]
    parser_files = []
    for extension in file_type_info["extensions"]:
        # Per the article, get_source_files does the actual directory
        # traversal that finds the files.
        source_files = get_source_files(
            project, dirs, extension, parse_ft, saved_files, ignore_spec
        )
        for sf in source_files:
            files[sf.file_id] = sf
            parser_files.append(sf.file_id)
    return parser_files
get_source_files 处理
def get_source_files(project, paths, extension, parse_file_type, saved_files, ignore_spec):
    """Find and load the source files of one file type under ``paths``.

    Seed files are loaded with ``load_seed_source_file``; every other type is
    loaded with ``load_source_file`` and kept only when the result is truthy.
    Singular-test files whose relative path starts with ``generic`` are skipped.

    Returns:
        List of the loaded source-file objects.
    """
    # file path list; per the article, the search walks the folders with os.walk
    fp_list = filesystem_search(project, paths, extension, ignore_spec)
    # file block list
    fb_list = []
    for fp in fp_list:
        if parse_file_type == ParseFileType.Seed:
            fb_list.append(load_seed_source_file(fp, project.project_name))
        # singular tests live in /tests but only generic tests live
        # in /tests/generic so we want to skip those
        else:
            if parse_file_type == ParseFileType.SingularTest:
                path = pathlib.Path(fp.relative_path)
                if path.parts[0] == "generic":
                    continue
            file = load_source_file(fp, parse_file_type, project.project_name, saved_files)
            # only append the list if it has contents. added to fix #3568
            if file:
                fb_list.append(file)
    return fb_list
一个比较有意思的设计是dbt 没有直接使用绝对路径,而是自己弄了一套相对路径的格式,具体是BaseSourceFile 类
fid 格式处理如下
@property
def file_id(self):
    """Return ``"<project_name>://<original_file_path>"``, or None for a RemoteFile path."""
    if isinstance(self.path, RemoteFile):
        return None
    return f"{self.project_name}://{self.path.original_file_path}"
一个参考格式
{'MacroParser': ['dbt_utils://macros/web/get_url_host.sql', 'dbt_utils://macros/web/get_url_path.sql', 'dbt_utils://macros/web/get_url_parameter.sql', 'dbt_utils://macros/generic_tests/fewer_rows_than.sql', 'dbt_utils://macros/generic_tests/equal_rowcount.sql', 'dbt_utils://macros/generic_tests/relationships_where.sql', 'dbt_utils://macros/generic_tests/recency.sql', 'dbt_utils://macros/generic_tests/not_constant.sql', 'dbt_utils://macros/generic_tests/accepted_range.sql', 'dbt_utils://macros/generic_tests/not_accepted_values.sql', 'dbt_utils://macros/generic_tests/at_least_one.sql', 'dbt_utils://macros/generic_tests/unique_combination_of_columns.sql', 'dbt_utils://macros/generic_tests/
获取到文件列表之后还会对于文件进行进一步解析,以及macro进行处理
get_parsing_files (上边read_files 返回的文件,目前看是对于文件增删的处理)
说明
dbt 项目依赖文件的处理主要是先进行项目的依赖解析,然后对项目依赖进行加载(macro,model,source)。以上内容尽管是关于依赖加载的,
但是也有不少关于 manifest 的内容,manifest 处理属于 dbt 比较核心的部分,后续会结合这个继续说明
参考资料
core/dbt/parser/read_files.py
core/dbt/parser/manifest.py
core/dbt/config/project.py
core/dbt/parser/README.md
core/dbt/parser/search.py
core/dbt/parser/base.py
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
2023-04-16 Pake 基于rust 开发的快速web 页面打包app 的工具
2022-04-16 使用 TypeScriptToLua 开发lua 应用
2021-04-16 cube.js 自定义checkAuth 响应状态码
2020-04-16 easy-rules spring boot 一个简单的starter
2019-04-16 websocket 2 rest api
2019-04-16 编写一个简单的基于jmespath 的prometheus exporter
2014-04-16 search bar 创建的一些文章