OneFlow: Graph Building on the Python Side
Preface
Before this one, I had already written a blog post [1] analyzing how the frontend expression is captured, but I don't think it was written clearly enough: it quoted little code, and rereading it later I could only see what I was thinking at the time rather than the source code itself, so I am writing it again. This article focuses on how the graph is built on the Python side, and in particular on what happens inside an operator as it executes.
We go into the code with a few questions in mind:
- When is the frontend expression captured?
- What format does the captured frontend expression take?
- An InferenceSession reads a saved model and adds the Ops to OneFlow one by one. How does a Job function get its Ops added to OneFlow?
Flow Analysis
First, let's walk through the whole flow. The outline here is taken from the flow section of [1], with the code filled in and some commentary of my own added.
- Python executes line by line and reaches @flow.global_function().
The Python interpreter reads and executes code line by line, so the decorator line runs the moment the definition is reached (a minimal demonstration follows the code below). The full script is attached at the end of this post.
# the lenet example from the documentation
@flow.global_function(type="train")
def train_job(
images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
) -> tp.Numpy:
with flow.scope.placement("gpu", "0:0"):
logits = lenet(images, train=True)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
labels, logits, name="softmax_loss"
)
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
return loss
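To pin down when the decorator itself runs, here is a minimal sketch with no OneFlow involved: a decorator factory executes as soon as the interpreter reaches the def statement it decorates, while the decorated body runs only when the wrapper is called.
# minimal sketch, no OneFlow involved: @deco(...) runs at definition time,
# the decorated body runs only at call time
def global_function(type="predict"):
    print("decorator factory runs now, type =", type)  # at definition time
    def Decorator(job_func):
        print("job registered:", job_func.__name__)    # still definition time
        return job_func
    return Decorator

@global_function(type="train")
def train_job():
    print("job body runs")  # only printed on an explicit train_job() call

train_job()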
- We step into api_oneflow_function, which returns a decorator: depending on whether we are in eager (dynamic graph) or lazy (static graph) mode, a different decorator is returned (a toy version of this condition-based dispatch follows the code below).
# python/oneflow/compatible/single_client/framework/function_util.py: 85
def api_oneflow_function(
type: str = "predict", function_config: FunctionConfig = None
) -> Callable[[Callable], Callable]:
"""Creates a callable OneFlow global function from a Python function.
For instance::
@oneflow.compatible.single_client.global_function(flow.FunctionConfig())
def train():
# your model
Args:
function_config (FunctionConfig, optional): a `FunctionConfig` object. Defaults to FunctionConfig().
Returns:
Callable[[Callable], Callable]: a callable which is called to execute the compiled function
"""
if isinstance(type, FunctionConfig):
function_config = type
print(
"WARNING: flow.global_function(func_config) is deprecated. Please replace it with flow.global_function(type, func_config).\n "
)
print(traceback.format_stack()[-2])
else:
assert type in ["train", "predict"]
if function_config is None:
function_config = FunctionConfig()
if type == "train":
function_config.function_desc.job_config_proto.mutable_train_conf()
else:
function_config.function_desc.job_config_proto.mutable_predict_conf()
api = enable_if.unique([eager_oneflow_function, lazy_oneflow_function])
return api(function_config)
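enable_if.unique picks, out of the candidate list, the single function whose registered condition currently holds; here that is lazy_oneflow_function, since eager execution is off and the session is not yet initialized. A toy version of the dispatch mechanism, my own simplification rather than OneFlow's actual enable_if implementation:
# toy condition-based dispatch, a simplification of enable_if:
# each candidate carries a predicate; unique() returns the single
# candidate whose predicate is currently true
def condition(pred):
    def wrap(fn):
        fn._pred = pred
        return fn
    return wrap

def unique(candidates):
    matched = [f for f in candidates if f._pred()]
    assert len(matched) == 1, "exactly one candidate may match"
    return matched[0]

eager_enabled = lambda: False  # stands in for hob.eager_execution_enabled

@condition(lambda: eager_enabled())
def eager_fn():
    return "eager"

@condition(lambda: not eager_enabled())
def lazy_fn():
    return "lazy"

print(unique([eager_fn, lazy_fn])())  # lazy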
- The code above resolves to lazy_oneflow_function. It builds a closure that captures the Session that will run this Job along with the user-defined Job function (a runnable mock of this registration pattern follows the code below).
# python/oneflow/compatible/single_client/framework/function_util.py: 143
@enable_if.condition(
hob.in_normal_mode & ~hob.eager_execution_enabled & ~hob.session_initialized
)
def lazy_oneflow_function(function_config=FunctionConfig()):
assert isinstance(function_config, FunctionConfig)
def Decorator(job_func):
if not hasattr(job_func, "__oneflow_function_signature__"):
job_func.__oneflow_function_signature__ = inspect.signature(job_func)
oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
sess = session_ctx.GetDefaultSession()
@functools.wraps(job_func)
def Func(*args, **kwargs):
return _RunLazyJob(sess, job_func, *args, **kwargs)
sess.AddJob(_CloneFunctionDesc(function_config.function_desc, job_func))
for x in dir(job_func):
if x.startswith("__oneflow_"):
setattr(Func, x, getattr(job_func, x))
return Func
return Decorator
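The registration pattern is worth restating as a runnable mock (all names here are my own, not OneFlow's): decoration registers the job with the session immediately, while compilation and execution are deferred to the first call.
import functools

# hypothetical mock of the pattern above, not OneFlow code
class MockSession:
    def __init__(self):
        self.jobs = {}
        self.initialized = False
    def AddJob(self, job_func):
        self.jobs[job_func.__name__] = job_func  # runs at decoration time
    def TryInit(self):
        if not self.initialized:  # only the first call compiles
            self.initialized = True
            for name in self.jobs:
                print("compiling job:", name)
        return self
    def LazyRun(self, job_func, *args, **kwargs):
        print("running job:", job_func.__name__)

session = MockSession()

def mock_global_function(job_func):
    @functools.wraps(job_func)
    def Func(*args, **kwargs):
        return session.TryInit().LazyRun(job_func, *args, **kwargs)
    session.AddJob(job_func)
    return Func

@mock_global_function
def train_job():
    pass

train_job()  # compiling job: train_job, then running job: train_job
train_job()  # running job: train_job (initialization skipped)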
- After decoration, calling the decorated function ends up executing _RunLazyJob. The first time the user calls the Job function, the Session is initialized; once initialization finishes, LazyRun executes. In other words, for a function decorated with @flow.global_function(), TryInit performs the initialization on the first call, and subsequent calls do not initialize again.
# python/oneflow/compatible/single_client/framework/function_util.py: 225
def _RunLazyJob(session, job_func, *args, **kwargs):
return session.TryInit().LazyRun(job_func, *args, **kwargs)
- Compilation happens during Session initialization: Init calls the compiler's Compile method on every registered Job.
# python/oneflow/compatible/single_client/framework/session_util.py: 183
def Init(self):
assert self.status_ is SessionStatus.OPEN
self.status_ = SessionStatus.RUNNING
if not oneflow._oneflow_internal.IsEnvInited():
flow.env.init()
_TryCompleteConfigProto(self.config_proto)
self.resource_ = self.config_proto.resource
if not oneflow._oneflow_internal.EagerExecutionEnabled():
c_api_util.InitLazyGlobalSession(self.config_proto)
for (job_name, func_desc) in self.job_name2function_desc_.items():
compiler.Compile(self, func_desc, self.config_proto)
self.existed_module_names_ = set()
self.job_name2var_name2var_blob_ = dict()
assert len(self.job_name2function_desc_.items()) > 0
oneflow._oneflow_internal.StartLazyGlobalSession()
self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
self.UpdateInfo4InterfaceOp()
if not config_util.api_legacy_model_io_enabled():
check_point_v2.Init()
else:
self.eager_config_proto_ctx_ = oneflow._oneflow_internal.LogicalConfigProtoContext(
str(self.config_proto)
)
return self
- Before compiling, the environment has to be set up. Note the with statement in Compile, which points at the InterpretScope method. The main work at the end of InterpretScope is: opening the JobBuildAndInferCtx, configuring it, setting the Python runtime mode, and setting the runtime scope. Because this setup lives inside with blocks, the matching cleanup, such as closing the JobBuildAndInferCtx, runs automatically when the block exits (the generic open/close pattern is sketched after the code below).
# python/oneflow/compatible/single_client/framework/compiler.py: 44
def Compile(session, function_desc, config_proto):
with InterpretScope(session, function_desc, config_proto):
_CompileJob(session, function_desc)
session.StashJob(function_desc.job_func.__name__)
oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
session.StashJob(
function_desc.job_func.__name__,
function_desc.job_func.__name__ + "_after_complete",
)
# python/oneflow/compatible/single_client/framework/compiler.py: 63
@contextmanager
def InterpretScope(session, function_desc, config_proto):
job_conf = function_desc.job_config_proto
job_conf.set_job_name(function_desc.job_func.__name__)
placement_scope = function_desc.function_attribute.default_placement_scope
if placement_scope is None:
tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(session.resource)
hierarchy = None
else:
assert isinstance(placement_scope, placement_ctx.EmptyPlacementScope)
tag_and_dev_ids = (
placement_scope.device_tag,
placement_scope.machine_device_ids,
)
hierarchy = placement_scope.hierarchy
distribute_strategy = function_desc.function_attribute.default_distribute_strategy
if distribute_strategy is None:
distribute_strategy = distribute_util.DistributeConsistentStrategy()
is_mirrored = isinstance(
distribute_strategy, distribute_util.DistributeMirroredStrategy
)
assert isinstance(hierarchy, (list, tuple)) or hierarchy is None
if hierarchy is not None:
hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
scope = scope_util.MakeInitialScope(
job_conf, *tag_and_dev_ids, hierarchy, is_mirrored
)
with _JobBuildAndInferCtx(job_conf.job_name()), distribute_strategy:
c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
with scope_util.ScopeContext(scope):
yield
# python/oneflow/compatible/single_client/framework/compiler.py: 145
@contextmanager
def _JobBuildAndInferCtx(job_name):
c_api_util.JobBuildAndInferCtx_Open(job_name)
try:
yield
finally:
oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
# python/oneflow/compatible/single_client/framework/scope_util.py: 105
@contextmanager
def ScopeContext(scope):
old_scope = oneflow._oneflow_internal.GetCurrentScope()
oneflow._oneflow_internal.GlobalScopeStackPush(scope)
try:
yield
finally:
assert oneflow._oneflow_internal.GetCurrentScope() is scope
oneflow._oneflow_internal.GlobalScopeStackPop()
assert oneflow._oneflow_internal.GetCurrentScope() is old_scope
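All three helpers follow the same @contextmanager shape: setup before yield, guaranteed teardown in finally. A generic, self-contained version of the open/close pattern:
from contextlib import contextmanager

# generic sketch of the open/close pattern used by _JobBuildAndInferCtx
# and ScopeContext: setup before yield, cleanup in finally, which runs
# even if the body raises
@contextmanager
def job_ctx(job_name):
    print("open", job_name)  # stands in for JobBuildAndInferCtx_Open
    try:
        yield
    finally:
        print("close", job_name)  # stands in for JobBuildAndInferCtx_Close

with job_ctx("train_job"):
    print("build ops here")
# output: open train_job / build ops here / close train_job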
- Next comes _CompileJob, which executes the user-defined Job function. We can see the call to func, which is exactly the Job function (a toy tracer after the code below makes this concrete).
# python/oneflow/compatible/single_client/framework/compiler.py: 97
def _CompileJob(session, function_desc):
func = function_desc.job_func
parameters = func.__oneflow_function_signature__.parameters
if len(parameters) == 0:
func.__oneflow_input_blob_defs__ = ()
elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
parameters
)
else:
raise NotImplementedError(
"All parameters of global function should be annotated"
)
inputs = _RecursiveMakeInputBlobs(func.__oneflow_input_blob_defs__)
ret = func(*inputs)
return_annotation = func.__oneflow_function_signature__.return_annotation
oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
func.__oneflow_output_remote_blobs__ = _RecursiveMakeRetRemoteBlobs(
ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
)
- As the Job function executes, control reaches the individual operators. Every operator implementation ends with calls into user_op_builder.py; the important one is InferAndTryRun, which calls CurJobAddOp to add the operator to the graph. Take the pad operator as an example: the Python-side operator validates the user's input and finally calls user_op_builder to add the op to the computation graph.
# python/oneflow/compatible/single_client/ops/pad.py: 229
def reflection_pad2d(
x: oneflow._oneflow_internal.BlobDesc,
padding: Union[int, tuple, list],
name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
(H, W) = (x.shape[2], x.shape[3])
if isinstance(padding, (tuple, list)):
assert len(padding) == len(x.shape), ValueError(
"padding boundry must be the same size of input dims"
)
assert (
padding[2] < H and padding[3] < H and (padding[0] < W) and (padding[1] < W)
), ValueError(
"Padding size should be less than the corresponding input dimension!"
)
boundry = [padding[0], padding[1], padding[2], padding[3]]
elif isinstance(padding, int):
assert padding < H and padding < W, ValueError(
"Padding size should be less than the corresponding input dimension!"
)
boundry = [padding, padding, padding, padding]
else:
raise ValueError("padding must be in or list or tuple!")
return (
flow.user_op_builder(
name if name is not None else id_util.UniqueStr("Reflection_Pad2d_")
)
.Op("reflection_pad2d")
.Input("x", [x])
.Output("y")
.Attr("padding", list(boundry))
.Build()
.InferAndTryRun()
.RemoteBlobList()[0]
)
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 163
class LazyUserOp(UserOp):
def __init__(self, op_name, op_type_name):
UserOp.__init__(self, op_name, op_type_name)
def InferAndTryRun(self):
compile_context.CurJobAddOp(self.op_conf_)
return self
def MakeRemoteBlob(self, lbi):
return remote_blob_util.RemoteBlob(lbi)
- When these functions are called again later, the initialization inside TryInit is skipped and LazyRun executes. It calls LaunchUserJob to launch the Job function and run the forward pass (a toy version of the annotation-driven result handling follows the code below).
# python/oneflow/compatible/single_client/framework/session_util.py: 275
def LazyRun(self, job_func, *arg):
assert self.status_ is SessionStatus.RUNNING
remote_blobs = self.LaunchUserJob(job_func, *arg)
if remote_blobs is None:
return
future_blob = LazyFutureRemoteBlobs(self).SetResult(remote_blobs).Inited()
annotation = inspect.signature(job_func).return_annotation
return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
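Note how LazyRun reads the return annotation via inspect.signature and lets it decide how the result is presented to the caller (-> tp.Numpy ultimately yields a numpy array). A toy, hypothetical version of annotation-driven result handling:
import inspect
import numpy as np

# toy, hypothetical version of annotation-driven result handling: the
# declared return annotation decides how the job's raw result is
# transformed before being handed back
def transform_result(raw, annotation):
    if annotation is np.ndarray:
        return np.asarray(raw)  # materialize, in the spirit of -> tp.Numpy
    return raw  # otherwise hand the raw result back unchanged

def job() -> np.ndarray:
    return [1.0, 2.0]

ann = inspect.signature(job).return_annotation
print(transform_result(job(), ann))  # [1. 2.]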
Building the Computation Graph
With the flow covered, let's analyze how the user's computation graph actually gets built. What does executing CurJobAddOp mean?
In an InferenceSession there is no graph to build: a computation graph is read straight from the file system, and CurJobAddOp is called for each Op. CurJobAddOp takes an OpConf as input. Below is an example, the activation op that follows a convolution, cut out of a computation graph I saved from lenet. You can see it carries the op's name, device, input, output, op type, and so on (a note on the blob-name convention follows the example).
op_list {
name: "conv1-activation"
device_tag: "gpu"
scope_symbol_id: 4611686018427531262
user_conf {
op_type_name: "relu"
input {
key: "in"
value {
s: "conv1-bias_add/out_0"
}
}
output {
key: "out"
value {
s: "conv1-activation/out_0"
}
}
}
}
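The s fields above are logical blob names with the form op_name/blob_name_index; this naming is how one op's output is wired to the next op's input. A tiny helper, for illustration only, to split one:
# illustrative only: logical blob names encode graph edges as
# "producer_op_name/blob_name_index" strings
def parse_lbn(lbn):
    op_name, blob = lbn.split("/", 1)
    return op_name, blob

print(parse_lbn("conv1-bias_add/out_0"))    # ('conv1-bias_add', 'out_0')
print(parse_lbn("conv1-activation/out_0"))  # ('conv1-activation', 'out_0')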
- Inside a Job function, where does the OpConf information come from? Take ReLU as an example. The relu function calls user_op_builder to add the operator to the graph, and the actual insertion happens in InferAndTryRun. The information CurJobAddOp needs is exactly the OpConf shown above, and user_op_builder assembles it through Op, Input, Output, and Build. After that chain of calls we have an OpConf, and InferAndTryRun adds it to the computation graph (a toy version of the whole chain follows the code below).
# python/oneflow/compatible/single_client/ops/math_ops.py: 564
def relu(
x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
) -> oneflow._oneflow_internal.BlobDesc:
return (
flow.user_op_builder(name if name is not None else id_util.UniqueStr("Relu_"))
.Op("relu")
.Input("in", [x])
.Output("out")
.Build()
.InferAndTryRun()
.RemoteBlobList()[0]
)
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 167
def InferAndTryRun(self):
compile_context.CurJobAddOp(self.op_conf_)
return self
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 257
def Input(self, input_name, input_blob_list):
"""Set input blob of op
Args:
input_name (str): input name of blob
input_blob_list : list of blobs
Returns:
self
"""
assert isinstance(input_blob_list, (tuple, list))
input_conf = self.user_op_.op_conf_.user_conf.input
input_conf[input_name].ClearField("s")
for input_blob in input_blob_list:
input_conf[input_name].s.append(input_blob.unique_name)
return self
def Output(self, output_name, num=1):
"""Set output blob of op
Args:
output_name (str): name of output blob
num (int, optional): Defaults to 1.
Returns:
self
"""
assert isinstance(num, int) and num >= 1
out_lbns = []
for i in range(num):
lbn = "{}/{}_{}".format(self.user_op_.op_conf_.name, output_name, i)
out_lbns.append(lbn)
self.user_op_.op_conf_.user_conf.output[output_name].s[:] = out_lbns
self.user_op_.output_arg_key_list_.append(output_name)
return self
def CheckAndComplete(self):
assert self.user_op_.op_conf_.user_conf.op_type_name != ""
self.user_op_.op_conf_ = c_api_util.CheckAndCompleteUserOpConf(
self.user_op_.op_conf_
)
return self
def Build(self):
"""Build op when in/output and other attribute set up.
Returns:
self
"""
return self.CheckAndComplete().user_op_
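Putting the pieces together, here is a toy fluent builder, my own mock with plain dicts standing in for the OpConf protobuf, that reproduces the conv1-activation OpConf shown earlier:
# toy fluent builder, my own mock: dicts stand in for the OpConf protobuf
class ToyOpBuilder:
    def __init__(self, name):
        self.conf = {"name": name, "user_conf": {"input": {}, "output": {}}}
    def Op(self, op_type_name):
        self.conf["user_conf"]["op_type_name"] = op_type_name
        return self
    def Input(self, input_name, lbns):
        self.conf["user_conf"]["input"][input_name] = list(lbns)
        return self
    def Output(self, output_name, num=1):
        self.conf["user_conf"]["output"][output_name] = [
            "{}/{}_{}".format(self.conf["name"], output_name, i) for i in range(num)
        ]
        return self
    def Build(self):
        # the real Build() also validates via CheckAndCompleteUserOpConf
        return self.conf

conf = (
    ToyOpBuilder("conv1-activation")
    .Op("relu")
    .Input("in", ["conv1-bias_add/out_0"])
    .Output("out")
    .Build()
)
print(conf["user_conf"]["output"]["out"])  # ['conv1-activation/out_0']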
BlobDesc
Next, let's look at oneflow._oneflow_internal.BlobDesc. During graph building, a BlobDesc holds metadata such as the shape and the name of the op that produced it. This metadata is used to fill in OpConf attributes; for instance, setting an OpConf's input relies on the BlobDesc's unique_name property.
BlobDesc is implemented in C++ and, as the name suggests, it describes a Blob. In the complete lenet training script attached at the end, the intermediate variables inside the lenet function, conv1, pool1, conv2, pool2, and so on, are all BlobDesc objects; a call like flow.layers.conv2d builds an OpConf from these BlobDescs and puts it into the computation graph via CurJobAddOp (a minimal mock follows the C++ declarations below).
// oneflow/core/framework/py_blob_desc.h: 40
class Tensor {
public:
virtual ~Tensor() = default;
virtual std::shared_ptr<cfg::LogicalBlobId> lbi() const = 0;
virtual std::string logical_blob_name() const = 0;
virtual std::string op_name() const = 0;
virtual std::string blob_name() const = 0;
virtual std::shared_ptr<Shape> shape() const = 0;
virtual DataType dtype() const = 0;
virtual std::shared_ptr<cfg::ParallelConf> parallel_conf() const = 0;
};
class BlobDesc : public Tensor {
public:
BlobDesc(const std::shared_ptr<cfg::LogicalBlobId>& lbi,
const std::shared_ptr<Distribute>& distribute);
BlobDesc(const BlobDesc& blob_desc) = default;
virtual ~BlobDesc() override = default;
virtual std::shared_ptr<cfg::LogicalBlobId> lbi() const override;
virtual std::string logical_blob_name() const override;
virtual std::string op_name() const override;
virtual std::string blob_name() const override;
virtual std::shared_ptr<Shape> shape() const override;
virtual DataType dtype() const override;
virtual std::shared_ptr<cfg::ParallelConf> parallel_conf() const override;
virtual bool is_dynamic() const;
virtual std::shared_ptr<Distribute> distribute() const;
virtual std::string unique_name() const;
void set_distribute(const std::shared_ptr<Distribute> distribute);
protected:
Maybe<std::string> Distribute2Str() const;
std::shared_ptr<cfg::LogicalBlobId> lbi_;
std::shared_ptr<Distribute> distribute_;
std::string lbn_;
};
// oneflow/core/framework/py_remote_blob.h (class bodies omitted)
class ConsistentBlob : public BlobDesc {};
class LazyConsistentBlob : public ConsistentBlob {};
class MirroredBlob : public BlobDesc {};
class LazyMirroredBlob : public MirroredBlob {};
class EagerBlobTrait {};
class EagerConsistentBlob : public EagerBlobTrait, public ConsistentBlob {};
class EagerMirroredBlob : public EagerBlobTrait, public MirroredBlob {};
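To connect this back to graph building, here is a minimal Python mock, hypothetical rather than OneFlow's actual class, of the role a BlobDesc plays: it carries no tensor data, only the metadata that downstream ops read while assembling their OpConfs.
# hypothetical mock, not OneFlow's class: a BlobDesc carries no data,
# only metadata that downstream ops read when assembling OpConfs
class MockBlobDesc:
    def __init__(self, op_name, blob_name, shape, dtype="float32"):
        self.op_name, self.blob_name = op_name, blob_name
        self.shape, self.dtype = shape, dtype
    @property
    def unique_name(self):
        # mirrors the "op_name/blob_name" convention in the OpConf above
        return "{}/{}".format(self.op_name, self.blob_name)

conv1 = MockBlobDesc("conv1", "out_0", (100, 32, 28, 28))
print(conv1.unique_name, conv1.shape)  # conv1/out_0 (100, 32, 28, 28)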
Summary
So how does the Python side build the graph? We can answer the question now: inside a Job function, BlobDesc objects carry Blob metadata; each operator calls user_op_builder, which uses that metadata to assemble an OpConf, and then CurJobAddOp adds the Op to the computation graph.
The beginning of this article also covered when graph building is triggered: on the user's first call to the Job function. The mechanism is the global_function decorator wrapped around the user-defined Job function. On the first call it tries to initialize the Session, doing nothing if the Session is already initialized. During Session initialization, the user-defined Job function is executed once to build the computation graph, exactly as described above: each operator calls user_op_builder to assemble an OpConf from BlobDesc metadata, then CurJobAddOp adds the Op to the graph.
The analysis also shows that Python-side operators validate the user's input, for example checking whether a conv layer's groups argument exceeds its filters argument, and only then hand the arguments to user_op_builder to build the graph.
References
[1] https://www.cnblogs.com/zzk0/p/15009227.html
Appendix: the lenet training script
# lenet_train.py
from oneflow.compatible import single_client as flow
from oneflow.compatible.single_client import typing as tp
BATCH_SIZE = 100
flow.config.enable_legacy_model_io(False)
# flow.config.gpu_device_num(2)
# flow.env.ctrl_port(9998)
# func_config = flow.function_config()
# func_config.default_logical_view(flow.scope.mirrored_view())
def lenet(data, train=False):
initializer = flow.truncated_normal(0.1)
conv1 = flow.layers.conv2d(
data,
32,
5,
padding="SAME",
activation=flow.nn.relu,
name="conv1",
kernel_initializer=initializer,
)
pool1 = flow.nn.max_pool2d(
conv1, ksize=2, strides=2, padding="SAME", name="pool1", data_format="NCHW"
)
conv2 = flow.layers.conv2d(
pool1,
64,
5,
padding="SAME",
activation=flow.nn.relu,
name="conv2",
kernel_initializer=initializer,
)
pool2 = flow.nn.max_pool2d(
conv2, ksize=2, strides=2, padding="SAME", name="pool2", data_format="NCHW"
)
# reshape = flow.reshape(pool2, [pool2.shape[0], -1])
reshape = flow.flatten(pool2, start_dim=1)
hidden = flow.layers.dense(
reshape,
512,
activation=flow.nn.relu,
kernel_initializer=initializer,
name="dense1",
)
if train:
hidden = flow.nn.dropout(hidden, rate=0.5, name="dropout")
return flow.layers.dense(hidden, 10, kernel_initializer=initializer, name="dense2")
@flow.global_function(type="train")
def train_job(
images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
) -> tp.Numpy:
with flow.scope.placement("gpu", "0:0"):
logits = lenet(images, train=True)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
labels, logits, name="softmax_loss"
)
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
return loss
if __name__ == "__main__":
(train_images, train_labels), (test_images, test_labels) = flow.data.load_mnist(
BATCH_SIZE, BATCH_SIZE
)
for epoch in range(20):
for i, (images, labels) in enumerate(zip(train_images, train_labels)):
loss = train_job(images, labels)
if i % 20 == 0:
print(loss.mean())
    flow.checkpoint.save("./lenet_models_1")  # remove this folder first if it already exists
print("model saved")