OneFlow: Graph Building on the Python Side
Preface
Before this one, I had already written a blog post [1] analyzing how the frontend expression is captured, but I don't think it was written clearly enough: it quoted little code, and rereading it later I could only see what I was thinking at the time rather than the source code itself, so I am writing it again. This article focuses on how the graph is built on the Python side, and in particular on what happens inside an operator as it executes.
We go into the code with a few questions in mind:
- When is the frontend expression captured?
- What format does the captured frontend expression take?
- An InferenceSession reads a saved model and adds the Ops to OneFlow one by one. How does a Job function get its Ops added to OneFlow?
Flow Analysis
First, let's walk through the whole flow. The outline here is taken from the flow section of [1], with the code filled in and some commentary of my own added.
- Python executes line by line and reaches @flow.global_function().
The Python interpreter reads and executes code line by line, so the decorator line runs the moment the definition is reached (a minimal demonstration follows the code below). The full script is attached at the end of this post.
# the lenet example from the documentation
@flow.global_function(type="train")
def train_job(
images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
) -> tp.Numpy:
with flow.scope.placement("gpu", "0:0"):
logits = lenet(images, train=True)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
labels, logits, name="softmax_loss"
)
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
return loss
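To pin down when the decorator itself runs, here is a minimal sketch with no OneFlow involved: a decorator factory executes as soon as the interpreter reaches the def statement it decorates, while the decorated body runs only when the wrapper is called.
# minimal sketch, no OneFlow involved: @deco(...) runs at definition time,
# the decorated body runs only at call time
def global_function(type="predict"):
    print("decorator factory runs now, type =", type)  # at definition time
    def Decorator(job_func):
        print("job registered:", job_func.__name__)    # still definition time
        return job_func
    return Decorator

@global_function(type="train")
def train_job():
    print("job body runs")  # only printed on an explicit train_job() call

train_job()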
- We step into api_oneflow_function, which returns a decorator: depending on whether we are in eager (dynamic graph) or lazy (static graph) mode, a different decorator is returned (a toy version of this condition-based dispatch follows the code below).
# python/oneflow/compatible/single_client/framework/function_util.py: 85
def api_oneflow_function(
type: str = "predict", function_config: FunctionConfig = None
) -> Callable[[Callable], Callable]:
"""Creates a callable OneFlow global function from a Python function.
For instance::
@oneflow.compatible.single_client.global_function(flow.FunctionConfig())
def train():
# your model
Args:
function_config (FunctionConfig, optional): a `FunctionConfig` object. Defaults to FunctionConfig().
Returns:
Callable[[Callable], Callable]: a callable which is called to execute the compiled function
"""
if isinstance(type, FunctionConfig):
function_config = type
print(
"WARNING: flow.global_function(func_config) is deprecated. Please replace it with flow.global_function(type, func_config).\n "
)
print(traceback.format_stack()[-2])
else:
assert type in ["train", "predict"]
if function_config is None:
function_config = FunctionConfig()
if type == "train":
function_config.function_desc.job_config_proto.mutable_train_conf()
else:
function_config.function_desc.job_config_proto.mutable_predict_conf()
api = enable_if.unique([eager_oneflow_function, lazy_oneflow_function])
return api(function_config)
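enable_if.unique picks, out of the candidate list, the single function whose registered condition currently holds; here that is lazy_oneflow_function, since eager execution is off and the session is not yet initialized. A toy version of the dispatch mechanism, my own simplification rather than OneFlow's actual enable_if implementation:
# toy condition-based dispatch, a simplification of enable_if:
# each candidate carries a predicate; unique() returns the single
# candidate whose predicate is currently true
def condition(pred):
    def wrap(fn):
        fn._pred = pred
        return fn
    return wrap

def unique(candidates):
    matched = [f for f in candidates if f._pred()]
    assert len(matched) == 1, "exactly one candidate may match"
    return matched[0]

eager_enabled = lambda: False  # stands in for hob.eager_execution_enabled

@condition(lambda: eager_enabled())
def eager_fn():
    return "eager"

@condition(lambda: not eager_enabled())
def lazy_fn():
    return "lazy"

print(unique([eager_fn, lazy_fn])())  # lazy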
- The code above resolves to lazy_oneflow_function. It builds a closure that captures the Session that will run this Job along with the user-defined Job function (a runnable mock of this registration pattern follows the code below).
# python/oneflow/compatible/single_client/framework/function_util.py: 143
@enable_if.condition(
hob.in_normal_mode & ~hob.eager_execution_enabled & ~hob.session_initialized
)
def lazy_oneflow_function(function_config=FunctionConfig()):
assert isinstance(function_config, FunctionConfig)
def Decorator(job_func):
if not hasattr(job_func, "__oneflow_function_signature__"):
job_func.__oneflow_function_signature__ = inspect.signature(job_func)
oft_util.CheckGlobalFunctionAnnotation(job_func.__oneflow_function_signature__)
sess = session_ctx.GetDefaultSession()
@functools.wraps(job_func)
def Func(*args, **kwargs):
return _RunLazyJob(sess, job_func, *args, **kwargs)
sess.AddJob(_CloneFunctionDesc(function_config.function_desc, job_func))
for x in dir(job_func):
if x.startswith("__oneflow_"):
setattr(Func, x, getattr(job_func, x))
return Func
return Decorator
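The registration pattern is worth restating as a runnable mock (all names here are my own, not OneFlow's): decoration registers the job with the session immediately, while compilation and execution are deferred to the first call.
import functools

# hypothetical mock of the pattern above, not OneFlow code
class MockSession:
    def __init__(self):
        self.jobs = {}
        self.initialized = False
    def AddJob(self, job_func):
        self.jobs[job_func.__name__] = job_func  # runs at decoration time
    def TryInit(self):
        if not self.initialized:  # only the first call compiles
            self.initialized = True
            for name in self.jobs:
                print("compiling job:", name)
        return self
    def LazyRun(self, job_func, *args, **kwargs):
        print("running job:", job_func.__name__)

session = MockSession()

def mock_global_function(job_func):
    @functools.wraps(job_func)
    def Func(*args, **kwargs):
        return session.TryInit().LazyRun(job_func, *args, **kwargs)
    session.AddJob(job_func)
    return Func

@mock_global_function
def train_job():
    pass

train_job()  # compiling job: train_job, then running job: train_job
train_job()  # running job: train_job (initialization skipped)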
- After decoration, calling the decorated function ends up executing _RunLazyJob. The first time the user calls the Job function, the Session is initialized; once initialization finishes, LazyRun executes. In other words, for a function decorated with @flow.global_function(), TryInit performs the initialization on the first call, and subsequent calls do not initialize again.
# python/oneflow/compatible/single_client/framework/function_util.py: 225
def _RunLazyJob(session, job_func, *args, **kwargs):
return session.TryInit().LazyRun(job_func, *args, **kwargs)
- Compilation happens during Session initialization: Init calls the compiler's Compile method on every registered Job.
# python/oneflow/compatible/single_client/framework/session_util.py: 183
def Init(self):
assert self.status_ is SessionStatus.OPEN
self.status_ = SessionStatus.RUNNING
if not oneflow._oneflow_internal.IsEnvInited():
flow.env.init()
_TryCompleteConfigProto(self.config_proto)
self.resource_ = self.config_proto.resource
if not oneflow._oneflow_internal.EagerExecutionEnabled():
c_api_util.InitLazyGlobalSession(self.config_proto)
for (job_name, func_desc) in self.job_name2function_desc_.items():
compiler.Compile(self, func_desc, self.config_proto)
self.existed_module_names_ = set()
self.job_name2var_name2var_blob_ = dict()
assert len(self.job_name2function_desc_.items()) > 0
oneflow._oneflow_internal.StartLazyGlobalSession()
self.inter_user_job_info_ = c_api_util.GetInterUserJobInfo()
self.UpdateInfo4InterfaceOp()
if not config_util.api_legacy_model_io_enabled():
check_point_v2.Init()
else:
self.eager_config_proto_ctx_ = oneflow._oneflow_internal.LogicalConfigProtoContext(
str(self.config_proto)
)
return self
- Before compiling, the environment has to be set up. Note the with statement in Compile, which points at the InterpretScope method. The main work at the end of InterpretScope is: opening the JobBuildAndInferCtx, configuring it, setting the Python runtime mode, and setting the runtime scope. Because this setup lives inside with blocks, the matching cleanup, such as closing the JobBuildAndInferCtx, runs automatically when the block exits (the generic open/close pattern is sketched after the code below).
# python/oneflow/compatible/single_client/framework/compiler.py: 44
def Compile(session, function_desc, config_proto):
with InterpretScope(session, function_desc, config_proto):
_CompileJob(session, function_desc)
session.StashJob(function_desc.job_func.__name__)
oneflow._oneflow_internal.CurJobBuildAndInferCtx_Complete()
session.StashJob(
function_desc.job_func.__name__,
function_desc.job_func.__name__ + "_after_complete",
)
# python/oneflow/compatible/single_client/framework/compiler.py: 63
@contextmanager
def InterpretScope(session, function_desc, config_proto):
job_conf = function_desc.job_config_proto
job_conf.set_job_name(function_desc.job_func.__name__)
placement_scope = function_desc.function_attribute.default_placement_scope
if placement_scope is None:
tag_and_dev_ids = placement_util.GetDefaultMachineDeviceIds(session.resource)
hierarchy = None
else:
assert isinstance(placement_scope, placement_ctx.EmptyPlacementScope)
tag_and_dev_ids = (
placement_scope.device_tag,
placement_scope.machine_device_ids,
)
hierarchy = placement_scope.hierarchy
distribute_strategy = function_desc.function_attribute.default_distribute_strategy
if distribute_strategy is None:
distribute_strategy = distribute_util.DistributeConsistentStrategy()
is_mirrored = isinstance(
distribute_strategy, distribute_util.DistributeMirroredStrategy
)
assert isinstance(hierarchy, (list, tuple)) or hierarchy is None
if hierarchy is not None:
hierarchy = oneflow._oneflow_internal.Size(tuple(hierarchy))
scope = scope_util.MakeInitialScope(
job_conf, *tag_and_dev_ids, hierarchy, is_mirrored
)
with _JobBuildAndInferCtx(job_conf.job_name()), distribute_strategy:
c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf)
with runtime_mode.ModeScope(runtime_mode.GLOBAL_MODE):
with scope_util.ScopeContext(scope):
yield
# python/oneflow/compatible/single_client/framework/compiler.py: 145
@contextmanager
def _JobBuildAndInferCtx(job_name):
c_api_util.JobBuildAndInferCtx_Open(job_name)
try:
yield
finally:
oneflow._oneflow_internal.JobBuildAndInferCtx_Close()
# python/oneflow/compatible/single_client/framework/scope_util.py: 105
@contextmanager
def ScopeContext(scope):
old_scope = oneflow._oneflow_internal.GetCurrentScope()
oneflow._oneflow_internal.GlobalScopeStackPush(scope)
try:
yield
finally:
assert oneflow._oneflow_internal.GetCurrentScope() is scope
oneflow._oneflow_internal.GlobalScopeStackPop()
assert oneflow._oneflow_internal.GetCurrentScope() is old_scope
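All three helpers follow the same @contextmanager shape: setup before yield, guaranteed teardown in finally. A generic, self-contained version of the open/close pattern:
from contextlib import contextmanager

# generic sketch of the open/close pattern used by _JobBuildAndInferCtx
# and ScopeContext: setup before yield, cleanup in finally, which runs
# even if the body raises
@contextmanager
def job_ctx(job_name):
    print("open", job_name)  # stands in for JobBuildAndInferCtx_Open
    try:
        yield
    finally:
        print("close", job_name)  # stands in for JobBuildAndInferCtx_Close

with job_ctx("train_job"):
    print("build ops here")
# output: open train_job / build ops here / close train_job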
- Next comes _CompileJob, which executes the user-defined Job function. We can see the call to func, which is exactly the Job function (a toy tracer after the code below makes this concrete).
# python/oneflow/compatible/single_client/framework/compiler.py: 97
def _CompileJob(session, function_desc):
func = function_desc.job_func
parameters = func.__oneflow_function_signature__.parameters
if len(parameters) == 0:
func.__oneflow_input_blob_defs__ = ()
elif all((p.annotation is inspect._empty for (_, p) in parameters.items())):
func.__oneflow_input_blob_defs__ = _GetArgDefault(func)
elif all((p.annotation is not inspect._empty for (_, p) in parameters.items())):
func.__oneflow_input_blob_defs__ = _MakeInputBlobDefFromParameterSignature(
parameters
)
else:
raise NotImplementedError(
"All parameters of global function should be annotated"
)
inputs = _RecursiveMakeInputBlobs(func.__oneflow_input_blob_defs__)
ret = func(*inputs)
return_annotation = func.__oneflow_function_signature__.return_annotation
oft_util.CheckReturnByAnnotation(func.__name__, ret, return_annotation)
func.__oneflow_output_remote_blobs__ = _RecursiveMakeRetRemoteBlobs(
ret, allow_cpu_return_op=function_desc.function_attribute.allow_cpu_return_op
)
- As the Job function executes, control reaches the individual operators. Every operator implementation ends with calls into user_op_builder.py; the important one is InferAndTryRun, which calls CurJobAddOp to add the operator to the graph. Take the pad operator as an example: the Python-side operator validates the user's input and finally calls user_op_builder to add the op to the computation graph.
# python/oneflow/compatible/single_client/ops/pad.py: 229
def reflection_pad2d(
x: oneflow._oneflow_internal.BlobDesc,
padding: Union[int, tuple, list],
name: Optional[str] = None,
) -> oneflow._oneflow_internal.BlobDesc:
(H, W) = (x.shape[2], x.shape[3])
if isinstance(padding, (tuple, list)):
assert len(padding) == len(x.shape), ValueError(
"padding boundry must be the same size of input dims"
)
assert (
padding[2] < H and padding[3] < H and (padding[0] < W) and (padding[1] < W)
), ValueError(
"Padding size should be less than the corresponding input dimension!"
)
boundry = [padding[0], padding[1], padding[2], padding[3]]
elif isinstance(padding, int):
assert padding < H and padding < W, ValueError(
"Padding size should be less than the corresponding input dimension!"
)
boundry = [padding, padding, padding, padding]
else:
raise ValueError("padding must be in or list or tuple!")
return (
flow.user_op_builder(
name if name is not None else id_util.UniqueStr("Reflection_Pad2d_")
)
.Op("reflection_pad2d")
.Input("x", [x])
.Output("y")
.Attr("padding", list(boundry))
.Build()
.InferAndTryRun()
.RemoteBlobList()[0]
)
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 163
class LazyUserOp(UserOp):
def __init__(self, op_name, op_type_name):
UserOp.__init__(self, op_name, op_type_name)
def InferAndTryRun(self):
compile_context.CurJobAddOp(self.op_conf_)
return self
def MakeRemoteBlob(self, lbi):
return remote_blob_util.RemoteBlob(lbi)
- When these functions are called again later, the initialization inside TryInit is skipped and LazyRun executes. It calls LaunchUserJob to launch the Job function and run the forward pass (a toy version of the annotation-driven result handling follows the code below).
# python/oneflow/compatible/single_client/framework/session_util.py: 275
def LazyRun(self, job_func, *arg):
assert self.status_ is SessionStatus.RUNNING
remote_blobs = self.LaunchUserJob(job_func, *arg)
if remote_blobs is None:
return
future_blob = LazyFutureRemoteBlobs(self).SetResult(remote_blobs).Inited()
annotation = inspect.signature(job_func).return_annotation
return oft_util.TransformGlobalFunctionResult(future_blob, annotation)
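Note how LazyRun reads the return annotation via inspect.signature and lets it decide how the result is presented to the caller (-> tp.Numpy ultimately yields a numpy array). A toy, hypothetical version of annotation-driven result handling:
import inspect
import numpy as np

# toy, hypothetical version of annotation-driven result handling: the
# declared return annotation decides how the job's raw result is
# transformed before being handed back
def transform_result(raw, annotation):
    if annotation is np.ndarray:
        return np.asarray(raw)  # materialize, in the spirit of -> tp.Numpy
    return raw  # otherwise hand the raw result back unchanged

def job() -> np.ndarray:
    return [1.0, 2.0]

ann = inspect.signature(job).return_annotation
print(transform_result(job(), ann))  # [1. 2.]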
Building the Computation Graph
With the flow covered, let's analyze how the user's computation graph actually gets built. What does executing CurJobAddOp mean?
In an InferenceSession there is no graph to build: a computation graph is read straight from the file system, and CurJobAddOp is called for each Op. CurJobAddOp takes an OpConf as input. Below is an example, the activation op that follows a convolution, cut out of a computation graph I saved from lenet. You can see it carries the op's name, device, input, output, op type, and so on (a note on the blob-name convention follows the example).
op_list {
name: "conv1-activation"
device_tag: "gpu"
scope_symbol_id: 4611686018427531262
user_conf {
op_type_name: "relu"
input {
key: "in"
value {
s: "conv1-bias_add/out_0"
}
}
output {
key: "out"
value {
s: "conv1-activation/out_0"
}
}
}
}
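The s fields above are logical blob names with the form op_name/blob_name_index; this naming is how one op's output is wired to the next op's input. A tiny helper, for illustration only, to split one:
# illustrative only: logical blob names encode graph edges as
# "producer_op_name/blob_name_index" strings
def parse_lbn(lbn):
    op_name, blob = lbn.split("/", 1)
    return op_name, blob

print(parse_lbn("conv1-bias_add/out_0"))    # ('conv1-bias_add', 'out_0')
print(parse_lbn("conv1-activation/out_0"))  # ('conv1-activation', 'out_0')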
- Inside a Job function, where does the OpConf information come from? Take ReLU as an example. The relu function calls user_op_builder to add the operator to the graph, and the actual insertion happens in InferAndTryRun. The information CurJobAddOp needs is exactly the OpConf shown above, and user_op_builder assembles it through Op, Input, Output, and Build. After that chain of calls we have an OpConf, and InferAndTryRun adds it to the computation graph (a toy version of the whole chain follows the code below).
# python/oneflow/compatible/single_client/ops/math_ops.py: 564
def relu(
x: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None
) -> oneflow._oneflow_internal.BlobDesc:
return (
flow.user_op_builder(name if name is not None else id_util.UniqueStr("Relu_"))
.Op("relu")
.Input("in", [x])
.Output("out")
.Build()
.InferAndTryRun()
.RemoteBlobList()[0]
)
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 167
def InferAndTryRun(self):
compile_context.CurJobAddOp(self.op_conf_)
return self
# python/oneflow/compatible/single_client/ops/user_op_builder.py: 257
def Input(self, input_name, input_blob_list):
"""Set input blob of op
Args:
input_name (str): input name of blob
input_blob_list : list of blobs
Returns:
self
"""
assert isinstance(input_blob_list, (tuple, list))
input_conf = self.user_op_.op_conf_.user_conf.input
input_conf[input_name].ClearField("s")
for input_blob in input_blob_list:
input_conf[input_name].s.append(input_blob.unique_name)
return self
def Output(self, output_name, num=1):
"""Set output blob of op
Args:
output_name (str): name of output blob
num (int, optional): Defaults to 1.
Returns:
self
"""
assert isinstance(num, int) and num >= 1
out_lbns = []
for i in range(num):
lbn = "{}/{}_{}".format(self.user_op_.op_conf_.name, output_name, i)
out_lbns.append(lbn)
self.user_op_.op_conf_.user_conf.output[output_name].s[:] = out_lbns
self.user_op_.output_arg_key_list_.append(output_name)
return self
def CheckAndComplete(self):
assert self.user_op_.op_conf_.user_conf.op_type_name != ""
self.user_op_.op_conf_ = c_api_util.CheckAndCompleteUserOpConf(
self.user_op_.op_conf_
)
return self
def Build(self):
"""Build op when in/output and other attribute set up.
Returns:
self
"""
return self.CheckAndComplete().user_op_
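Putting the pieces together, here is a toy fluent builder, my own mock with plain dicts standing in for the OpConf protobuf, that reproduces the conv1-activation OpConf shown earlier:
# toy fluent builder, my own mock: dicts stand in for the OpConf protobuf
class ToyOpBuilder:
    def __init__(self, name):
        self.conf = {"name": name, "user_conf": {"input": {}, "output": {}}}
    def Op(self, op_type_name):
        self.conf["user_conf"]["op_type_name"] = op_type_name
        return self
    def Input(self, input_name, lbns):
        self.conf["user_conf"]["input"][input_name] = list(lbns)
        return self
    def Output(self, output_name, num=1):
        self.conf["user_conf"]["output"][output_name] = [
            "{}/{}_{}".format(self.conf["name"], output_name, i) for i in range(num)
        ]
        return self
    def Build(self):
        # the real Build() also validates via CheckAndCompleteUserOpConf
        return self.conf

conf = (
    ToyOpBuilder("conv1-activation")
    .Op("relu")
    .Input("in", ["conv1-bias_add/out_0"])
    .Output("out")
    .Build()
)
print(conf["user_conf"]["output"]["out"])  # ['conv1-activation/out_0']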
BlobDesc
Next, let's look at oneflow._oneflow_internal.BlobDesc. During graph building, a BlobDesc holds metadata such as the shape and the name of the op that produced it. This metadata is used to fill in OpConf attributes; for instance, setting an OpConf's input relies on the BlobDesc's unique_name property.
BlobDesc is implemented in C++ and, as the name suggests, it describes a Blob. In the complete lenet training script attached at the end, the intermediate variables inside the lenet function, conv1, pool1, conv2, pool2, and so on, are all BlobDesc objects; a call like flow.layers.conv2d builds an OpConf from these BlobDescs and puts it into the computation graph via CurJobAddOp (a minimal mock follows the C++ declarations below).
// oneflow/core/framework/py_blob_desc.h: 40
class Tensor {
public:
virtual ~Tensor() = default;
virtual std::shared_ptr<cfg::LogicalBlobId> lbi() const = 0;
virtual std::string logical_blob_name() const = 0;
virtual std::string op_name() const = 0;
virtual std::string blob_name() const = 0;
virtual std::shared_ptr<Shape> shape() const = 0;
virtual DataType dtype() const = 0;
virtual std::shared_ptr<cfg::ParallelConf> parallel_conf() const = 0;
};
class BlobDesc : public Tensor {
public:
BlobDesc(const std::shared_ptr<cfg::LogicalBlobId>& lbi,
const std::shared_ptr<Distribute>& distribute);
BlobDesc(const BlobDesc& blob_desc) = default;
virtual ~BlobDesc() override = default;
virtual std::shared_ptr<cfg::LogicalBlobId> lbi() const override;
virtual std::string logical_blob_name() const override;
virtual std::string op_name() const override;
virtual std::string blob_name() const override;
virtual std::shared_ptr<Shape> shape() const override;
virtual DataType dtype() const override;
virtual std::shared_ptr<cfg::ParallelConf> parallel_conf() const override;
virtual bool is_dynamic() const;
virtual std::shared_ptr<Distribute> distribute() const;
virtual std::string unique_name() const;
void set_distribute(const std::shared_ptr<Distribute> distribute);
protected:
Maybe<std::string> Distribute2Str() const;
std::shared_ptr<cfg::LogicalBlobId> lbi_;
std::shared_ptr<Distribute> distribute_;
std::string lbn_;
};
// oneflow/core/framework/py_remote_blob.h (class bodies omitted)
class ConsistentBlob : public BlobDesc {};
class LazyConsistentBlob : public ConsistentBlob {};
class MirroredBlob : public BlobDesc {};
class LazyMirroredBlob : public MirroredBlob {};
class EagerBlobTrait {};
class EagerConsistentBlob : public EagerBlobTrait, public ConsistentBlob {};
class EagerMirroredBlob : public EagerBlobTrait, public MirroredBlob {};
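To connect this back to graph building, here is a minimal Python mock, hypothetical rather than OneFlow's actual class, of the role a BlobDesc plays: it carries no tensor data, only the metadata that downstream ops read while assembling their OpConfs.
# hypothetical mock, not OneFlow's class: a BlobDesc carries no data,
# only metadata that downstream ops read when assembling OpConfs
class MockBlobDesc:
    def __init__(self, op_name, blob_name, shape, dtype="float32"):
        self.op_name, self.blob_name = op_name, blob_name
        self.shape, self.dtype = shape, dtype
    @property
    def unique_name(self):
        # mirrors the "op_name/blob_name" convention in the OpConf above
        return "{}/{}".format(self.op_name, self.blob_name)

conv1 = MockBlobDesc("conv1", "out_0", (100, 32, 28, 28))
print(conv1.unique_name, conv1.shape)  # conv1/out_0 (100, 32, 28, 28)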
Summary
So how does the Python side build the graph? We can answer the question now: inside a Job function, BlobDesc objects carry Blob metadata; each operator calls user_op_builder, which uses that metadata to assemble an OpConf, and then CurJobAddOp adds the Op to the computation graph.
The beginning of this article also covered when graph building is triggered: on the user's first call to the Job function. The mechanism is the global_function decorator wrapped around the user-defined Job function. On the first call it tries to initialize the Session, doing nothing if the Session is already initialized. During Session initialization, the user-defined Job function is executed once to build the computation graph, exactly as described above: each operator calls user_op_builder to assemble an OpConf from BlobDesc metadata, then CurJobAddOp adds the Op to the graph.
The analysis also shows that Python-side operators validate the user's input, for example checking whether a conv layer's groups argument exceeds its filters argument, and only then hand the arguments to user_op_builder to build the graph.
References
[1] https://www.cnblogs.com/zzk0/p/15009227.html
Appendix: the lenet training script
# lenet_train.py
from oneflow.compatible import single_client as flow
from oneflow.compatible.single_client import typing as tp
BATCH_SIZE = 100
flow.config.enable_legacy_model_io(False)
# flow.config.gpu_device_num(2)
# flow.env.ctrl_port(9998)
# func_config = flow.function_config()
# func_config.default_logical_view(flow.scope.mirrored_view())
def lenet(data, train=False):
initializer = flow.truncated_normal(0.1)
conv1 = flow.layers.conv2d(
data,
32,
5,
padding="SAME",
activation=flow.nn.relu,
name="conv1",
kernel_initializer=initializer,
)
pool1 = flow.nn.max_pool2d(
conv1, ksize=2, strides=2, padding="SAME", name="pool1", data_format="NCHW"
)
conv2 = flow.layers.conv2d(
pool1,
64,
5,
padding="SAME",
activation=flow.nn.relu,
name="conv2",
kernel_initializer=initializer,
)
pool2 = flow.nn.max_pool2d(
conv2, ksize=2, strides=2, padding="SAME", name="pool2", data_format="NCHW"
)
# reshape = flow.reshape(pool2, [pool2.shape[0], -1])
reshape = flow.flatten(pool2, start_dim=1)
hidden = flow.layers.dense(
reshape,
512,
activation=flow.nn.relu,
kernel_initializer=initializer,
name="dense1",
)
if train:
hidden = flow.nn.dropout(hidden, rate=0.5, name="dropout")
return flow.layers.dense(hidden, 10, kernel_initializer=initializer, name="dense2")
@flow.global_function(type="train")
def train_job(
images: tp.Numpy.Placeholder((BATCH_SIZE, 1, 28, 28), dtype=flow.float),
labels: tp.Numpy.Placeholder((BATCH_SIZE,), dtype=flow.int32),
) -> tp.Numpy:
with flow.scope.placement("gpu", "0:0"):
logits = lenet(images, train=True)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
labels, logits, name="softmax_loss"
)
lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.1])
flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)
return loss
if __name__ == "__main__":
(train_images, train_labels), (test_images, test_labels) = flow.data.load_mnist(
BATCH_SIZE, BATCH_SIZE
)
for epoch in range(20):
for i, (images, labels) in enumerate(zip(train_images, train_labels)):
loss = train_job(images, labels)
if i % 20 == 0:
print(loss.mean())
    flow.checkpoint.save("./lenet_models_1")  # remove this folder first if it already exists
print("model saved")