1. save the trained model

# in module file of tfx component trainer
def _apply_preprocessing(raw_features, tft_layer):
    """Run the transform layer over raw features and split off the label.

    Args:
        raw_features: Mapping of raw feature name to value; it may or may
            not contain `_LABEL_KEY`.
        tft_layer: Callable applied to `raw_features`; its result must
            support `pop`.

    Returns:
        A `(features, label)` pair. `label` is the entry popped from the
        transformed output under `_LABEL_KEY` when that key is present in
        `raw_features`; otherwise it is `None`.
    """
    outputs = tft_layer(raw_features)
    # The label is only looked up in the output when the raw input carried it.
    label = outputs.pop(_LABEL_KEY) if _LABEL_KEY in raw_features else None
    return outputs, label
    
### return a function which makes inference on raw features,
### it will be specified as signature of model.save().
def _get_serve_tf_examples_fn(model, tf_transform_output):
    """Build the serving function that scores serialized tf.Example records.

    The transform layer is stored on `model` as `model.tft_layer` before the
    serving function is defined; the serving function reads it from there.

    Args:
        model: Model that is called on the transformed features.
        tf_transform_output: Object providing `transform_features_layer()`
            and `raw_feature_spec()`.

    Returns:
        A `tf.function` that takes a 1-D string tensor named 'instances' and
        returns `model(transformed_features)`.
    """
    model.tft_layer = tf_transform_output.transform_features_layer()

    # The input is named "instances", which is the inputs key of
    # tensorflow/serving/predict.
    instances_spec = tf.TensorSpec(shape=[None], dtype=tf.string, name='instances')

    @tf.function(input_signature=[instances_spec])
    def serve_tf_examples_fn(serialized_tf_examples):
        raw_spec = tf_transform_output.raw_feature_spec()

        # Parse only the entries listed in _FEATURE_KEYS.
        serving_spec = {
            name: spec for name, spec in raw_spec.items() if name in _FEATURE_KEYS
        }
        parsed = tf.io.parse_example(serialized_tf_examples, serving_spec)

        transformed, _ = _apply_preprocessing(parsed, model.tft_layer)
        return model(transformed)

    return serve_tf_examples_fn


def _build_keras_model() -> tf.keras.Model:
    """Create and compile the binary classifier.

    One scalar input per key in `_FEATURE_KEYS` is concatenated, passed
    through two 16-unit tanh layers with L2 kernel regularization, and
    reduced to a single sigmoid output. The model is compiled with Adam on
    an inverse-time-decay learning-rate schedule and binary cross-entropy
    loss, and its summary is written through `logging.info`.

    Returns:
        The compiled Keras model.
    """
    eval_metrics = [
        keras.metrics.BinaryCrossentropy(name='cross entropy'),
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'),
    ]

    # One (1,)-shaped input per feature, named after the feature key.
    feature_inputs = [
        keras.layers.Input(shape=(1,), name=key) for key in _FEATURE_KEYS
    ]

    hidden = keras.layers.concatenate(feature_inputs)
    for _ in range(2):
        dense = keras.layers.Dense(
            16,
            activation='tanh',
            kernel_regularizer=keras.regularizers.l2(1e-5),
        )
        hidden = dense(hidden)
    probability = keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.Model(inputs=feature_inputs, outputs=probability)

    learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
        1e-3,
        decay_steps=_STEPS_PER_EPOCH * 1000,
        decay_rate=1,
        staircase=False,
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=eval_metrics,
    )

    model.summary(print_fn=logging.info)
    return model
    

def run_fn(fn_args: tfx.components.FnArgs):
    """Train the model and export it with a serving signature.

    Entry point in the module file of the TFX Trainer component. It builds
    the training and validation datasets, fits the Keras model, writes the
    training history to 'trainer_train_history.json' in the current working
    directory, and saves the model to `fn_args.serving_model_dir` with a
    'serving_default' signature.

    Args:
        fn_args: Trainer arguments. This function reads `transform_output`,
            `train_files`, `eval_files`, `data_accessor`, `train_steps`,
            `custom_config['epochs']` and `serving_model_dir`.
    """
    # Optional distributed training (disabled). To enable it, build a
    # ParameterServerStrategy and create the model inside strategy.scope():
    #
    #   cluster_dict = {
    #       "worker": ["10.105.206.29:5000", "10.102.137.138:5000"],
    #       "ps": ["10.105.27.97:5000"],
    #   }
    #   cluster_spec = tf.train.ClusterSpec(cluster_dict)
    #   cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    #       cluster_spec, rpc_layer="grpc")
    #   strategy = tf.distribute.ParameterServerStrategy(cluster_resolver)
    #   with strategy.scope():
    #       model = _build_keras_model()

    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

    train_dataset = _input_fn(
        fn_args.train_files,
        fn_args.data_accessor,
        tf_transform_output,
        batch_size=_TRAIN_BATCH_SIZE,
    )
    resampled_train_dataset = _resample_train_dataset(
        train_dataset, batch_size=_TRAIN_BATCH_SIZE)

    val_dataset = _input_fn(
        fn_args.eval_files,
        fn_args.data_accessor,
        tf_transform_output,
        batch_size=_EVAL_BATCH_SIZE,
    )
    # Repeated indefinitely; `validation_steps` below bounds each
    # validation pass.
    val_dataset = val_dataset.repeat()

    model = _build_keras_model()

    # Optional TensorBoard logging (disabled):
    #   log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    #   tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    backup_dir = os.path.join("/home/maye/maye_temp", "backup")
    callbacks = [
        tf.keras.callbacks.BackupAndRestore(backup_dir=backup_dir),
    ]

    trainer_train_history = model.fit(
        resampled_train_dataset,
        epochs=fn_args.custom_config['epochs'],
        steps_per_epoch=fn_args.train_steps,
        validation_data=val_dataset,
        validation_steps=3,
        callbacks=callbacks,
    )

    with open('trainer_train_history.json', 'w') as f:
        json.dump(trainer_train_history.history, f)

    # The `signatures` argument of model.save() specifies the functions
    # that tensorflow/serving will use.
    signatures = {
        'serving_default': _get_serve_tf_examples_fn(model, tf_transform_output),
    }

    # fn_args.serving_model_dir of the TFX Trainer component is
    # os.path.join(<uri-of-artifact-model-of-trainer>, "Format-Serving").
    # It must not be changed, otherwise other TFX components cannot find
    # the model artifact.
    model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)

This creates

SignatureDef: { 
  value: {
    inputs {
      key: "instances"
      value {
        name: "serving_default_instances:0"
        dtype: DT_STRING
        tensor_shape {
          dim {
            size: -1
          }
        }
      }
    } 
    method_name: "tensorflow/serving/predict"
  }
}

Note:

  1. Signature specifies what type of model is being exported, and the input/output tensors to bind to when running inference.
    The special signature key serving_default specifies the default serving signature. The default serving signature def key, along with other constants related to signatures, are defined as part of SavedModel signature constants.
    [1]
  2. The saved model has an attribute "signatures"; each entry in it is a function that can be called directly:
wafer_serving_model = tf.keras.models.load_model("/home/maye/maye_temp/wafer/347")

csv_example_train_filepath = 'pipelines/detect_anomolies_on_wafer_tfdv_schema/train_eval_data/train_data'
raw_dataset = tf.data.TFRecordDataset(csv_example_train_filepath)

# Each element of raw_dataset is a serialized tf.Example, i.e. bytes
# (tf.Example is a protobuf message).
for example in raw_dataset.take(1):
    # Pass the batch as a 1-D string tensor under the signature's input
    # key "instances".
    example_infer = wafer_serving_model.signatures['serving_default'](
        instances=tf.constant([example.numpy()]))

# Deserialize a few records to inspect their contents.
for raw_record in raw_dataset.take(3):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)
### This is tf.Example protobuf, one "features" = one example.
features {
  feature {
    key: "Class"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature_1"
    value {
      int64_list {
        value: 40
      }
    }
  }
  feature {
    key: "feature_10"
    value {
      int64_list {
        value: 0
      }
    }
  }
...
}

2. run container of tensorflow/serving

[2]

2.1 Download container image of tensorflow/serving

# Download the TensorFlow Serving container image and repo
nerdctl pull tensorflow/serving

git clone https://github.com/tensorflow/serving

Attention:
The image name tensorflow/serving is short for hub.docker.com/tensorflow/serving. hub.docker.com is not accessible in China, so pull the image from a mirror registry that is accessible there instead; refer to « Run a tfx pipeline using kubeflow pipeline: mirror websites of hub.docker.com » https://www.cnblogs.com/zhenxia-jiuyou/p/18003167
After pulling, change the image tag to "tensorflow/serving".

2.2 run container of tensorflow/serving

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_serving$ sudo nerdctl run -t --rm -p 8500:8500 -p 8501:8501 -v "/home/maye/maye_temp/wafer:/models/wafer" -e MODEL_NAME=wafer  tensorflow/serving 
[sudo] password for maye: 
2024-02-16 07:16:23.531943: I tensorflow_serving/model_servers/server.cc:74] Building single TensorFlow model file config:  model_name: half_plus_two model_base_path: /models/half_plus_two
2024-02-16 07:16:23.617629: I tensorflow_serving/model_servers/server_core.cc:467] Adding/updating models.
2024-02-16 07:16:23.617750: I tensorflow_serving/model_servers/server_core.cc:596]  (Re-)adding model: half_plus_two
2024-02-16 07:16:23.824056: I tensorflow_serving/core/basic_manager.cc:739] Successfully reserved resources to load servable {name: half_plus_two version: 123}
2024-02-16 07:16:23.824111: I tensorflow_serving/core/loader_harness.cc:66] Approving load for servable version {name: half_plus_two version: 123}
2024-02-16 07:16:23.824140: I tensorflow_serving/core/loader_harness.cc:74] Loading servable version {name: half_plus_two version: 123}
2024-02-16 07:16:23.841445: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /models/half_plus_two/00000123
2024-02-16 07:16:23.848473: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:51] Reading meta graph with tags { serve }
2024-02-16 07:16:23.848516: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:146] Reading SavedModel debug info (if present) from: /models/half_plus_two/00000123
2024-02-16 07:16:23.883025: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-16 07:16:24.032082: I external/org_tensorflow/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:382] MLIR V1 optimization pass is not enabled
2024-02-16 07:16:24.090038: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:233] Restoring SavedModel bundle.
2024-02-16 07:16:25.306378: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:217] Running initialization op on SavedModel bundle at path: /models/half_plus_two/00000123
2024-02-16 07:16:25.328697: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:316] SavedModel load for tags { serve }; Status: success: OK. Took 1487618 microseconds.
2024-02-16 07:16:25.329046: I tensorflow_serving/servables/tensorflow/saved_model_warmup_util.cc:80] No warmup data file found at /models/half_plus_two/00000123/assets.extra/tf_serving_warmup_requests
2024-02-16 07:16:25.429497: I tensorflow_serving/core/loader_harness.cc:95] Successfully loaded servable version {name: half_plus_two version: 123}
2024-02-16 07:16:25.430402: I tensorflow_serving/model_servers/server_core.cc:488] Finished adding/updating models
2024-02-16 07:16:25.430447: I tensorflow_serving/model_servers/server.cc:118] Using InsecureServerCredentials
2024-02-16 07:16:25.430496: I tensorflow_serving/model_servers/server.cc:383] Profiler service is enabled
2024-02-16 07:16:25.491575: I tensorflow_serving/model_servers/server.cc:409] Running gRPC ModelServer at 0.0.0.0:8500 ...
2024-02-16 07:16:25.499873: I tensorflow_serving/model_servers/server.cc:430] Exporting HTTP/REST API at:localhost:8501 ...
[evhttp_server.cc : 245] NET_LOG: Entering the event loop ...

Note:

  1. -p host-port:container-port -- map host-port to container port
    8500 is tensorflow/serving grpc API port, 8501 is tensorflow/serving REST API port. [3]
  2. -v model-path-on-host:model-path-on-container -- mount volume, namely mount model-path-on-host to model-path-on-container.
    Attention:
    1. There should be no number in the path; otherwise the error "invalid reference format" is raised, e.g. for "/home/maye/maye_temp/312/wafer".
    2. Inside the model path there should be a directory whose name is a number; it represents the model version,
      e.g. The model path on host in this example is: "/home/maye/maye_temp/wafer",
      inside it is:
(base) maye@maye-Inspiron-5547:~$ ls /home/maye/maye_temp/wafer
322  327  332  342  347
(base) maye@maye-Inspiron-5547:~$ 

tensorflow/serving detects the model versions inside the model path automatically. When a request is sent to tensorflow/serving without specifying a model version, i.e. the URL is http://10.4.0.9:8501/v1/models/wafer, it uses the latest version, i.e. the version directory whose name is the largest number.
To specify a model version, use the URL http://10.4.0.9:8501/v1/models/wafer/versions/<version-number>.

(base) maye@maye-Inspiron-5547:~$ curl -d '{"instances": [{"b64": ""}]}' -X POST http://10.4.0.9:8501/v1/models/wafer:predict

output: 
{
    "predictions": [[0.434313387]
    ]
}(base) maye@maye-Inspiron-5547:~$ 
  3. -e MODEL_NAME=wafer -- set the environment variable MODEL_NAME=wafer. Note that MODEL_NAME should be the name of the model directory.

3. send REST request to tensorflow/serving

Option 1: use curl

(base) maye@maye-Inspiron-5547:~$ curl -d '{"instances": [{"b64": ""}]}' -X POST http://10.4.0.9:8501/v1/models/wafer:predict

output: 
{
    "predictions": [[0.434313387]
    ]
}(base) maye@maye-Inspiron-5547:~$ 

Option 2: use python package "requests"

train_examples_file_path = os.path.join('pipelines/detect_anomolies_on_wafer_tfdv_schema/CsvExampleGen/examples/19', 'Split-train/data_tfrecord-00000-of-00001.gz')

# The file is gzip-compressed (.gz), so the reader must be told so.
raw_dataset = tf.data.TFRecordDataset(train_examples_file_path, compression_type="GZIP")

headers = {"Content-Type": "application/json"}

for example in raw_dataset.take(1):
    # example.numpy() is a serialized tf.Example (bytes). base64.b64encode
    # also returns bytes, which json.dumps cannot serialize, so decode the
    # result to str before putting it under the "b64" key.
    example_base64 = base64.b64encode(example.numpy()).decode('utf-8')

    pay_load = {"instances": [{"b64": example_base64}]}
    print(f"type(pay_load): {type(pay_load)}")

    # The request body must be a JSON string.
    pay_load_jsons = json.dumps(pay_load)

    response = requests.post('http://10.4.0.9:8501/v1/models/wafer:predict', headers=headers, data=pay_load_jsons)
    print(response.json())
type(pay_load): <class 'dict'>
{'predictions': [[0.434313387]]}

Attention:
1. In http://10.4.0.9:8501, "10.4.0.9" is the container IP of tensorflow/serving, or the host IP (obtained with $ ip addr); 8501 is the REST API port of tensorflow/serving. [4]

How to get container IP:

# get container-id of running container
nerdctl ps

# get container-IP
nerdctl inspect <container-id>
(base) maye@maye-Inspiron-5547:~$ sudo nerdctl inspect a876dff59e3b 
[
    {
        "Id": "a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53",
        "Created": "2024-02-16T10:16:17.10086903Z",
        "Path": "/usr/bin/tf_serving_entrypoint.sh",
        "Args": null,
        "State": {
            "Status": "running",
            "Running": true,
            "Paused": false,
            "Restarting": false,
            "Pid": 1090243,
            "ExitCode": 0,
            "Error": "",
            "FinishedAt": "0001-01-01T00:00:00Z"
        },
        "Image": "docker.io/tensorflow/serving:latest",
        "ResolvConfPath": "/var/lib/nerdctl/1935db59/containers/default/a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53/resolv.conf",
        "HostnamePath": "/var/lib/nerdctl/1935db59/containers/default/a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53/hostname",
        "LogPath": "/var/lib/nerdctl/1935db59/containers/default/a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53/a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53-json.log",
        "Name": "serving-a876d",
        "RestartCount": 0,
        "Driver": "overlayfs",
        "Platform": "linux",
        "AppArmorProfile": "nerdctl-default",
        "Mounts": [
            {
                "Type": "bind",
                "Source": "/home/maye/github_repository/tensorflow_serving/tensorflow_serving/servables/tensorflow/testdata/saved_model_half_plus_two_cpu",
                "Destination": "/models/half_plus_two",
                "Mode": "",
                "RW": true,
                "Propagation": ""
            }
        ],
        "Config": {
            "Hostname": "a876dff59e3b",
            "AttachStdin": false,
            "Labels": {
                "io.containerd.image.config.stop-signal": "SIGTERM",
                "nerdctl/extraHosts": "null",
                "nerdctl/hostname": "a876dff59e3b",
                "nerdctl/log-uri": "binary:///usr/local/bin/nerdctl?_NERDCTL_INTERNAL_LOGGING=%2Fvar%2Flib%2Fnerdctl%2F1935db59",
                "nerdctl/mounts": "[{\"Type\":\"bind\",\"Source\":\"/home/maye/github_repository/tensorflow_serving/tensorflow_serving/servables/tensorflow/testdata/saved_model_half_plus_two_cpu\",\"Destination\":\"/models/half_plus_two\",\"Mode\":\"\",\"RW\":false,\"Propagation\":\"\"}]",
                "nerdctl/name": "serving-a876d",
                "nerdctl/namespace": "default",
                "nerdctl/networks": "[\"bridge\"]",
                "nerdctl/platform": "linux/amd64",
                "nerdctl/ports": "[{\"HostPort\":8500,\"ContainerPort\":8500,\"Protocol\":\"tcp\",\"HostIP\":\"0.0.0.0\"},{\"HostPort\":8501,\"ContainerPort\":8501,\"Protocol\":\"tcp\",\"HostIP\":\"0.0.0.0\"}]",
                "nerdctl/state-dir": "/var/lib/nerdctl/1935db59/containers/default/a876dff59e3b37bd3963f67e079e9b2a584665cfa7dd1fd8794a4f7843fd0c53"
            }
        },
        "NetworkSettings": {
            "Ports": {
                "8500/tcp": [
                    {
                        "HostIp": "0.0.0.0",
                        "HostPort": "8500"
                    }
                ],
                "8501/tcp": [
                    {
                        "HostIp": "0.0.0.0",
                        "HostPort": "8501"
                    }
                ]
            },
            "GlobalIPv6Address": "",
            "GlobalIPv6PrefixLen": 0,
            "IPAddress": "10.4.0.7",
            "IPPrefixLen": 24,
            "MacAddress": "22:cf:98:69:69:c5",
            "Networks": {
                "unknown-eth0": {
                    "IPAddress": "10.4.0.7",
                    "IPPrefixLen": 24,
                    "GlobalIPv6Address": "",
                    "GlobalIPv6PrefixLen": 0,
                    "MacAddress": "22:cf:98:69:69:c5"
                }
            }
        }
    }
]
(base) maye@maye-Inspiron-5547:~$ 

Note:

  1. When the signature function has only one named input, set the value of the "instances" key to the value of that input.
  2. In this example, the signature function has one named input, "instances", whose datatype is string, because the function processes serialized tf.Example records, which are bytes. If the input is not a serialized tf.Example, it raises the error: "can not parse example".
  3. The request body for predict API must be JSON object formatted as follows:
{
  // (Optional) Serving signature to use.
  // If unspecified, the default serving signature is used.
  "signature_name": <string>,

  // Input Tensors in row ("instances") or columnar ("inputs") format.
  // A request can have either of them but NOT both.
  "instances": <value>|<(nested)list>|<list-of-objects>
  "inputs": <value>|<(nested)list>|<object>
}

A JSON object is a data structure made of key-value pairs.
The request body should be a JSON string, namely: '{"instances": [instance_1, instance_2, ...]}'.
A value in a JSON object can be a primitive datatype (bool, int, float, string), a list, or another JSON object; it cannot be bytes, otherwise Python raises the error: "TypeError: Object of type bytes is not JSON serializable".
Bytes need to be encoded with base64 and then sent using the request body:
'{"instances": [{"b64": <encoded-base64-string-of-instance_1>}, ...]}'
[5]

Attention:
<encoded-base64-string-of-instance_1> must be a plain string without the "b" prefix of a Python bytes literal (i.e. decode the base64 bytes to str first), or it will raise the error: "json parse error".

Note

  1. What is base64?
Base64,顾名思义,就是包括小写字母a-z、大写字母A-Z、数字0-9、符号"+"、"/"一共64个字符的字符集,(另加一个“=”,实际是65个字符,至于为什么还会有一个“=",这个后面再说)。任何符号都可以转换成这个字符集中的字符,这个转换过程就叫做base64编码。
  
 首先将字符串(图片等)转换成二进制序列,然后按每6个二进制位为一组,分成若干组,如果不足6位,则低位补0。每6位组成一个新的字节,高位补00,构成一个新的二进制序列,最后根据base64索引表中的值找到对应的字符。
  
 我们举个例子, 假设有字符串“abc", 我们要对其进行base64编码,最后结果会是什么呢?
字符串abc对应3个字节,一共24位,按6位为一组可分为4组,在每组的高位补上00,经过转换,abc 的 base64 编码是 YWJj, 由原来的3个字母变成了4个,所以base64会比原字符串更长。

  # 编码
>>> base64.b64encode(b'abc')
b'YWJj'

# 解码
>>> base64.b64decode(b'YWJj')
b'abc'

[6]

Base64是一种基于64个可打印字符来表示二进制数据的表示方法。由于2^6=64,所以每6个比特为一个单元,对应某个可打印字符。3个字节有24个比特,对应于4个Base64单元,即3个字节可由4个可打印字符来表示。在Base64中的可打印字符包括字母A-Z、a-z、数字0-9,这样共有62个字符,此外两个可打印符号在不同的系统中而不同。

python标准库中提供了base64模块,用来进行转换
base64.b64encode() 将bytes类型数据进行base64编码,返回编码后的bytes类型
base64.b64decode() 将base64编码的bytes类型进行解码,返回解码后的bytes类型

>>> import base64
>>> s
b'\x80\x03}q\x00(X\x01\x00\x00\x001q\x01}q\x02(X\x05\x00\x00\x00countq\x03K\nX\x08\x00\x00\x00selectedq\x04\x88uX\x01\x00\x00\x002q\x05}q\x06(h\x03K\x14h\x04\x89uu.'
>>> b = base64.b64encode(s)
>>> b
b'gAN9cQAoWAEAAAAxcQF9cQIoWAUAAABjb3VudHEDSwpYCAAAAHNlbGVjdGVkcQSIdVgBAAAAMnEFfXEGKGgDSxRoBIl1dS4='
>>> base64.b64decode(b)
b'\x80\x03}q\x00(X\x01\x00\x00\x001q\x01}q\x02(X\x05\x00\x00\x00countq\x03K\nX\x08\x00\x00\x00selectedq\x04\x88uX\x01\x00\x00\x002q\x05}q\x06(h\x03K\x14h\x04\x89uu.'

[7]

  2. How to convert base64-encoded bytes to a character string:
import base64

# raw_bytes stands for the bytes to encode; it is not defined in this snippet.
base64_encoded_bytes = base64.b64encode(raw_bytes)

# The text inside " " of base64_encoded_string is the same as
# the text inside b" " of base64_encoded_bytes.
base64_encoded_string = base64_encoded_bytes.decode('utf-8')

[8]

# Print the same base64 payload twice: first as bytes, then as text.
for example in raw_dataset.take(1):
    serialized = example.numpy()
    example_base64 = base64.b64encode(serialized)
    example_base64_decode = str(example_base64, 'utf-8')

    print(example_base64)
    print("\n")
    print(example_base64_decode)
b''

vs




References:


  1. https://tensorflow.google.cn/tfx/serving/signature_defs?hl=en ↩︎

  2. https://github.com/tensorflow/serving ↩︎

  3. https://note.qidong.name/2018/11/tensorflow-serving/ ↩︎

  4. https://blog.csdn.net/zb1165048017/article/details/100527880 ↩︎

  5. https://tensorflow.google.cn/tfx/serving/api_rest?hl=en#encoding_binary_values ↩︎

  6. https://zhuanlan.zhihu.com/p/339477329 ↩︎

  7. https://www.cnblogs.com/oscarli/p/12441049.html ↩︎

  8. https://blog.51cto.com/u_16213372/9138546 ↩︎