[ERROR: The pod's container terminates with Exit Code 132, and the pod log is empty.]

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-1-qv8wp 
Name:             dist-strat-example-worker-1-qv8wp
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-laptop/192.168.0.102
Start Time:       Wed, 14 Feb 2024 00:15:17 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=1
Annotations:      <none>
Status:           Running
IP:               10.244.1.194
IPs:
  IP:           10.244.1.194
Controlled By:  ReplicationController/dist-strat-example-worker-1
Containers:
  tensorflow:
    Container ID:  containerd://50de97cfb3a9b4d735826d6c77f7b21e7949251f15bd64ec388e08370827003b
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:18:28 +0800
      Finished:     Wed, 14 Feb 2024 00:18:29 +0800
    Ready:          False
    Restart Count:  5
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "1" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-6nvjs (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  kube-api-access-6nvjs:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type     Reason     Age                   From               Message
  ----     ------     ----                  ----               -------
  Normal   Scheduled  3m33s                 default-scheduler  Successfully assigned default/dist-strat-example-worker-1-qv8wp to maye-laptop
  Normal   Pulled     117s (x5 over 3m31s)  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal   Created    116s (x5 over 3m31s)  kubelet            Created container tensorflow
  Normal   Started    116s (x5 over 3m30s)  kubelet            Started container tensorflow
  Warning  BackOff    77s (x10 over 3m25s)  kubelet            Back-off restarting failed container tensorflow in pod dist-strat-example-worker-1-qv8wp_default(b38de917-de58-4bfc-8616-498bde7900a6)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

### The log of the failing pod is empty:
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl logs dist-strat-example-worker-1-qv8wp
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$

[ANALYSIS]

try 1:

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-ps-0-c5gcx
Name:             dist-strat-example-ps-0-c5gcx
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-inspiron-5547/192.168.0.104
Start Time:       Wed, 14 Feb 2024 00:15:18 +0800
Labels:           job=ps
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.0.173
IPs:
  IP:           10.244.0.173
Controlled By:  ReplicationController/dist-strat-example-ps-0
Containers:
  tensorflow:
    Container ID:  containerd://1020f3bbdd4bae086c7d60115e989c26e77257b815bdf312f7ca564aa0e4d855
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Running
      Started:      Wed, 14 Feb 2024 00:15:21 +0800
    Ready:          True
    Restart Count:  0
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-9dh2c (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  kube-api-access-9dh2c:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age    From               Message
  ----    ------     ----   ----               -------
  Normal  Scheduled  3m     default-scheduler  Successfully assigned default/dist-strat-example-ps-0-c5gcx to maye-inspiron-5547
  Normal  Pulled     2m57s  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal  Created    2m57s  kubelet            Created container tensorflow
  Normal  Started    2m56s  kubelet            Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-bhp4t 
Name:             dist-strat-example-worker-0-bhp4t
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-inspiron-5547/192.168.0.104
Start Time:       Wed, 14 Feb 2024 00:15:16 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.0.172
IPs:
  IP:           10.244.0.172
Controlled By:  ReplicationController/dist-strat-example-worker-0
Containers:
  tensorflow:
    Container ID:  containerd://116efacbe599cbd20d0a1ea16c869e1ff640fe02e92a4e636b0e583cc9231b64
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Running
      Started:      Wed, 14 Feb 2024 00:15:19 +0800
    Ready:          True
    Restart Count:  0
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-b7mpx (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  kube-api-access-b7mpx:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age    From               Message
  ----    ------     ----   ----               -------
  Normal  Scheduled  4m41s  default-scheduler  Successfully assigned default/dist-strat-example-worker-0-bhp4t to maye-inspiron-5547
  Normal  Pulled     4m39s  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal  Created    4m38s  kubelet            Created container tensorflow
  Normal  Started    4m38s  kubelet            Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-bhp4t 
Name:             dist-strat-example-worker-0-bhp4t
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-inspiron-5547/192.168.0.104
Start Time:       Wed, 14 Feb 2024 00:15:16 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.0.172
IPs:
  IP:           10.244.0.172
Controlled By:  ReplicationController/dist-strat-example-worker-0
Containers:
  tensorflow:
    Container ID:  containerd://116efacbe599cbd20d0a1ea16c869e1ff640fe02e92a4e636b0e583cc9231b64
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Running
      Started:      Wed, 14 Feb 2024 00:15:19 +0800
    Ready:          True
    Restart Count:  0
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-b7mpx (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  kube-api-access-b7mpx:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age    From               Message
  ----    ------     ----   ----               -------
  Normal  Scheduled  4m41s  default-scheduler  Successfully assigned default/dist-strat-example-worker-0-bhp4t to maye-inspiron-5547
  Normal  Pulled     4m39s  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal  Created    4m38s  kubelet            Created container tensorflow
  Normal  Started    4m38s  kubelet            Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

try 2:

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-slzhk 
Name:             dist-strat-example-worker-0-slzhk
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-laptop/192.168.0.102
Start Time:       Wed, 14 Feb 2024 00:25:14 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.1.196
IPs:
  IP:           10.244.1.196
Controlled By:  ReplicationController/dist-strat-example-worker-0
Containers:
  tensorflow:
    Container ID:  containerd://d49676ed9f5a21a371a38c2493136fb4d409d4e06645d2c3d84e576f3f499951
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:26:05 +0800
      Finished:     Wed, 14 Feb 2024 00:26:06 +0800
    Last State:     Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:25:37 +0800
      Finished:     Wed, 14 Feb 2024 00:25:38 +0800
    Ready:          False
    Restart Count:  3
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pv9zs (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  kube-api-access-pv9zs:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type     Reason     Age                From               Message
  ----     ------     ----               ----               -------
  Normal   Scheduled  66s                default-scheduler  Successfully assigned default/dist-strat-example-worker-0-slzhk to maye-laptop
  Normal   Pulled     16s (x4 over 64s)  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal   Created    15s (x4 over 64s)  kubelet            Created container tensorflow
  Normal   Started    15s (x4 over 63s)  kubelet            Started container tensorflow
  Warning  BackOff    12s (x4 over 57s)  kubelet            Back-off restarting failed container tensorflow in pod dist-strat-example-worker-0-slzhk_default(ef9139cd-ac67-463b-a0ec-886deb24b5a8)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-ps-0-tmd2r 
Name:             dist-strat-example-ps-0-tmd2r
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-inspiron-5547/192.168.0.104
Start Time:       Wed, 14 Feb 2024 00:25:14 +0800
Labels:           job=ps
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.0.177
IPs:
  IP:           10.244.0.177
Controlled By:  ReplicationController/dist-strat-example-ps-0
Containers:
  tensorflow:
    Container ID:  containerd://3045065757bbbd53aa8dc73e15393d50df44fde6eff15a9185ef6a9e14c0ccdd
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Running
      Started:      Wed, 14 Feb 2024 00:25:19 +0800
    Ready:          True
    Restart Count:  0
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-7qlfh (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  kube-api-access-7qlfh:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age    From               Message
  ----    ------     ----   ----               -------
  Normal  Scheduled  2m19s  default-scheduler  Successfully assigned default/dist-strat-example-ps-0-tmd2r to maye-inspiron-5547
  Normal  Pulled     2m16s  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal  Created    2m15s  kubelet            Created container tensorflow
  Normal  Started    2m14s  kubelet            Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-1-f8wx4 
Name:             dist-strat-example-worker-1-f8wx4
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-inspiron-5547/192.168.0.104
Start Time:       Wed, 14 Feb 2024 00:25:14 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=1
Annotations:      <none>
Status:           Running
IP:               10.244.0.176
IPs:
  IP:           10.244.0.176
Controlled By:  ReplicationController/dist-strat-example-worker-1
Containers:
  tensorflow:
    Container ID:  containerd://463e12aeb85d2f211b8965fc7f0c47745740f1e110e669c04b66868f4f54d07d
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Running
      Started:      Wed, 14 Feb 2024 00:25:18 +0800
    Ready:          True
    Restart Count:  0
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "1" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-br87m (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  kube-api-access-br87m:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age    From               Message
  ----    ------     ----   ----               -------
  Normal  Scheduled  3m     default-scheduler  Successfully assigned default/dist-strat-example-worker-1-f8wx4 to maye-inspiron-5547
  Normal  Pulled     2m57s  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal  Created    2m56s  kubelet            Created container tensorflow
  Normal  Started    2m56s  kubelet            Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

try 3:

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl logs  dist-strat-example-worker-0-wk9rh 
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod  dist-strat-example-worker-0-wk9rh 
Name:             dist-strat-example-worker-0-wk9rh
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-laptop/192.168.0.102
Start Time:       Wed, 14 Feb 2024 00:29:06 +0800
Labels:           job=worker
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.1.197
IPs:
  IP:           10.244.1.197
Controlled By:  ReplicationController/dist-strat-example-worker-0
Containers:
  tensorflow:
    Container ID:  containerd://81320099ecb3fbbeec8a0a0ccb96d123bdf92c9a37a7445cb679b4536bbc1169
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:29:31 +0800
      Finished:     Wed, 14 Feb 2024 00:29:32 +0800
    Last State:     Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:29:14 +0800
      Finished:     Wed, 14 Feb 2024 00:29:16 +0800
    Ready:          False
    Restart Count:  2
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-mtpq5 (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  kube-api-access-mtpq5:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type     Reason     Age                From               Message
  ----     ------     ----               ----               -------
  Normal   Scheduled  42s                default-scheduler  Successfully assigned default/dist-strat-example-worker-0-wk9rh to maye-laptop
  Normal   Pulled     18s (x3 over 40s)  kubelet            Container image "tf_std_server:v1" already present on machine
  Normal   Created    17s (x3 over 40s)  kubelet            Created container tensorflow
  Normal   Started    17s (x3 over 39s)  kubelet            Started container tensorflow
  Warning  BackOff    0s (x3 over 30s)   kubelet            Back-off restarting failed container tensorflow in pod dist-strat-example-worker-0-wk9rh_default(716eca50-d526-4ba4-9564-4cbef3b0ed1e)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod  dist-strat-example-ps-0-8wgfl
Name:             dist-strat-example-ps-0-8wgfl
Namespace:        default
Priority:         0
Service Account:  default
Node:             maye-laptop/192.168.0.102
Start Time:       Wed, 14 Feb 2024 00:29:07 +0800
Labels:           job=ps
                  name=dist-strat-example
                  task=0
Annotations:      <none>
Status:           Running
IP:               10.244.1.198
IPs:
  IP:           10.244.1.198
Controlled By:  ReplicationController/dist-strat-example-ps-0
Containers:
  tensorflow:
    Container ID:  containerd://5ced44622e84aa04df10f38eda1edaa030a87eb27a6d19436e34e34bb1cb03dd
    Image:         tf_std_server:v1
    Image ID:      sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
    Port:          5000/TCP
    Host Port:     0/TCP
    Command:
      /usr/bin/python
      /tf_std_server.py
      
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    132
      Started:      Wed, 14 Feb 2024 00:30:08 +0800
      Finished:     Wed, 14 Feb 2024 00:30:09 +0800
    Ready:          False
    Restart Count:  3
    Environment:
      TF_CONFIG:  { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-mvz2h (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  kube-api-access-mvz2h:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type     Reason     Age                 From               Message
  ----     ------     ----                ----               -------
  Normal   Scheduled  113s                default-scheduler  Successfully assigned default/dist-strat-example-ps-0-8wgfl to maye-laptop
  Normal   Created    52s (x4 over 107s)  kubelet            Created container tensorflow
  Normal   Started    52s (x4 over 106s)  kubelet            Started container tensorflow
  Warning  BackOff    15s (x7 over 100s)  kubelet            Back-off restarting failed container tensorflow in pod dist-strat-example-ps-0-8wgfl_default(878a4480-0c88-47e1-9a78-96969d96e01b)
  Normal   Pulled     0s (x5 over 109s)   kubelet            Container image "tf_std_server:v1" already present on machine
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ 

In this example, three pods are created: dist-strat-example-ps-0, dist-strat-example-worker-0 and dist-strat-example-worker-1. Whenever "Exit Code: 132" occurs, the failing pod has been scheduled onto node "maye-laptop", regardless of which pod it is. Whenever a pod is scheduled onto node "maye-inspiron-5547", it runs fine, again regardless of which pod it is. So it can be inferred that this error is caused by node "maye-laptop".

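Exit code 132 is 128 + 4, i.e. the process was killed by signal 4 (SIGILL, illegal instruction). This is consistent with the empty pod log: the process is killed by the signal instead of exiting with a Python error message.

According to [1], TensorFlow above version 1.5.0 requires AVX support, and the CPU of the remote server in that report has no AVX (cat /proc/cpuinfo ==> there is no "avx" in the flags). Original text of [1]: "1.5.0以上的tensorflow需要AVX支持,远端服务器CPU里没有avx(cat /proc/cpuinfo===>在flags里没有avx)".

The same check on the two nodes here shows that "maye-laptop" has no AVX, while "maye-Inspiron-5547" has both avx and avx2: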

maye@maye-laptop:~$ cat /proc/cpuinfo | grep avx
maye@maye-laptop:~$ 

(base) maye@maye-Inspiron-5547:~$ cat /proc/cpuinfo | grep avx
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
(base) maye@maye-Inspiron-5547:~$ 

References:


  1. https://www.zhihu.com/question/286687942/answer/2226002125