[ERROR: Container terminates with Exit Code 132, and the pod log is empty.]
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-1-qv8wp
Name: dist-strat-example-worker-1-qv8wp
Namespace: default
Priority: 0
Service Account: default
Node: maye-laptop/192.168.0.102
Start Time: Wed, 14 Feb 2024 00:15:17 +0800
Labels: job=worker
name=dist-strat-example
task=1
Annotations: <none>
Status: Running
IP: 10.244.1.194
IPs:
IP: 10.244.1.194
Controlled By: ReplicationController/dist-strat-example-worker-1
Containers:
tensorflow:
Container ID: containerd://50de97cfb3a9b4d735826d6c77f7b21e7949251f15bd64ec388e08370827003b
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:18:28 +0800
Finished: Wed, 14 Feb 2024 00:18:29 +0800
Ready: False
Restart Count: 5
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "1" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-6nvjs (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-6nvjs:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 3m33s default-scheduler Successfully assigned default/dist-strat-example-worker-1-qv8wp to maye-laptop
Normal Pulled 117s (x5 over 3m31s) kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 116s (x5 over 3m31s) kubelet Created container tensorflow
Normal Started 116s (x5 over 3m30s) kubelet Started container tensorflow
Warning BackOff 77s (x10 over 3m25s) kubelet Back-off restarting failed container tensorflow in pod dist-strat-example-worker-1-qv8wp_default(b38de917-de58-4bfc-8616-498bde7900a6)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
### The pod log is empty (`kubectl logs` prints nothing):
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl logs dist-strat-example-worker-1-qv8wp
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
[ANALYSIS]
try 1:
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-ps-0-c5gcx
Name: dist-strat-example-ps-0-c5gcx
Namespace: default
Priority: 0
Service Account: default
Node: maye-inspiron-5547/192.168.0.104
Start Time: Wed, 14 Feb 2024 00:15:18 +0800
Labels: job=ps
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.0.173
IPs:
IP: 10.244.0.173
Controlled By: ReplicationController/dist-strat-example-ps-0
Containers:
tensorflow:
Container ID: containerd://1020f3bbdd4bae086c7d60115e989c26e77257b815bdf312f7ca564aa0e4d855
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Running
Started: Wed, 14 Feb 2024 00:15:21 +0800
Ready: True
Restart Count: 0
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-9dh2c (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kube-api-access-9dh2c:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 3m default-scheduler Successfully assigned default/dist-strat-example-ps-0-c5gcx to maye-inspiron-5547
Normal Pulled 2m57s kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 2m57s kubelet Created container tensorflow
Normal Started 2m56s kubelet Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-bhp4t
Name: dist-strat-example-worker-0-bhp4t
Namespace: default
Priority: 0
Service Account: default
Node: maye-inspiron-5547/192.168.0.104
Start Time: Wed, 14 Feb 2024 00:15:16 +0800
Labels: job=worker
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.0.172
IPs:
IP: 10.244.0.172
Controlled By: ReplicationController/dist-strat-example-worker-0
Containers:
tensorflow:
Container ID: containerd://116efacbe599cbd20d0a1ea16c869e1ff640fe02e92a4e636b0e583cc9231b64
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Running
Started: Wed, 14 Feb 2024 00:15:19 +0800
Ready: True
Restart Count: 0
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-b7mpx (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kube-api-access-b7mpx:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 4m41s default-scheduler Successfully assigned default/dist-strat-example-worker-0-bhp4t to maye-inspiron-5547
Normal Pulled 4m39s kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 4m38s kubelet Created container tensorflow
Normal Started 4m38s kubelet Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-bhp4t
Name: dist-strat-example-worker-0-bhp4t
Namespace: default
Priority: 0
Service Account: default
Node: maye-inspiron-5547/192.168.0.104
Start Time: Wed, 14 Feb 2024 00:15:16 +0800
Labels: job=worker
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.0.172
IPs:
IP: 10.244.0.172
Controlled By: ReplicationController/dist-strat-example-worker-0
Containers:
tensorflow:
Container ID: containerd://116efacbe599cbd20d0a1ea16c869e1ff640fe02e92a4e636b0e583cc9231b64
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Running
Started: Wed, 14 Feb 2024 00:15:19 +0800
Ready: True
Restart Count: 0
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-b7mpx (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kube-api-access-b7mpx:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 4m41s default-scheduler Successfully assigned default/dist-strat-example-worker-0-bhp4t to maye-inspiron-5547
Normal Pulled 4m39s kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 4m38s kubelet Created container tensorflow
Normal Started 4m38s kubelet Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
try 2:
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-slzhk
Name: dist-strat-example-worker-0-slzhk
Namespace: default
Priority: 0
Service Account: default
Node: maye-laptop/192.168.0.102
Start Time: Wed, 14 Feb 2024 00:25:14 +0800
Labels: job=worker
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.1.196
IPs:
IP: 10.244.1.196
Controlled By: ReplicationController/dist-strat-example-worker-0
Containers:
tensorflow:
Container ID: containerd://d49676ed9f5a21a371a38c2493136fb4d409d4e06645d2c3d84e576f3f499951
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:26:05 +0800
Finished: Wed, 14 Feb 2024 00:26:06 +0800
Last State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:25:37 +0800
Finished: Wed, 14 Feb 2024 00:25:38 +0800
Ready: False
Restart Count: 3
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-pv9zs (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-pv9zs:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 66s default-scheduler Successfully assigned default/dist-strat-example-worker-0-slzhk to maye-laptop
Normal Pulled 16s (x4 over 64s) kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 15s (x4 over 64s) kubelet Created container tensorflow
Normal Started 15s (x4 over 63s) kubelet Started container tensorflow
Warning BackOff 12s (x4 over 57s) kubelet Back-off restarting failed container tensorflow in pod dist-strat-example-worker-0-slzhk_default(ef9139cd-ac67-463b-a0ec-886deb24b5a8)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-ps-0-tmd2r
Name: dist-strat-example-ps-0-tmd2r
Namespace: default
Priority: 0
Service Account: default
Node: maye-inspiron-5547/192.168.0.104
Start Time: Wed, 14 Feb 2024 00:25:14 +0800
Labels: job=ps
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.0.177
IPs:
IP: 10.244.0.177
Controlled By: ReplicationController/dist-strat-example-ps-0
Containers:
tensorflow:
Container ID: containerd://3045065757bbbd53aa8dc73e15393d50df44fde6eff15a9185ef6a9e14c0ccdd
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Running
Started: Wed, 14 Feb 2024 00:25:19 +0800
Ready: True
Restart Count: 0
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-7qlfh (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kube-api-access-7qlfh:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 2m19s default-scheduler Successfully assigned default/dist-strat-example-ps-0-tmd2r to maye-inspiron-5547
Normal Pulled 2m16s kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 2m15s kubelet Created container tensorflow
Normal Started 2m14s kubelet Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-1-f8wx4
Name: dist-strat-example-worker-1-f8wx4
Namespace: default
Priority: 0
Service Account: default
Node: maye-inspiron-5547/192.168.0.104
Start Time: Wed, 14 Feb 2024 00:25:14 +0800
Labels: job=worker
name=dist-strat-example
task=1
Annotations: <none>
Status: Running
IP: 10.244.0.176
IPs:
IP: 10.244.0.176
Controlled By: ReplicationController/dist-strat-example-worker-1
Containers:
tensorflow:
Container ID: containerd://463e12aeb85d2f211b8965fc7f0c47745740f1e110e669c04b66868f4f54d07d
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Running
Started: Wed, 14 Feb 2024 00:25:18 +0800
Ready: True
Restart Count: 0
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "1" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-br87m (ro)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kube-api-access-br87m:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 3m default-scheduler Successfully assigned default/dist-strat-example-worker-1-f8wx4 to maye-inspiron-5547
Normal Pulled 2m57s kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 2m56s kubelet Created container tensorflow
Normal Started 2m56s kubelet Started container tensorflow
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
try 3:
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl logs dist-strat-example-worker-0-wk9rh
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-worker-0-wk9rh
Name: dist-strat-example-worker-0-wk9rh
Namespace: default
Priority: 0
Service Account: default
Node: maye-laptop/192.168.0.102
Start Time: Wed, 14 Feb 2024 00:29:06 +0800
Labels: job=worker
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.1.197
IPs:
IP: 10.244.1.197
Controlled By: ReplicationController/dist-strat-example-worker-0
Containers:
tensorflow:
Container ID: containerd://81320099ecb3fbbeec8a0a0ccb96d123bdf92c9a37a7445cb679b4536bbc1169
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:29:31 +0800
Finished: Wed, 14 Feb 2024 00:29:32 +0800
Last State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:29:14 +0800
Finished: Wed, 14 Feb 2024 00:29:16 +0800
Ready: False
Restart Count: 2
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "worker", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-mtpq5 (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-mtpq5:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 42s default-scheduler Successfully assigned default/dist-strat-example-worker-0-wk9rh to maye-laptop
Normal Pulled 18s (x3 over 40s) kubelet Container image "tf_std_server:v1" already present on machine
Normal Created 17s (x3 over 40s) kubelet Created container tensorflow
Normal Started 17s (x3 over 39s) kubelet Started container tensorflow
Warning BackOff 0s (x3 over 30s) kubelet Back-off restarting failed container tensorflow in pod dist-strat-example-worker-0-wk9rh_default(716eca50-d526-4ba4-9564-4cbef3b0ed1e)
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$ kubectl describe pod dist-strat-example-ps-0-8wgfl
Name: dist-strat-example-ps-0-8wgfl
Namespace: default
Priority: 0
Service Account: default
Node: maye-laptop/192.168.0.102
Start Time: Wed, 14 Feb 2024 00:29:07 +0800
Labels: job=ps
name=dist-strat-example
task=0
Annotations: <none>
Status: Running
IP: 10.244.1.198
IPs:
IP: 10.244.1.198
Controlled By: ReplicationController/dist-strat-example-ps-0
Containers:
tensorflow:
Container ID: containerd://5ced44622e84aa04df10f38eda1edaa030a87eb27a6d19436e34e34bb1cb03dd
Image: tf_std_server:v1
Image ID: sha256:117ff425f04f86b62e85a1a7ca654d0c36e9c8ac3bcc78f413984e5cbddb8421
Port: 5000/TCP
Host Port: 0/TCP
Command:
/usr/bin/python
/tf_std_server.py
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Exit Code: 132
Started: Wed, 14 Feb 2024 00:30:08 +0800
Finished: Wed, 14 Feb 2024 00:30:09 +0800
Ready: False
Restart Count: 3
Environment:
TF_CONFIG: { "cluster": { "worker": ["dist-strat-example-worker-0:5000","dist-strat-example-worker-1:5000"], "ps": ["dist-strat-example-ps-0:5000"], "chief": ["dist-strat-example-chief:5000"]}, "task": { "type": "ps", "index": "0" } }
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-mvz2h (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-mvz2h:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 113s default-scheduler Successfully assigned default/dist-strat-example-ps-0-8wgfl to maye-laptop
Normal Created 52s (x4 over 107s) kubelet Created container tensorflow
Normal Started 52s (x4 over 106s) kubelet Started container tensorflow
Warning BackOff 15s (x7 over 100s) kubelet Back-off restarting failed container tensorflow in pod dist-strat-example-ps-0-8wgfl_default(878a4480-0c88-47e1-9a78-96969d96e01b)
Normal Pulled 0s (x5 over 109s) kubelet Container image "tf_std_server:v1" already present on machine
(base) maye@maye-Inspiron-5547:~/github_repository/tensorflow_ecosystem/distribution_strategy$
In this example, three pods are created: dist-strat-example-ps-0, dist-strat-example-worker-0 and dist-strat-example-worker-1. Every time "Error, Exit Code: 132" occurs, the failing pod has been scheduled to node "maye-laptop", regardless of which pod it is. Whenever a pod is scheduled to node "maye-inspiron-5547", it runs fine, again regardless of which pod it is. So it can be inferred that the error is caused by node "maye-laptop", not by any particular pod.
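According to [1] (translated from Chinese): TensorFlow versions above 1.5.0 require AVX support, and the remote server's CPU has no AVX (cat /proc/cpuinfo ==> "avx" is missing from the flags). [1]
This matches the exit code: 132 = 128 + 4, i.e. the process was killed by signal 4 (SIGILL, illegal instruction), which is what happens when a binary built with AVX instructions runs on a CPU without AVX. The process dies before Python can print anything, which is why the pod log is empty.
Checking both nodes confirms that maye-laptop has no "avx" flag, while maye-Inspiron-5547 has both "avx" and "avx2":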
maye@maye-laptop:~$ cat /proc/cpuinfo | grep avx
maye@maye-laptop:~$
(base) maye@maye-Inspiron-5547:~$ cat /proc/cpuinfo | grep avx
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt dtherm ida arat pln pts md_clear flush_l1d
(base) maye@maye-Inspiron-5547:~$
References: