Problems migrating ZooKeeper pods in Kubernetes
These notes are from a data-center move. The cluster's nodes were spread across two machine rooms connected by a leased line, and everything had to be consolidated into one of them.
That meant migrating the pods while machine resources were tight, and the ZooKeeper migration ran into problems.
// Check the zk pods from the master
[root@master-web-38 ~]# kubectl get pod -o wide | grep zk
zk-0   1/1   Running   0   7m   172.17.60.200   ht5.node
zk-1   1/1   Running   0   5h   172.17.205.86   ht23.node
zk-2   1/1   Running   0   5d   172.17.157.39   ht2.node
Export the zk StatefulSet YAML and inspect it:
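The export itself can be done as follows (a sketch; the output filename is my choice, not from the original session):

```shell
# Dump the live StatefulSet definition to a file for inspection.
kubectl get statefulset zk -o yaml > zk.yaml
```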
apiVersion: apps/v1
kind: StatefulSet
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"apps/v1","kind":"StatefulSet","metadata":{"annotations":{},"creationTimestamp":"2018-09-06T10:24:03Z","generation":2,"name":"zk","namespace":"default","resourceVersion":"123118858","selfLink":"/apis/apps/v1/namespaces/default/statefulsets/zk","uid":"fa5e482b-b1be-11e8-a33b-060eb4000e9d"},"spec":{"podManagementPolicy":"Parallel","replicas":3,"revisionHistoryLimit":10,"selector":{"matchLabels":{"app":"zk"}},"serviceName":"zk-cluster-svc","template":{"metadata":{"creationTimestamp":null,"labels":{"app":"zk"}},"spec":{"affinity":{"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"labelSelector":{"matchExpressions":[{"key":"app","operator":"In","values":["zk"]}]},"topologyKey":"kubernetes.io/hostname"}]}},"containers":[{"command":["sh","-c","start-zookeeper --servers=3 --data_dir=/var/lib/zookeeper/data --data_log_dir=/var/lib/zookeeper/data/log --conf_dir=/opt/zookeeper/conf --client_port=2181 --election_port=3888 --server_port=2888 --tick_time=2000 --init_limit=10 --sync_limit=5 --heap=4G --max_client_cnxns=60 --snap_retain_count=3 --purge_interval=12 --max_session_timeout=40000 --min_session_timeout=4000 --log_level=INFO"],"image":"127.0.0.1:35000/k8s.gcr.io/kubernetes-zookeeper:1.0-3.4.10","imagePullPolicy":"IfNotPresent","livenessProbe":{"exec":{"command":["sh","-c","zookeeper-ready 2181"]},"failureThreshold":3,"initialDelaySeconds":10,"periodSeconds":10,"successThreshold":1,"timeoutSeconds":5},"name":"kubernetes-zookeeper","ports":[{"containerPort":2181,"name":"client","protocol":"TCP"},{"containerPort":2888,"name":"server","protocol":"TCP"},{"containerPort":3888,"name":"leader-election","protocol":"TCP"}],"readinessProbe":{"exec":{"command":["sh","-c","zookeeper-ready 2181"]},"failureThreshold":3,"initialDelaySeconds":10,"periodSeconds":10,"successThreshold":1,"timeoutSeconds":5},"resources":{"requests":{"cpu":"2","memory":"6Gi"}},"terminationMessagePath":"/dev/termination-log","terminationMessagePolicy":"File","volumeMounts":[{"mountPath":"/var/lib/zookeeper","name":"datadir"}]}],"dnsPolicy":"ClusterFirst","nodeSelector":{"zk":"ht"},"restartPolicy":"Always","schedulerName":"default-scheduler","securityContext":{"fsGroup":1000,"runAsUser":1000},"terminationGracePeriodSeconds":30,"volumes":[{"hostPath":{"path":"/zookeeper","type":""},"name":"datadir"}]}},"updateStrategy":{"type":"RollingUpdate"}},"status":{"collisionCount":0,"currentReplicas":2,"currentRevision":"zk-59fd64cc84","observedGeneration":2,"readyReplicas":2,"replicas":3,"updateRevision":"zk-cc7b55c88","updatedReplicas":1}}
  creationTimestamp: null
  generation: 1
  name: zk
  selfLink: /apis/apps/v1/namespaces/default/statefulsets/zk
spec:
  podManagementPolicy: Parallel
  replicas: 3
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: zk
  serviceName: zk-cluster-svc
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: zk
    spec:
      affinity:
        # Anti-affinity forbids two zk pods on the same node: each replica needs
        # its own node, so with only one schedulable node the pod cannot be placed.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - zk
            topologyKey: kubernetes.io/hostname
      containers:
      - command:
        - sh
        - -c
        - start-zookeeper --servers=3 --data_dir=/var/lib/zookeeper/data --data_log_dir=/var/lib/zookeeper/data/log --conf_dir=/opt/zookeeper/conf --client_port=2181 --election_port=3888 --server_port=2888 --tick_time=2000 --init_limit=10 --sync_limit=5 --heap=4G --max_client_cnxns=60 --snap_retain_count=3 --purge_interval=12 --max_session_timeout=40000 --min_session_timeout=4000 --log_level=INFO
        image: 127.0.0.1:35000/k8s.gcr.io/kubernetes-zookeeper:1.0-3.4.10
        imagePullPolicy: IfNotPresent
        livenessProbe:
          exec:
            command:
            - sh
            - -c
            - zookeeper-ready 2181
          failureThreshold: 3
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 5
        name: kubernetes-zookeeper
        ports:
        - containerPort: 2181
          name: client
          protocol: TCP
        - containerPort: 2888
          name: server
          protocol: TCP
        - containerPort: 3888
          name: leader-election
          protocol: TCP
        readinessProbe:
          exec:
            command:
            - sh
            - -c
            - zookeeper-ready 2181
          failureThreshold: 3
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 5
        resources:
          requests:
            cpu: "2"
            memory: 6Gi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /var/lib/zookeeper
          name: datadir
      dnsPolicy: ClusterFirst
      nodeSelector:
        zk: ht
// Note: the node must carry the label the nodeSelector (zk: ht) asks for, e.g.: kubectl label nodes ht23.node zk=ht
// If the zk label already exists on the node, overwrite it: kubectl label nodes ht23.node zk=ht --overwrite
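As a quick check before rescheduling (a sketch against a live cluster; node names are from the output above), confirm which nodes actually satisfy the nodeSelector:

```shell
# List only the nodes carrying the label the StatefulSet's nodeSelector requires.
kubectl get nodes -l zk=ht

# Double-check all labels on one candidate node.
kubectl get node ht23.node --show-labels
```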
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext:
        fsGroup: 1000
        runAsUser: 1000
      terminationGracePeriodSeconds: 30
      volumes:
      - hostPath:
          path: /zookeeper
          type: ""
        name: datadir
  updateStrategy:
    type: RollingUpdate
status:
  replicas: 0
Two problems came up when the pods were rebuilt:
1. After deleting pod zk-1, the replacement failed with a ZooKeeper data-directory error.
2. After deleting pod zk-0, the anti-affinity rule required the replacement to land on a node with no other zk pod; otherwise scheduling failed.
Running kubectl describe pod zk-1 gives the following:
Events:
  Type     Reason                 Age                 From                Message
  ----     ------                 ----                ----                -------
  Normal   SuccessfulMountVolume  46m                 kubelet, ht23.node  MountVolume.SetUp succeeded for volume "datadir"
  Normal   SuccessfulMountVolume  46m                 kubelet, ht23.node  MountVolume.SetUp succeeded for volume "default-token-lgcgp"
  Normal   Created                46m (x4 over 46m)   kubelet, ht23.node  Created container
  Normal   Started                46m (x4 over 46m)   kubelet, ht23.node  Started container
  Warning  BackOff                45m (x10 over 46m)  kubelet, ht23.node  Back-off restarting failed container
  Normal   Pulled                 45m (x5 over 46m)   kubelet, ht23.node  Container image "127.0.0.1:35000/k8s.gcr.io/kubernetes-zookeeper:1.0-3.4.10" already present on machine
  Normal   Scheduled              1m                  default-scheduler   Successfully assigned zk-1 to ht23.node
// Pod zk-1 failed on creation, so check its logs:
[root@master-web-38 yaml]# kubectl log zk-1
log is DEPRECATED and will be removed in a future version. Use logs instead.
#This file was autogenerated DO NOT EDIT
clientPort=2181
dataDir=/var/lib/zookeeper/data
dataLogDir=/var/lib/zookeeper/data/log
tickTime=2000
initLimit=10
syncLimit=5
maxClientCnxns=60
minSessionTimeout=4000
maxSessionTimeout=40000
autopurge.snapRetainCount=3
autopurge.purgeInteval=12
server.1=zk-0.zk-cluster-svc.default.svc.cluster.local.:2888:3888
server.2=zk-1.zk-cluster-svc.default.svc.cluster.local.:2888:3888
server.3=zk-2.zk-cluster-svc.default.svc.cluster.local.:2888:3888
Creating ZooKeeper log4j configuration
mkdir: cannot create directory '/var/lib/zookeeper/data': Permission denied
chown: cannot access '/var/lib/zookeeper/data': No such file or directory
mkdir: cannot create directory '/var/lib/zookeeper/data': Permission denied
chown: invalid group: 'zookeeper:USER'
/usr/bin/start-zookeeper: line 176: /var/lib/zookeeper/data/myid: No such file or directory
The fix: on the node where zk-1 is scheduled, make sure the /zookeeper hostPath directory exists, then fix its permissions.
// On ht23.node the directory already exists, but it is owned by root:
[root@ht23 /]# ll
total 32
lrwxrwxrwx    1 root root    7 Sep 23 10:24 bin -> usr/bin
dr-xr-xr-x.   5 root root 4096 Sep 23 10:27 boot
drwxr-xr-x    3 root root   23 Feb 21 16:06 data
drwxr-xr-x   19 root root 3100 Feb 19 11:32 dev
drwxr-xr-x. 103 root root 8192 Feb 21 15:44 etc
drwxr-xr-x.   3 root root   19 Sep 23 11:18 home
lrwxrwxrwx    1 root root    7 Sep 23 10:24 lib -> usr/lib
lrwxrwxrwx    1 root root    9 Sep 23 10:24 lib64 -> usr/lib64
drwxr-xr-x.   2 root root    6 Apr 11  2018 media
drwxr-xr-x.   3 root root   18 Apr 11  2018 mnt
drwxr-xr-x.   4 root root   25 Sep 30 15:06 opt
dr-xr-xr-x  558 root root    0 Feb 19 11:32 proc
dr-xr-x---.   7 root root 4096 Feb 21 10:11 root
drwxr-xr-x   36 root root 1120 Feb 21 14:03 run
lrwxrwxrwx    1 root root    8 Sep 23 10:24 sbin -> usr/sbin
drwxr-xr-x.   2 root root    6 Apr 11  2018 srv
dr-xr-xr-x   13 root root    0 Feb 21 16:33 sys
drwxrwxrwt.   8 root root 4096 Feb 21 16:47 tmp
drwxr-xr-x.  13 root root 4096 Sep 23 10:24 usr
drwxr-xr-x.  22 root root 4096 Sep 23 10:24 var
drwxr-xr-x    2 root root    6 Dec 17 00:07 zookeeper
// For simplicity I just set it to 777 here
[root@ht23 /]# chmod 777 zookeeper/
[root@ht23 /]# chmod -R 777 zookeeper/
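777 works but is broader than needed: the pod's securityContext runs ZooKeeper as uid 1000 with fsGroup 1000, so matching the directory's ownership is a tighter fix. A minimal local sketch (a scratch directory stands in for /zookeeper; on the real node the chown needs root):

```shell
# Create the data directory the hostPath volume expects and hand it to the
# uid/gid the pod's securityContext declares (runAsUser/fsGroup: 1000).
dir="$(mktemp -d)/zookeeper"                    # stand-in for /zookeeper on the node
mkdir -p "$dir"
chown -R 1000:1000 "$dir" 2>/dev/null || true   # requires root; skipped otherwise
chmod 755 "$dir"                                # owner-writable is enough for uid 1000
stat -c '%a' "$dir"                             # prints 755
```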
// Then, after deleting the zk-0 pod, this error appeared:
Warning FailedScheduling 1m (x12 over 3m) default-scheduler 0/18 nodes are available: 1 node(s) didn't match pod affinity/anti-affinity, 1 node(s) didn't satisfy existing pods anti-affinity rules, 15 node(s) were unschedulable, 4 node(s) were not ready, 4 node(s) were out of disk space.
Warning FailedScheduling 14s (x7 over 4m) default-scheduler 0/18 nodes are available: 1 node(s) didn't match pod affinity/anti-affinity, 1 node(s) didn't satisfy existing pods anti-affinity rules, 15 node(s) were unschedulable, 4 node(s) were out of disk space, 5 node(s) were not ready.
In the end I added a freshly provisioned node to the cluster; once zk-0 was scheduled onto it, the problem was resolved.
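The "15 node(s) were unschedulable" count in the events above usually means cordoned nodes. When adding a machine is not an option, uncordoning a healthy node can free a slot that satisfies the anti-affinity rule (a sketch against a live cluster; the node name is a placeholder):

```shell
# See which nodes are Ready, NotReady, or SchedulingDisabled (cordoned).
kubectl get nodes

# Make a healthy node schedulable again so zk-0 has somewhere to land.
kubectl uncordon <node-name>
```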