kubeadm Etcd高可用恢复方案

基于kubeadm Etcd集群数据恢复方案

etcd HA集群中单台etcd故障不更换ip恢复方式

概述

单个master节点(etcd成员)故障的场景:将90节点(10.53.6.90)作为故障节点从etcd集群中移除,再在不更换ip的情况下进行修复

 

 

查看当前节点状态

/opt# kubectl -nkube-system get po|grep etcd
    
 # 查看etcd集群状态
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
    
 # 查看节点成员
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

 模拟90节点宕机

# 模拟宕机节点,90节点上操作
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`

/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
/opt# kubectl -nkube-system get po|grep etcd

# 移除节点成员
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 8e363d6f244214f6"

 修复节点

snap# systemctl stop kubelet

/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`
snap# ll -h /var/lib/etcd/member/

# 添加
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt  member add bj-idc1-10-53-6-90-10.53.6.90 --peer-urls='https://10.53.6.90:2380'"

# 查看
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

snap# systemctl start kubelet

# 90节点查看,数据已同步过来
snap# ll -h /var/lib/etcd/member/snap/

模拟etcd节点宕机,并将新节点加入集群

概述

只要是同一集群中的node,并且使用的是由集群ca签发的证书,都可以作为etcd的扩展节点来使用

 清除故障节点信息

/opt# kubectl -nkube-system get po|grep etcd
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"

# 查看节点成员
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

# 移除节点成员
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 9de8f041daa634"

# 模拟宕机节点
/opt# rm -rf /etc/kubernetes/manifests/etcd.yaml
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`

 使用集群ca签发etcd证书(server/peer)

~# mkdir ~/caadd103
# 需要依赖集群本身的ca证书
~/caadd103# cp ../openssl/ca.* .

~/caadd103# vi server.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
basicConstraints = CA:FALSE
extendedKeyUsage = clientAuth, serverAuth
keyUsage = nonRepudiation, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103

~/caadd103# vi peer.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
extendedKeyUsage = clientAuth, serverAuth
keyUsage = critical, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103

~/caadd103# openssl genrsa -out server.key 4096
~/caadd103# openssl req -new -key server.key -out server.csr -subj "/CN=10.53.5.165" -config server.cnf
~/caadd103# openssl x509 -req -in server.csr -CA ca.crt \
    -CAkey ca.key -CAcreateserial \
    -out server.crt -days 1825 \
    -extfile server.cnf -extensions v3_req

~/caadd103# openssl genrsa -out peer.key 4096
~/caadd103# openssl req -new -key peer.key -out peer.csr \
    -subj "/CN=10.53.5.165" \
    -config peer.cnf
~/caadd103# openssl x509 -req -in peer.csr \
    -CA ca.crt -CAkey ca.key -CAcreateserial \
    -out peer.crt -days 1825 \
    -extfile peer.cnf -extensions v3_req

~# cd ~
~# scp -i diamond.yaml -r ~/caadd103 ubuntu@10.53.4.103:/home/ubuntu

 新节点103操作

caadd103# cd ~/caadd103
caadd103# cp ca.crt ca.key peer.crt peer.key server.crt server.key /etc/kubernetes/pki/etcd/
caadd103# ll /etc/kubernetes/pki/etcd/

 加入成员

# 添加
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt  member add bj-idc1-10-53-4-103-10.53.4.103 --peer-urls='https://10.53.4.103:2380'"

# 查看
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

 添加etcd.yaml

caadd103# vi /etc/kubernetes/manifests/etcd.yaml
# 关键配置
    - --advertise-client-urls=https://10.53.4.103:2379
    - --initial-advertise-peer-urls=https://10.53.4.103:2380
    - --initial-cluster=wangshile-vendor-4-10.53.5.165=https://10.53.5.165:2380,bj-idc1-10-53-4-221-10.53.4.221=https://10.53.4.221:2380,bj-idc1-10-53-4-103-10.53.4.103=https://10.53.4.103:2380
    - --initial-cluster-state=existing
    - --listen-client-urls=https://127.0.0.1:2379,https://10.53.4.103:2379
    - --listen-peer-urls=https://10.53.4.103:2380
    - --name=bj-idc1-10-53-4-103-10.53.4.103
caadd103# docker ps -a | grep etcd
caadd103# netstat -tnlp| grep etcd

~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
~# kubectl -nkube-system get po| grep etcd

 另外还得修改apiserver参数

- --etcd-servers=https://10.53.5.165:2379,https://10.53.4.221:2379,https://10.53.4.103:2379

实验总结

1. 模拟扩展中产生的问题:后加入的etcd节点,其--initial-cluster参数只能填写集群当前实际存在的成员(含本节点),当前有几个就填几个,多填会报错

# 原节点再次加入集群时,需要修改配置 - --initial-cluster-state=existing,并清除该节点数据

2020-06-18 06:35:08.560068 E | rafthttp: request cluster ID mismatch (got cdf818194e3a8c32 want 4ec9131dabe34047)
2020-06-18 06:35:08.560482 E | rafthttp: request sent was ignored (cluster ID mismatch: peer[22efed4061b48b44]=cdf818194e3a8c32, local=4ec9131dabe34047)

 2. 问题描述:etcd成员的peer URL的值为localhost

etcdmain: error validating peerURLs {ClusterID:e7d9e721043e9bfa Members:[&{ID:a11f7c7b82c99552 RaftAttributes:{PeerURLs:[https://localhost:2380]} Attributes:{Name:"infra1" ClientURLs:[https://localhost:2379]}} &{ID:f7bd378938cf704d RaftAttributes:{PeerURLs:[https://10.10.30.53:2380]} Attributes:{Name: ClientURLs:[]}} &{ID:842133e992b120ec RaftAttributes:{PeerURLs:[https://10.10.30.51:2380]} Attributes:{Name:"infra0" ClientURLs:[https://10.10.30.51:2379]}}] RemovedMemberIDs:[]}: unmatched member while checking PeerURLs

在大神的带领下排查,确认是peer-url的问题;进入pod中执行member list查看,该成员的peer URL确实有问题
/ # etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list
8e9e05c52164694d, started, wangshile-vendor-4-10.53.5.165, http://localhost:2380, https://10.53.5.165:2379

排查其他节点,均不存在peer URL为http://localhost:2380的问题
之后通过手动执行member update修改该成员的peer URL,解决了该问题,可以保证etcd能够进行扩展
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

docker run --rm --net=host -v '/etc/kubernetes/pki/etcd-certs:/etc/kubernetes/pki/etcd-certs' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd-certs/client.pem --key=/etc/kubernetes/pki/etcd-certs/client-key.pem --cacert=/etc/kubernetes/pki/etcd-certs/ca.pem member update 8e9e05c52164694d https://10.53.5.165:2380"

 其他操作

# 数据恢复
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl snapshot restore --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt '/backup/0617_snapshot.db';cp -r /default.etcd/member/* /var/lib/etcd/member/"

# 备份命令
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt snapshot save /backup/$(date +%m%d)_snapshot.db"

 

posted @ 2020-06-26 17:04  Wshile  阅读(1317)  评论(0)  编辑  收藏  举报