k8s 集群
# k8s 集群
192.168.71.201 k8s-master01 master01
192.168.71.202 k8s-master02 master02 harbor
192.168.71.203 k8s-master03 master03
192.168.71.204 k8s-node01 node01
192.168.71.205 k8s-node02 node02
k8s-master01 Ready master 170m v1.16.0
k8s-master02 Ready master 167m v1.16.0
k8s-master03 Ready master 164m v1.16.0
k8s-node01 Ready <none> 162m v1.16.0
k8s-node02 Ready <none> 162m v1.16.0
集群虚拟ip 192.168.71.200
k8s 高可用集群部署参照之前文档: https://www.cnblogs.com/lixinliang/p/12217033.html
故障重现
#制造故障
将第一个master 节点关机
因check_haproxy.sh 脚本,虚拟ip会漂移至 master02 节点
故障恢复
# 获取etcd 集群故障 etcd member id
首先在一台健康的master02 上运行下面的命令获取etcd集群中故障member的ID
# Locate the etcd container on this node in one awk pass
# (skip the pause "POD" sandbox container; take the first match).
ETCD=$(docker ps | awk '/etcd/ && !/POD/ {print $1; exit}')
# Query etcd (v2 API) cluster health via the local member,
# authenticating with the etcd peer certificate pair.
docker exec \
  -it "${ETCD}" \
  etcdctl \
  --endpoints https://127.0.0.1:2379 \
  --ca-file /etc/kubernetes/pki/etcd/ca.crt \
  --cert-file /etc/kubernetes/pki/etcd/peer.crt \
  --key-file /etc/kubernetes/pki/etcd/peer.key \
  cluster-health
结果显示报错: member 19c5f5e4748dc98b is unreachable
那么故障id 为: 19c5f5e4748dc98b
#删除故障etcd member id
由于故障节点已经被重置,因此相当于该ID对应的ETCD实例已经丢失,无法再取得联系。因此直接运行下面命令将故障的member从etcd集群中删除。
# Locate the etcd container on this node in one awk pass
# (skip the pause "POD" sandbox container; take the first match).
ETCD=$(docker ps | awk '/etcd/ && !/POD/ {print $1; exit}')
# Remove the unreachable member from the etcd cluster.  The failed node
# was reset, so its etcd instance is gone for good; deleting the stale
# member ID is required before a rebuilt node can rejoin.
docker exec \
  -it "${ETCD}" \
  etcdctl \
  --endpoints https://127.0.0.1:2379 \
  --ca-file /etc/kubernetes/pki/etcd/ca.crt \
  --cert-file /etc/kubernetes/pki/etcd/server.crt \
  --key-file /etc/kubernetes/pki/etcd/server.key \
  member remove 19c5f5e4748dc98b
# 再次查看etcd 集群状态
再次查看只剩下两个etcd 节点为 healthy 状态
# Re-check cluster health; only the two surviving members should report
# healthy now.  Reuses ${ETCD} captured by the previous snippet.
docker exec \
  -it "${ETCD}" \
  etcdctl \
  --endpoints https://127.0.0.1:2379 \
  --ca-file /etc/kubernetes/pki/etcd/ca.crt \
  --cert-file /etc/kubernetes/pki/etcd/peer.crt \
  --key-file /etc/kubernetes/pki/etcd/peer.key \
  cluster-health
# 加入新节点
+ 基础配置
* 将新机器的主机名修改为原来 master01 的主机名
* /etc/hosts 文件保持同步
* 免密登录
+ 准备 keepalived、haproxy 配置文件
直接拷贝原来 master01 的相关配置,并启动服务
+ 分发证书
在 master02 节点分发证书至 master01
#!/bin/bash
# Distribute the cluster CA material and the admin kubeconfig from this
# healthy master to the rebuilt master node(s).
set -euo pipefail

# Shared PKI files every control-plane node must carry,
# relative to /etc/kubernetes.
certs=(
  pki/ca.crt pki/ca.key
  pki/sa.key pki/sa.pub
  pki/front-proxy-ca.crt pki/front-proxy-ca.key
  pki/etcd/ca.crt pki/etcd/ca.key
)

for index in 201; do
  ip="192.168.71.${index}"
  # Make sure the target directories exist before copying.
  ssh "$ip" "mkdir -p /etc/kubernetes/pki/etcd ~/.kube"
  for f in "${certs[@]}"; do
    scp "/etc/kubernetes/${f}" "${ip}:/etc/kubernetes/${f}"
  done
  # admin.conf doubles as the root user's kubectl config.
  scp /etc/kubernetes/admin.conf "${ip}:/etc/kubernetes/admin.conf"
  scp /etc/kubernetes/admin.conf "${ip}:~/.kube/config"
done
+ 在 master01 准备 kubeadm_master01.conf 配置文件
随后将新的(初始化过的)节点加入到集群中,重新组成三节点的HA master,注意重建master的过程中使用了kubeadm的配置文件,该配置文件为HA master首次部署过程中使用过的,此处直接复用该配置文件。
注意: 以下文件需要修改两处地方,不能拿原来配置文件直接使用
$ cat kubeadm_master01.conf
apiVersion: kubeadm.k8s.io/v1beta1
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 192.168.71.201
  bindPort: 6443
---
apiVersion: kubeadm.k8s.io/v1beta1
kind: ClusterConfiguration
kubernetesVersion: v1.16.0
controlPlaneEndpoint: "192.168.71.200:8443"
imageRepository: registry.aliyuncs.com/google_containers
apiServer:
  certSANs:
  - "master01"
  - "master02"
  - "master03"
  - 192.168.71.201
  - 192.168.71.202
  - 192.168.71.203
  - 192.168.71.200
networking:
  podSubnet: "10.244.0.0/16"
  serviceSubnet: "10.96.0.0/12"
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
etcd:
  local:
    extraArgs:
      listen-client-urls: "https://127.0.0.1:2379,https://192.168.71.201:2379"
      advertise-client-urls: "https://192.168.71.201:2379"
      listen-peer-urls: "https://192.168.71.201:2380"
      initial-advertise-peer-urls: "https://192.168.71.201:2380"
      initial-cluster: "k8s-master01=https://192.168.71.201:2380,k8s-master02=https://192.168.71.202:2380,k8s-master03=https://192.168.71.203:2380"
      # Change #1: initial-cluster must list the NAME=peer-URL pair of
      # every etcd member, including this rebuilt node, or the new
      # etcd instance will fail to start.
      initial-cluster-state: existing
      # Change #2: the cluster already exists, so change the original
      # "new" to "existing".  (If the failed master was not master01,
      # this line needs no change.)
    serverCertSANs:
    - master01
    - 192.168.71.201
    peerCertSANs:
    - master01
    - 192.168.71.201
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
mode: ipvs
在 master01 执行:
# 1. Regenerate all certificates for this node from the shared CAs.
kubeadm init phase certs all --config kubeadm_master01.conf
# 2. Write the static pod manifest for the local etcd member.
kubeadm init phase etcd local --config kubeadm_master01.conf
# 3. Generate the kubelet kubeconfig.
kubeadm init phase kubeconfig kubelet --config kubeadm_master01.conf
# 4. Start kubelet.
kubeadm init phase kubelet-start --config kubeadm_master01.conf
# 5. Register master01's etcd as a new member, run from a healthy master.
#    NOTE: the member name must match the name used in initial-cluster
#    ("k8s-master01"), otherwise the rejoining etcd fails its cluster
#    configuration validation on startup.
kubectl exec -n kube-system etcd-k8s-master02 -- etcdctl \
  --ca-file /etc/kubernetes/pki/etcd/ca.crt \
  --cert-file /etc/kubernetes/pki/etcd/peer.crt \
  --key-file /etc/kubernetes/pki/etcd/peer.key \
  --endpoints=https://192.168.71.202:2379 \
  member add k8s-master01 https://192.168.71.201:2380
# 6. Generate the remaining kubeconfigs and the kube-apiserver,
#    kube-controller-manager and kube-scheduler static pods.
kubeadm init phase kubeconfig all --config kubeadm_master01.conf
kubeadm init phase control-plane all --config kubeadm_master01.conf
# 7. Label/taint the node as a control-plane (master) node.
kubeadm init phase mark-control-plane --config kubeadm_master01.conf
# 8. Verify the node list.
kubectl get nodes
# 再次查看etcd 集群状态
[root@k8s-master02 ~]# docker exec -it ${ETCD} etcdctl --endpoints https://127.0.0.1:2379 --ca-file /etc/kubernetes/pki/etcd/ca.crt --cert-file /etc/kubernetes/pki/etcd/peer.crt --key-file /etc/kubernetes/pki/etcd/peer.key cluster-health
member 858768c8e151d5d8 is healthy: got healthy result from https://192.168.71.202:2379
member c79fe8ecd577a746 is healthy: got healthy result from https://192.168.71.203:2379
member e2892a4ec808af4e is healthy: got healthy result from https://192.168.71.201:2379
cluster is healthy
正常显示 etcd 集群,证明 master01 已修复。