Walkthrough of the k8s node decommissioning process
Node eviction
1. Eviction pre-check (optional)
Simulate scheduling to check whether the pods on the node can actually be rescheduled elsewhere.
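The pre-check itself is not shown in the code below; one rough way to approximate it is to verify, for every pod on the node, that at least one other schedulable node matches its nodeSelector. The sketch below is only that approximation (the helper name canReschedule is made up here; the real scheduler also weighs resources, taints/tolerations and affinity):

// Rough feasibility check only, not the real scheduler.
// needs corev1 "k8s.io/api/core/v1" and "k8s.io/apimachinery/pkg/labels"
func canReschedule(nodes []corev1.Node, pod corev1.Pod, drainedNode string) bool {
    selector := labels.SelectorFromSet(labels.Set(pod.Spec.NodeSelector))
    for _, n := range nodes {
        // skip the node being drained and any cordoned node
        if n.Name == drainedNode || n.Spec.Unschedulable {
            continue
        }
        if selector.Matches(labels.Set(n.Labels)) {
            return true
        }
    }
    return false
}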
2. SetPDB
- Single-replica detection: ReplicaSet, StatefulSet, CloneSet
controllerRef := metav1.GetControllerOf(&pod)
If the pod's parent controller is a ReplicaSet, StatefulSet, or CloneSet, check *v.Spec.Replicas == 1;
a single replica means draining would take the workload down entirely, so the eviction is blocked.
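A sketch of that check (isSingleReplica and the rsByKey lookup are illustrative, not from the original code; StatefulSet and CloneSet are handled the same way with their own clients):

// needs appsv1 "k8s.io/api/apps/v1" and corev1 "k8s.io/api/core/v1"
// rsByKey is assumed to be a pre-fetched "namespace/name" -> *appsv1.ReplicaSet lookup.
func isSingleReplica(pod *corev1.Pod, rsByKey map[string]*appsv1.ReplicaSet) bool {
    controllerRef := metav1.GetControllerOf(pod)
    if controllerRef == nil || controllerRef.Kind != "ReplicaSet" {
        return false
    }
    rs, ok := rsByKey[pod.Namespace+"/"+controllerRef.Name]
    // exactly one replica -> block the drain
    return ok && rs.Spec.Replicas != nil && *rs.Spec.Replicas == 1
}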
A Pod Disruption Budget (PDB) limits, via its labelSelector, how many of a workload's pods may be disrupted while pods are being terminated. It is a safeguard against voluntary disruptions, i.e. actively evicting pods. (If the node is NotReady, the PDB has no effect, because PDBs only apply to voluntary disruptions.)
kubectl get poddisruptionbudget -A
var pdbs []*v1beta1.PodDisruptionBudget
maxUnavailable := &intstr.IntOrString{
    Type: intstr.String,
    // When a percentage is given, Kubernetes rounds the computed pod count up
    StrVal: ntd.MaxUnavailable, // 25%
}
// Create a PDB for each ReplicaSet
for _, v := range rs {
    if *v.Spec.Replicas == 1 {
        continue
    }
    pdb := &v1beta1.PodDisruptionBudget{
        ObjectMeta: metav1.ObjectMeta{
            Name:      v.Name + "-pdb",
            Namespace: v.Namespace,
            Annotations: map[string]string{
                "NodeDrainTask" + strconv.Itoa(ntd.ID): "processing",
            },
        },
        Spec: v1beta1.PodDisruptionBudgetSpec{
            MaxUnavailable: maxUnavailable,
            Selector:       v.Spec.Selector,
        },
    }
    _, err := cc.ClientSet.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).Create(context.Background(), pdb, metav1.CreateOptions{})
    if err == nil {
        log.Info("SetPDB ReplicaSet create pdb: %s, %s", v.Namespace, pdb.Name)
        pdbs = append(pdbs, pdb)
        continue
    }
    if !apierrors.IsAlreadyExists(err) {
        return pdbs, err
    }
    // The PDB already exists: just add this task's annotation via a strategic merge patch
    patchData := map[string]interface{}{
        "metadata": map[string]map[string]string{
            "annotations": {
                "NodeDrainTask" + strconv.Itoa(ntd.ID): "processing",
            },
        },
    }
    payloadBytes, err := json.Marshal(patchData)
    if err != nil {
        return pdbs, err
    }
    _, err = cc.ClientSet.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).Patch(context.Background(), pdb.Name, types.StrategicMergePatchType, payloadBytes, metav1.PatchOptions{})
    if err != nil {
        return pdbs, err
    }
    log.Info("SetPDB ReplicaSet update pdb: %s, %s", v.Namespace, pdb.Name)
    pdbs = append(pdbs, pdb)
}
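How these PDBs are removed afterwards is not shown in the original code; presumably the objects collected in pdbs are deleted once the drain task finishes. A hedged sketch of such a cleanup:

// Hypothetical cleanup (assumption, not from the original code): drop the PDBs that
// SetPDB collected once the drain task is done, ignoring objects that are already gone.
for _, pdb := range pdbs {
    err := cc.ClientSet.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).Delete(context.Background(), pdb.Name, metav1.DeleteOptions{})
    if err != nil && !apierrors.IsNotFound(err) {
        return err
    }
}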
3. Mark the node unschedulable
kubectl cordon ${node}
err := cc.updateNodeByRetry(nodeName, func(node *v1.Node) {
    node.Spec.Unschedulable = true
})
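updateNodeByRetry is not shown in the snippet; a minimal sketch, assuming it is a get-mutate-update wrapper around client-go's retry.RetryOnConflict (the ClusterClient receiver type is also an assumption):

// Sketch only: re-fetch and re-apply the mutation whenever the update hits a conflict.
// needs "k8s.io/client-go/util/retry"
func (cc *ClusterClient) updateNodeByRetry(nodeName string, mutate func(*v1.Node)) error {
    return retry.RetryOnConflict(retry.DefaultRetry, func() error {
        node, err := cc.ClientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
        if err != nil {
            return err
        }
        mutate(node)
        _, err = cc.ClientSet.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
        return err
    })
}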
4. Pod eviction
Ignore DaemonSets and force-evict the pods owned by ReplicaSets, StatefulSets and CloneSets, equivalent to kubectl drain --force --ignore-daemonsets --delete-local-data ${node}
type DrainNodes struct {
    ClusterID          int               `json:"cluster_id"`
    Nodenames          []string          `json:"node_names"`
    Force              bool              `json:"force"`                // true
    IgnoreDaemonsets   bool              `json:"ignore_daemonsets"`    // true
    DeleteLocalData    bool              `json:"delete_local_data"`    // true
    Timeout            int               `json:"timeout"`              // 300
    GracePeriodSeconds int64             `json:"grace_period_seconds"` // 30
    ReplicaSetList     []*NamespacedName `json:"replica_set_list"`
    StatefulSetList    []*NamespacedName `json:"stateful_set_list"`
    CloneSetList       []*NamespacedName `json:"clone_set_list"`
}
pods, err := getPodsForDeletion(cc, node, options)
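getPodsForDeletion is not shown either; the usual filtering (as kubectl drain does) is to list the pods bound to the node, skip mirror (static) pods, and skip DaemonSet-owned pods when daemonsets are ignored. A sketch under those assumptions (the ClusterClient type and the DrainOptions.IgnoreDaemonsets field are assumed here):

// Sketch of the pod selection: list the pods bound to the node and drop the ones that
// should not (or cannot) be evicted. needs "k8s.io/apimachinery/pkg/fields"
func getPodsForDeletion(cc *ClusterClient, node string, options *DrainOptions) ([]corev1.Pod, error) {
    podList, err := cc.ClientSet.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(), metav1.ListOptions{
        FieldSelector: fields.OneTermEqualSelector("spec.nodeName", node).String(),
    })
    if err != nil {
        return nil, err
    }
    var pods []corev1.Pod
    for _, pod := range podList.Items {
        // Static (mirror) pods are managed directly by kubelet and cannot be evicted.
        if _, ok := pod.Annotations[corev1.MirrorPodAnnotationKey]; ok {
            continue
        }
        // DaemonSet pods are skipped when ignore_daemonsets is set.
        if ref := metav1.GetControllerOf(&pod); ref != nil && ref.Kind == "DaemonSet" && options.IgnoreDaemonsets {
            continue
        }
        pods = append(pods, pod)
    }
    return pods, nil
}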
func evictPod(client typedpolicyv1beta1.PolicyV1beta1Interface, pod corev1.Pod, policyGroupVersion string, gracePeriodSeconds int64, options *DrainOptions) error {
    deleteOptions := &metav1.DeleteOptions{}
    var gracePeriod int64
    if gracePeriodSeconds >= 0 {
        gracePeriod = gracePeriodSeconds
    } else {
        gracePeriod = options.GracePeriodSeconds
    }
    deleteOptions.GracePeriodSeconds = &gracePeriod
    eviction := &policyv1beta1.Eviction{
        TypeMeta: metav1.TypeMeta{
            APIVersion: policyGroupVersion,
            Kind:       EvictionKind,
        },
        ObjectMeta: metav1.ObjectMeta{
            Name:      pod.Name,
            Namespace: pod.Namespace,
        },
        DeleteOptions: deleteOptions,
    }
    return client.Evictions(eviction.Namespace).Evict(context.Background(), eviction)
}
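If an eviction would violate a PDB, the Eviction API returns 429 Too Many Requests; the caller typically keeps retrying until the configured timeout and only treats other errors as fatal. An illustrative retry loop (evictWithRetry and the DrainOptions.Timeout field are assumptions, not the original implementation):

// Illustrative only: retry evictions rejected by a PDB (HTTP 429) until the timeout expires.
func evictWithRetry(client typedpolicyv1beta1.PolicyV1beta1Interface, pod corev1.Pod, policyGroupVersion string, options *DrainOptions) error {
    deadline := time.Now().Add(time.Duration(options.Timeout) * time.Second)
    for {
        err := evictPod(client, pod, policyGroupVersion, options.GracePeriodSeconds, options)
        if err == nil || apierrors.IsNotFound(err) {
            return nil // evicted, or the pod is already gone
        }
        if !apierrors.IsTooManyRequests(err) {
            return err // a real failure, not a PDB rejection
        }
        if time.Now().After(deadline) {
            return fmt.Errorf("evicting %s/%s timed out waiting for PDB: %w", pod.Namespace, pod.Name, err)
        }
        time.Sleep(5 * time.Second) // blocked by a PDB right now; back off and retry
    }
}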
Delete the node
cc.ClientSet.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{})
Cleanup
Very important: delete the kubelet configuration files and any leftover containers on the node.
# Stop kubelet first, otherwise the deleted pods will be restarted
systemctl stop kubelet
# Remove the containers
docker stop $(docker ps -aq)
crictl stop $(crictl ps -aq)
# Clean up the containerd shims
kill -s 15 $(pidof containerd-shim)
kill -s 15 $(pidof containerd-shim-runc-v2)
docker rm $(docker ps -aq)
crictl rm $(crictl ps -aq)
# Stop the container runtimes (docker / containerd)
systemctl daemon-reload
systemctl stop docker
systemctl stop containerd
# Remove the config files so the node cannot rejoin the cluster after a reboot
rm -rf /etc/kubernetes/*
rm -rf /var/lib/kubelet/pki/*
rm -rf /root/.kube/*
# Decommission checks
systemctl daemon-reload
systemctl status docker | grep "active (running)"
if [[ $? == 0 ]]; then
    exit 1
fi
systemctl status containerd | grep "active (running)"
if [[ $? == 0 ]]; then
    exit 1
fi
if [ -f /etc/kubernetes/kubelet.conf ]; then
    exit 1
fi
if pidof containerd-shim >/dev/null; then
    echo "Error: containerd-shim is running."
    exit 1
fi
if pidof containerd-shim-runc-v2 >/dev/null; then
    echo "Error: containerd-shim-runc-v2 is running."
    exit 1
fi