A walkthrough of the k8s node decommissioning process

Node eviction

1. Eviction pre-check (optional)

Simulate scheduling to check whether the pods on the node could actually be rescheduled somewhere else before the drain starts.
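
The pre-check code itself is not shown in this post, so the following is only a rough sketch of the idea: add up the CPU/memory requests of the evictable pods on the node and compare them with the spare allocatable capacity of the remaining schedulable nodes. Taints, affinity and per-node bin-packing are ignored, and the function name precheckDrain is made up for illustration.

import (
    "context"

    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// precheckDrain is a rough feasibility check, not a real scheduler simulation.
func precheckDrain(ctx context.Context, cs kubernetes.Interface, nodeName string) (bool, error) {
    // Pods currently bound to the node that is going to be drained.
    podList, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
        FieldSelector: "spec.nodeName=" + nodeName,
    })
    if err != nil {
        return false, err
    }

    var needCPU, needMem resource.Quantity
    for i := range podList.Items {
        pod := &podList.Items[i]
        // DaemonSet pods stay on the node, so they do not need to be rescheduled.
        if ref := metav1.GetControllerOf(pod); ref != nil && ref.Kind == "DaemonSet" {
            continue
        }
        for _, c := range pod.Spec.Containers {
            needCPU.Add(*c.Resources.Requests.Cpu())
            needMem.Add(*c.Resources.Requests.Memory())
        }
    }

    // Spare capacity of the other schedulable nodes (allocatable only; subtracting
    // the requests already placed on them would make this more accurate).
    nodeList, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
    if err != nil {
        return false, err
    }
    var freeCPU, freeMem resource.Quantity
    for _, n := range nodeList.Items {
        if n.Name == nodeName || n.Spec.Unschedulable {
            continue
        }
        freeCPU.Add(*n.Status.Allocatable.Cpu())
        freeMem.Add(*n.Status.Allocatable.Memory())
    }
    return freeCPU.Cmp(needCPU) >= 0 && freeMem.Cmp(needMem) >= 0, nil
}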

2. SetPDB

  • Single-replica check: replicaSet, statefulSet, cloneSet

The pod's owning controller is obtained with controllerRef := metav1.GetControllerOf(&pod). If the owner is a replicaSet, statefulSet, or cloneSet and *v.Spec.Replicas == 1, the workload is a single instance and the eviction is blocked (see the sketch below).
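
A minimal sketch of this check, assuming pre-fetched lookup maps (rsByName / stsByName) and a made-up function name rather than the author's exact code; the CloneSet case would look the same but go through the OpenKruise clientset:

import (
    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// blocksDrainAsSingleReplica reports whether the pod belongs to a workload
// that only has one replica, in which case eviction is refused.
func blocksDrainAsSingleReplica(pod *corev1.Pod,
    rsByName map[string]*appsv1.ReplicaSet,
    stsByName map[string]*appsv1.StatefulSet) bool {

    controllerRef := metav1.GetControllerOf(pod)
    if controllerRef == nil {
        return false // unmanaged pod, handled elsewhere
    }
    key := pod.Namespace + "/" + controllerRef.Name
    switch controllerRef.Kind {
    case "ReplicaSet":
        if rs, ok := rsByName[key]; ok && rs.Spec.Replicas != nil && *rs.Spec.Replicas == 1 {
            return true
        }
    case "StatefulSet":
        if sts, ok := stsByName[key]; ok && sts.Spec.Replicas != nil && *sts.Spec.Replicas == 1 {
            return true
        }
    }
    // CloneSet (OpenKruise) would be checked the same way through its own clientset.
    return false
}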

  • A Pod Disruption Budget (PDB) limits, via a labelSelector, how many pods of a workload may be unavailable at the same time. It is a protection against voluntary disruptions such as proactively evicting pods. (If the node is NotReady, the PDB does not take effect, because a PDB only applies to voluntary disruptions.) Existing PDBs can be listed with kubectl get poddisruptionbudget -A
var pdbs []*v1beta1.PodDisruptionBudget
maxUnavailable := &intstr.IntOrString{
    Type: intstr.String,
    // If a percentage is used, Kubernetes rounds the computed pod count up
    StrVal: ntd.MaxUnavailable, // 25%
}
// Create a PDB for each ReplicaSet
for _, v := range rs {
    if *v.Spec.Replicas == 1 {
       continue
    }

    pdb := &v1beta1.PodDisruptionBudget{
       ObjectMeta: metav1.ObjectMeta{
          Name:      v.Name + "-pdb",
          Namespace: v.Namespace,
          Annotations: map[string]string{
             "NodeDrainTask" + strconv.Itoa(ntd.ID): "processing",
          },
       },
       Spec: v1beta1.PodDisruptionBudgetSpec{
          MaxUnavailable: maxUnavailable,
          Selector:       v.Spec.Selector,
       },
    }

    _, err := cc.ClientSet.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).Create(context.Background(), pdb, metav1.CreateOptions{})
    if err == nil {
       log.Info("SetPDB ReplicaSet create pdb: %s, %s", v.Namespace, pdb.Name)
       pdbs = append(pdbs, pdb)
       continue
    }
    if !apierrors.IsAlreadyExists(err) {
       return pdbs, err
    }
    patchData := map[string]interface{}{
       "metadata": map[string]map[string]string{
          "annotations": {
             "NodeDrainTask" + strconv.Itoa(ntd.ID): "processing",
          },
       },
    }

    payloadBytes, err := json.Marshal(patchData)
    if err != nil {
       return pdbs, err
    }
    _, err = cc.ClientSet.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).Patch(context.Background(), pdb.Name, types.StrategicMergePatchType, payloadBytes, metav1.PatchOptions{})
    if err != nil {
       return pdbs, err
    }
    log.Info("SetPDB ReplicaSet update pdb: %s, %s", v.Namespace, pdb.Name)
    pdbs = append(pdbs, pdb)
}

3. Mark the node unschedulable

kubectl cordon ${node}

err := cc.updateNodeByRetry(nodeName, func(node *v1.Node) {
    node.Spec.Unschedulable = true
})
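
updateNodeByRetry is not shown in the post; a plausible implementation (an assumption, not the author's code) is a get-mutate-update loop wrapped in client-go's retry.RetryOnConflict. ClusterClient here stands in for the post's cc receiver, and only its ClientSet field is assumed.

import (
    "context"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/util/retry"
)

// ClusterClient stands in for the post's cc receiver; only ClientSet is assumed.
type ClusterClient struct {
    ClientSet kubernetes.Interface
}

// updateNodeByRetry re-reads the Node and applies mutate() again whenever the
// update fails with a resource-version conflict.
func (cc *ClusterClient) updateNodeByRetry(nodeName string, mutate func(node *v1.Node)) error {
    return retry.RetryOnConflict(retry.DefaultRetry, func() error {
        node, err := cc.ClientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
        if err != nil {
            return err
        }
        mutate(node)
        _, err = cc.ClientSet.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{})
        return err
    })
}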

4. Pod eviction

Ignore DaemonSets and forcibly evict the pods owned by replicaSet, statefulSet, and cloneSet; this is equivalent to kubectl drain --force --ignore-daemonsets --delete-local-data ${node}

type DrainNodes struct {
    ClusterID          int               `json:"cluster_id"`
    Nodenames          []string          `json:"node_names"`
    Force              bool              `json:"force"` // true
    IgnoreDaemonsets   bool              `json:"ignore_daemonsets"` // true
    DeleteLocalData    bool              `json:"delete_local_data"` // true
    Timeout            int               `json:"timeout"` // 300
    GracePeriodSeconds int64             `json:"grace_period_seconds"` // 30
    ReplicaSetList     []*NamespacedName `json:"replica_set_list"`
    StatefulSetList    []*NamespacedName `json:"stateful_set_list"`
    CloneSetList       []*NamespacedName `json:"clone_set_list"`
}
pods, err := getPodsForDeletion(cc, node, options) 
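
getPodsForDeletion itself is not shown; roughly, it has to apply the same rules as kubectl drain. The sketch below is an assumption of that filtering, and it assumes DrainOptions carries Force / IgnoreDaemonsets / DeleteLocalData fields mirroring the DrainNodes request above.

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// filterPodsForDeletion applies kubectl-drain-style rules to the pods on the node.
func filterPodsForDeletion(pods []corev1.Pod, options *DrainOptions) ([]corev1.Pod, error) {
    var out []corev1.Pod
    for i := range pods {
        pod := pods[i]
        // Mirror (static) pods are managed by kubelet and cannot be evicted via the API.
        if _, isMirror := pod.Annotations[corev1.MirrorPodAnnotationKey]; isMirror {
            continue
        }
        controllerRef := metav1.GetControllerOf(&pod)
        // DaemonSet pods would be recreated on the node anyway, so they are skipped
        // when ignore_daemonsets is set.
        if controllerRef != nil && controllerRef.Kind == "DaemonSet" {
            if options.IgnoreDaemonsets {
                continue
            }
            return nil, fmt.Errorf("daemonset pod %s/%s blocks the drain", pod.Namespace, pod.Name)
        }
        // Unmanaged pods are only deleted when force is set.
        if controllerRef == nil && !options.Force {
            return nil, fmt.Errorf("unmanaged pod %s/%s blocks the drain", pod.Namespace, pod.Name)
        }
        // emptyDir data is lost on eviction; require delete_local_data.
        for _, vol := range pod.Spec.Volumes {
            if vol.EmptyDir != nil && !options.DeleteLocalData {
                return nil, fmt.Errorf("pod %s/%s uses emptyDir, set delete_local_data", pod.Namespace, pod.Name)
            }
        }
        out = append(out, pod)
    }
    return out, nil
}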

func evictPod(client typedpolicyv1beta1.PolicyV1beta1Interface, pod corev1.Pod, policyGroupVersion string, gracePeriodSeconds int64, options *DrainOptions) error {
    deleteOptions := &metav1.DeleteOptions{}
    var gracePeriod int64
    if gracePeriodSeconds >= 0 {
       gracePeriod = gracePeriodSeconds
    } else {
       gracePeriod = options.GracePeriodSeconds
    }
    deleteOptions.GracePeriodSeconds = &gracePeriod
    eviction := &policyv1beta1.Eviction{
       TypeMeta: metav1.TypeMeta{
          APIVersion: policyGroupVersion,
          Kind:       EvictionKind,
       },
       ObjectMeta: metav1.ObjectMeta{
          Name:      pod.Name,
          Namespace: pod.Namespace,
       },
       DeleteOptions: deleteOptions,
    }
    return client.Evictions(eviction.Namespace).Evict(context.Background(), eviction)
}
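
When a PDB would be violated, the eviction API answers 429 TooManyRequests, so the caller normally keeps retrying until the drain timeout expires. A hedged usage sketch follows; evictPodsWithRetry, the dn parameter, the 5-second poll interval, and the ClusterClient type sketched in step 3 are assumptions, not the post's own code.

import (
    "fmt"
    "time"

    corev1 "k8s.io/api/core/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
)

// evictPodsWithRetry evicts each pod, retrying while a PDB temporarily blocks it.
func evictPodsWithRetry(cc *ClusterClient, pods []corev1.Pod, policyGroupVersion string,
    dn *DrainNodes, options *DrainOptions) error {

    deadline := time.Now().Add(time.Duration(dn.Timeout) * time.Second)
    for _, pod := range pods {
        for {
            err := evictPod(cc.ClientSet.PolicyV1beta1(), pod, policyGroupVersion, dn.GracePeriodSeconds, options)
            if err == nil || apierrors.IsNotFound(err) {
                break // evicted, or the pod is already gone
            }
            if !apierrors.IsTooManyRequests(err) {
                return err // a real failure, not a PDB rejection
            }
            if time.Now().After(deadline) {
                return fmt.Errorf("timed out evicting pod %s/%s", pod.Namespace, pod.Name)
            }
            time.Sleep(5 * time.Second) // blocked by a PDB right now, try again later
        }
    }
    return nil
}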

Delete the node
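
Remove the Node object from the API server; this is equivalent to:

kubectl delete node ${node}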

cc.ClientSet.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{})

Cleanup

Very important: delete the kubelet configuration files and any leftover containers on the node.

# Stop kubelet first, otherwise deleted pods will be restarted
systemctl stop kubelet
# Stop the containers
docker stop $(docker ps -aq)
crictl stop $(crictl ps -aq)
# Clean up containerd-shim processes
kill -s 15 $(pidof containerd-shim)
kill -s 15 $(pidof containerd-shim-runc-v2)

docker rm $(docker ps -aq)
crictl rm $(crictl ps -aq)

# Stop the container runtimes (docker / containerd)
systemctl daemon-reload
systemctl stop docker
systemctl stop containerd

# Remove config files so the node does not rejoin the cluster after a reboot
rm -rf /etc/kubernetes/*
rm -rf /var/lib/kubelet/pki/*
rm -rf /root/.kube/*

# Decommission checks
systemctl daemon-reload
systemctl status docker | grep "active (running)"
if [[ $? == 0 ]]; then
    exit 1
fi
systemctl status containerd | grep "active (running)"
if [[ $? == 0 ]]; then
    exit 1
fi

if [ -f /etc/kubernetes/kubelet.conf ];then
    exit 1
fi
if pidof containerd-shim >/dev/null; then
    echo "Error: containerd-shim is running."
    exit 1
fi

if pidof containerd-shim-runc-v2 >/dev/null; then
    echo "Error: containerd-shim-runc-v2 is running."
    exit 1
fi