Kubernetes DeploymentController Source Code Walkthrough

Note: unless a specific version is stated, this Kubernetes source-analysis series uses the latest master branch.

A DeploymentController object is created by the NewDeploymentController function, so let's start there.

// NewDeploymentController creates a new DeploymentController.
func NewDeploymentController(ctx context.Context, dInformer appsinformers.DeploymentInformer, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, client clientset.Interface) (*DeploymentController, error) {
    eventBroadcaster := record.NewBroadcaster()
    logger := klog.FromContext(ctx)
    dc := &DeploymentController{
        client:           client,
        eventBroadcaster: eventBroadcaster,
        eventRecorder:    eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "deployment-controller"}),
        queue:            workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "deployment"),
    }
    dc.rsControl = controller.RealRSControl{
        KubeClient: client,
        Recorder:   dc.eventRecorder,
    }

    dInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            dc.addDeployment(logger, obj)
        },
        UpdateFunc: func(oldObj, newObj interface{}) {
            dc.updateDeployment(logger, oldObj, newObj)
        },
        // This will enter the sync loop and no-op, because the deployment has been deleted from the store.
        DeleteFunc: func(obj interface{}) {
            dc.deleteDeployment(logger, obj)
        },
    })
    rsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc: func(obj interface{}) {
            dc.addReplicaSet(logger, obj)
        },
        UpdateFunc: func(oldObj, newObj interface{}) {
            dc.updateReplicaSet(logger, oldObj, newObj)
        },
        DeleteFunc: func(obj interface{}) {
            dc.deleteReplicaSet(logger, obj)
        },
    })
    podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        DeleteFunc: func(obj interface{}) {
            dc.deletePod(logger, obj)
        },
    })

    dc.syncHandler = dc.syncDeployment
    dc.enqueueDeployment = dc.enqueue

    dc.dLister = dInformer.Lister()
    dc.rsLister = rsInformer.Lister()
    dc.podLister = podInformer.Lister()
    dc.dListerSynced = dInformer.Informer().HasSynced
    dc.rsListerSynced = rsInformer.Informer().HasSynced
    dc.podListerSynced = podInformer.Informer().HasSynced
    return dc, nil
}

 

NewDeploymentController first constructs the DeploymentController (referred to as dc below) and then registers event handlers on the Informers passed in. dc cares about three kinds of resources: Deployment, ReplicaSet, and Pod.

Watching Deployments needs no explanation; let's look at why dc also needs to watch the other two resources, ReplicaSet and Pod.

The ReplicaSet Informer registers all three handlers, so all three kinds of changes are of interest. The corresponding dc handler methods each contain a very similar snippet like this one (taken from addReplicaSet):

    // If it has a ControllerRef, that's all that matters.
    if controllerRef := metav1.GetControllerOf(rs); controllerRef != nil {
        d := dc.resolveControllerRef(rs.Namespace, controllerRef)
        if d == nil {
            return
        }
        logger.V(4).Info("ReplicaSet added", "replicaSet", klog.KObj(rs))
        dc.enqueueDeployment(d)
        return
    }

    // Otherwise, it's an orphan. Get a list of all matching Deployments and sync
    // them to see if anyone wants to adopt it.
    ds := dc.getDeploymentsForReplicaSet(logger, rs)
    if len(ds) == 0 {
        return
    }
    logger.V(4).Info("Orphan ReplicaSet added", "replicaSet", klog.KObj(rs))
    for _, d := range ds {
        dc.enqueueDeployment(d)
    }

 

The comments already say it well: normally a ReplicaSet has a ControllerRef pointing at a Deployment; if it does not, it is an orphan. In that case dc looks up all Deployments whose selector matches the ReplicaSet and enqueues them, so that one of them can adopt it during the next sync.
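For intuition, matching an orphan ReplicaSet to candidate Deployments boils down to namespace-scoped label-selector matching. The sketch below is an illustration only, not the upstream getDeploymentsForReplicaSet (which goes through the Deployment lister); the helper name and the in-memory Deployment list are made up for the example.

package main

import (
	"fmt"

	apps "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
)

// deploymentsForReplicaSet is a simplified stand-in for dc.getDeploymentsForReplicaSet:
// return every Deployment in the same namespace whose selector matches the RS labels.
func deploymentsForReplicaSet(rs *apps.ReplicaSet, deployments []*apps.Deployment) []*apps.Deployment {
	var matching []*apps.Deployment
	for _, d := range deployments {
		if d.Namespace != rs.Namespace {
			continue
		}
		selector, err := metav1.LabelSelectorAsSelector(d.Spec.Selector)
		if err != nil || selector.Empty() {
			continue
		}
		if selector.Matches(labels.Set(rs.Labels)) {
			matching = append(matching, d)
		}
	}
	return matching
}

func main() {
	rs := &apps.ReplicaSet{}
	rs.Namespace = "default"
	rs.Labels = map[string]string{"app": "nginx"}

	d := &apps.Deployment{}
	d.Name = "nginx"
	d.Namespace = "default"
	d.Spec.Selector = &metav1.LabelSelector{MatchLabels: map[string]string{"app": "nginx"}}

	for _, m := range deploymentsForReplicaSet(rs, []*apps.Deployment{d}) {
		fmt.Println("candidate owner:", m.Name) // candidate owner: nginx
	}
}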

The same idea applies to Pods: in most cases a Pod is owned by a ReplicaSet and therefore indirectly by a Deployment, so dc watches Pod delete events and checks whether the deleted Pod ultimately belongs to a Deployment. If it does, that Deployment needs to be re-synced after the Pod is deleted. The code below shows how dc walks from the Pod to its owning ReplicaSet and then on to the owning Deployment.

// getDeploymentForPod returns the deployment managing the given Pod.
func (dc *DeploymentController) getDeploymentForPod(logger klog.Logger, pod *v1.Pod) *apps.Deployment {
    // Find the owning replica set
    var rs *apps.ReplicaSet
    var err error
    controllerRef := metav1.GetControllerOf(pod)
    if controllerRef == nil {
        // No controller owns this Pod.
        return nil
    }
    if controllerRef.Kind != apps.SchemeGroupVersion.WithKind("ReplicaSet").Kind {
        // Not a pod owned by a replica set.
        return nil
    }
    rs, err = dc.rsLister.ReplicaSets(pod.Namespace).Get(controllerRef.Name)
    if err != nil || rs.UID != controllerRef.UID {
        logger.V(4).Info("Cannot get replicaset for pod", "ownerReference", controllerRef.Name, "pod", klog.KObj(pod), "err", err)
        return nil
    }

    // Now find the Deployment that owns that ReplicaSet.
    controllerRef = metav1.GetControllerOf(rs)
    if controllerRef == nil {
        return nil
    }
    return dc.resolveControllerRef(rs.Namespace, controllerRef)
}

To summarize: dc's Informers watch three resources, Deployment, ReplicaSet, and Pod. Deployment and ReplicaSet are watched for Add, Update, and Delete events; Pod only for Delete.

 

Next, let's look at the core processing logic of the DeploymentController; the design is well worth borrowing from:

dc.syncHandler = dc.syncDeployment
dc.enqueueDeployment = dc.enqueue

The core of dc is a queue of Deployment keys fed through enqueueDeployment, plus a Deployment reconciler, syncHandler.

Tracing enqueueDeployment, every registered Informer handler that can resolve its event to a Deployment calls enqueueDeployment with that Deployment object. Since enqueueDeployment is assigned dc.enqueue, the call lands in enqueue, which builds a string key from the Deployment's metadata and adds it to queue, a rate-limited work queue (workqueue.TypedRateLimitingInterface[string] on current master).

func (dc *DeploymentController) enqueue(deployment *apps.Deployment) {
    key, err := controller.KeyFunc(deployment)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", deployment, err))
        return
    }

    dc.queue.Add(key)
}
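The key built here is the standard namespace/name cache key. As a quick, hedged illustration (controller.KeyFunc is, as far as I can tell, cache.DeletionHandlingMetaNamespaceKeyFunc, which for live objects behaves like MetaNamespaceKeyFunc used below; the Deployment here is a placeholder), the same SplitMetaNamespaceKey seen later in syncDeployment reverses it:

package main

import (
	"fmt"

	apps "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/cache"
)

func main() {
	d := &apps.Deployment{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "nginx"},
	}

	// Build the work-queue key from the object's metadata.
	key, err := cache.MetaNamespaceKeyFunc(d)
	if err != nil {
		panic(err)
	}
	fmt.Println(key) // default/nginx

	// syncDeployment later splits the key back into namespace and name.
	ns, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		panic(err)
	}
	fmt.Println(ns, name) // default nginx
}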

 

Now on to syncHandler. Following the source, the controller manager calls Run when it starts the controller; Run launches the workers, and the worker loop is what eventually invokes syncHandler. For readability, code unrelated to this section is omitted.

// Run begins watching and syncing.
func (dc *DeploymentController) Run(ctx context.Context, workers int) {
    // ...... omitted
    for i := 0; i < workers; i++ {
        go wait.UntilWithContext(ctx, dc.worker, time.Second)
    }

    <-ctx.Done()
}

Run starts one goroutine per requested worker. Each goroutine uses the Kubernetes wait helper (wait.UntilWithContext) to re-run the worker method with a one-second interval whenever it returns. Inside worker, the next key (the namespace/name string built by enqueue) is pulled from the queue and handed to syncHandler.

It is worth asking why the Informer handlers do not call syncHandler directly when an event arrives, instead of pushing a key into a rate-limited queue that the workers drain. The queue decouples event delivery from reconciliation: identical keys are deduplicated while queued, bursts of events are smoothed out, the rate limiter protects the apiserver, and a failed sync can be requeued with backoff.
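To make the queue/worker split concrete, here is a minimal, self-contained sketch of the pattern. This is not the upstream code: the controller type, key format, and sync function are placeholders; only the client-go workqueue and wait helpers are real APIs.

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/workqueue"
)

type miniController struct {
	queue       workqueue.RateLimitingInterface
	syncHandler func(ctx context.Context, key string) error
}

// enqueue adds a namespace/name key; duplicate keys collapse into one item
// while the first copy is still waiting in the queue.
func (c *miniController) enqueue(key string) {
	c.queue.Add(key)
}

// worker drains the queue until it is shut down.
func (c *miniController) worker(ctx context.Context) {
	for c.processNextWorkItem(ctx) {
	}
}

func (c *miniController) processNextWorkItem(ctx context.Context) bool {
	item, quit := c.queue.Get()
	if quit {
		return false
	}
	defer c.queue.Done(item)

	key := item.(string)
	if err := c.syncHandler(ctx, key); err != nil {
		// Re-add with backoff so transient failures are retried.
		c.queue.AddRateLimited(key)
		return true
	}
	// Success: reset the rate-limiter history for this key.
	c.queue.Forget(item)
	return true
}

func main() {
	c := &miniController{
		queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "demo"),
		syncHandler: func(ctx context.Context, key string) error {
			fmt.Println("syncing", key)
			return nil
		},
	}
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	go wait.UntilWithContext(ctx, c.worker, time.Second)

	c.enqueue("default/nginx")
	c.enqueue("default/nginx") // deduplicated while still queued

	<-ctx.Done()
	c.queue.ShutDown()
}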

As the earlier code showed, the key ultimately reaches syncDeployment, which drives the whole reconciliation flow for that Deployment.

 

// syncDeployment will sync the deployment with the given key.
// This function is not meant to be invoked concurrently with the same key.
func (dc *DeploymentController) syncDeployment(ctx context.Context, key string) error {
    logger := klog.FromContext(ctx)
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        logger.Error(err, "Failed to split meta namespace cache key", "cacheKey", key)
        return err
    }

    startTime := time.Now()
    logger.V(4).Info("Started syncing deployment", "deployment", klog.KRef(namespace, name), "startTime", startTime)
    defer func() {
        logger.V(4).Info("Finished syncing deployment", "deployment", klog.KRef(namespace, name), "duration", time.Since(startTime))
    }()

    deployment, err := dc.dLister.Deployments(namespace).Get(name)
    if errors.IsNotFound(err) {
        logger.V(2).Info("Deployment has been deleted", "deployment", klog.KRef(namespace, name))
        return nil
    }
    if err != nil {
        return err
    }

    // Deep-copy otherwise we are mutating our cache.
    // TODO: Deep-copy only when needed.
    d := deployment.DeepCopy()

    everything := metav1.LabelSelector{}
    if reflect.DeepEqual(d.Spec.Selector, &everything) {
        dc.eventRecorder.Eventf(d, v1.EventTypeWarning, "SelectingAll", "This deployment is selecting all pods. A non-empty selector is required.")
        if d.Status.ObservedGeneration < d.Generation {
            d.Status.ObservedGeneration = d.Generation
            dc.client.AppsV1().Deployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
        }
        return nil
    }

    // List ReplicaSets owned by this Deployment, while reconciling ControllerRef
    // through adoption/orphaning.
    rsList, err := dc.getReplicaSetsForDeployment(ctx, d)
    if err != nil {
        return err
    }
    // List all Pods owned by this Deployment, grouped by their ReplicaSet.
    // Current uses of the podMap are:
    //
    // * check if a Pod is labeled correctly with the pod-template-hash label.
    // * check that no old Pods are running in the middle of Recreate Deployments.
    podMap, err := dc.getPodMapForDeployment(d, rsList)
    if err != nil {
        return err
    }

    if d.DeletionTimestamp != nil {
        return dc.syncStatusOnly(ctx, d, rsList)
    }

    // Update deployment conditions with an Unknown condition when pausing/resuming
    // a deployment. In this way, we can be sure that we won't timeout when a user
    // resumes a Deployment with a set progressDeadlineSeconds.
    if err = dc.checkPausedConditions(ctx, d); err != nil {
        return err
    }

    if d.Spec.Paused {
        return dc.sync(ctx, d, rsList)
    }

    // rollback is not re-entrant in case the underlying replica sets are updated with a new
    // revision so we should ensure that we won't proceed to update replica sets until we
    // make sure that the deployment has cleaned up its rollback spec in subsequent enqueues.
    if getRollbackTo(d) != nil {
        return dc.rollback(ctx, d, rsList)
    }

    scalingEvent, err := dc.isScalingEvent(ctx, d, rsList)
    if err != nil {
        return err
    }
    if scalingEvent {
        return dc.sync(ctx, d, rsList)
    }

    switch d.Spec.Strategy.Type {
    case apps.RecreateDeploymentStrategyType:
        return dc.rolloutRecreate(ctx, d, rsList, podMap)
    case apps.RollingUpdateDeploymentStrategyType:
        return dc.rolloutRolling(ctx, d, rsList)
    }
    return fmt.Errorf("unexpected deployment strategy type: %s", d.Spec.Strategy.Type)
}

syncDeployment first splits the key into namespace and Deployment name, fetches the Deployment from the lister, and returns immediately if it no longer exists.

It then lists the ReplicaSets owned by the Deployment and the Pods grouped by ReplicaSet, and checks whether the Deployment is being deleted, is paused, has a pending rollback, or is experiencing a pure scaling event.

When the Deployment is paused or only its replica count changed, dc's sync method is called:

// sync is responsible for reconciling deployments on scaling events or when they
// are paused.
func (dc *DeploymentController) sync(ctx context.Context, d *apps.Deployment, rsList []*apps.ReplicaSet) error {
    newRS, oldRSs, err := dc.getAllReplicaSetsAndSyncRevision(ctx, d, rsList, false)
    if err != nil {
        return err
    }
    if err := dc.scale(ctx, d, newRS, oldRSs); err != nil {
        // If we get an error while trying to scale, the deployment will be requeued
        // so we can abort this resync
        return err
    }

    // Clean up the deployment when it's paused and no rollback is in flight.
    if d.Spec.Paused && getRollbackTo(d) == nil {
        if err := dc.cleanupDeployment(ctx, oldRSs, d); err != nil {
            return err
        }
    }

    allRSs := append(oldRSs, newRS)
    return dc.syncDeploymentStatus(ctx, allRSs, newRS, d)
}

sync first computes newRS and oldRSs: newRS is the ReplicaSet that manages Pods matching the Deployment's current pod template, i.e. the latest desired state, while oldRSs are the historical ReplicaSets. It then calls scale to move the replica counts toward that desired state.

getAllReplicaSetsAndSyncRevision has a fair amount of logic of its own; let's take a quick look.

func (dc *DeploymentController) getAllReplicaSetsAndSyncRevision(ctx context.Context, d *apps.Deployment, rsList []*apps.ReplicaSet, createIfNotExisted bool) (*apps.ReplicaSet, []*apps.ReplicaSet, error) {
    _, allOldRSs := deploymentutil.FindOldReplicaSets(d, rsList)

    // Get new replica set with the updated revision number
    newRS, err := dc.getNewReplicaSet(ctx, d, rsList, allOldRSs, createIfNotExisted)
    if err != nil {
        return nil, nil, err
    }

    return newRS, allOldRSs, nil
}

allOldRSs comes from FindOldReplicaSets, which simply walks all ReplicaSets owned by the Deployment, filters out the newest one, and returns the rest: the second return value contains all old ReplicaSets, the first only those that still have a non-zero replica count.

// FindOldReplicaSets returns the old replica sets targeted by the given Deployment, with the given slice of RSes.
// Note that the first set of old replica sets doesn't include the ones with no pods, and the second set of old replica sets include all old replica sets.
func FindOldReplicaSets(deployment *apps.Deployment, rsList []*apps.ReplicaSet) ([]*apps.ReplicaSet, []*apps.ReplicaSet) {
    var requiredRSs []*apps.ReplicaSet
    var allRSs []*apps.ReplicaSet
    newRS := FindNewReplicaSet(deployment, rsList)
    for _, rs := range rsList {
        // Filter out new replica set
        if newRS != nil && rs.UID == newRS.UID {
            continue
        }
        allRSs = append(allRSs, rs)
        if *(rs.Spec.Replicas) != 0 {
            requiredRSs = append(requiredRSs, rs)
        }
    }
    return requiredRSs, allRSs
}

 

Next, getNewReplicaSet obtains newRS. The method is long but not complicated, because roughly half of it serves the createIfNotExisted parameter, which controls whether a missing new ReplicaSet should be created. During sync this is never needed; as the first line of sync shows, it passes false:

newRS, oldRSs, err := dc.getAllReplicaSetsAndSyncRevision(ctx, d, rsList, false)

So, to keep the listing readable, the createIfNotExisted == true branch is omitted below.

// Returns a replica set that matches the intent of the given deployment. Returns nil if the new replica set doesn't exist yet.
// 1. Get existing new RS (the RS that the given deployment targets, whose pod template is the same as deployment's).
// 2. If there's existing new RS, update its revision number if it's smaller than (maxOldRevision + 1), where maxOldRevision is the max revision number among all old RSes.
// 3. If there's no existing new RS and createIfNotExisted is true, create one with appropriate revision number (maxOldRevision + 1) and replicas.
// Note that the pod-template-hash will be added to adopted RSes and pods.
func (dc *DeploymentController) getNewReplicaSet(ctx context.Context, d *apps.Deployment, rsList, oldRSs []*apps.ReplicaSet, createIfNotExisted bool) (*apps.ReplicaSet, error) {
    logger := klog.FromContext(ctx)
    existingNewRS := deploymentutil.FindNewReplicaSet(d, rsList)

    // Calculate the max revision number among all old RSes
    maxOldRevision := deploymentutil.MaxRevision(logger, oldRSs)
    // Calculate revision number for this new replica set
    newRevision := strconv.FormatInt(maxOldRevision+1, 10)

    // Latest replica set exists. We need to sync its annotations (includes copying all but
    // annotationsToSkip from the parent deployment, and update revision, desiredReplicas,
    // and maxReplicas) and also update the revision annotation in the deployment with the
    // latest revision.
    if existingNewRS != nil {
        rsCopy := existingNewRS.DeepCopy()

        // Set existing new replica set's annotation
        annotationsUpdated := deploymentutil.SetNewReplicaSetAnnotations(ctx, d, rsCopy, newRevision, true, maxRevHistoryLengthInChars)
        minReadySecondsNeedsUpdate := rsCopy.Spec.MinReadySeconds != d.Spec.MinReadySeconds
        if annotationsUpdated || minReadySecondsNeedsUpdate {
            rsCopy.Spec.MinReadySeconds = d.Spec.MinReadySeconds
            return dc.client.AppsV1().ReplicaSets(rsCopy.ObjectMeta.Namespace).Update(ctx, rsCopy, metav1.UpdateOptions{})
        }

        // Should use the revision in existingNewRS's annotation, since it set by before
        needsUpdate := deploymentutil.SetDeploymentRevision(d, rsCopy.Annotations[deploymentutil.RevisionAnnotation])
        // If no other Progressing condition has been recorded and we need to estimate the progress
        // of this deployment then it is likely that old users started caring about progress. In that
        // case we need to take into account the first time we noticed their new replica set.
        cond := deploymentutil.GetDeploymentCondition(d.Status, apps.DeploymentProgressing)
        if deploymentutil.HasProgressDeadline(d) && cond == nil {
            msg := fmt.Sprintf("Found new replica set %q", rsCopy.Name)
            condition := deploymentutil.NewDeploymentCondition(apps.DeploymentProgressing, v1.ConditionTrue, deploymentutil.FoundNewRSReason, msg)
            deploymentutil.SetDeploymentCondition(&d.Status, *condition)
            needsUpdate = true
        }

        if needsUpdate {
            var err error
            if _, err = dc.client.AppsV1().Deployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{}); err != nil {
                return nil, err
            }
        }
        return rsCopy, nil
    }

    if !createIfNotExisted {
        return nil, nil
    }
    // ...... (createIfNotExisted == true branch omitted)
}

 

As before, the newest ReplicaSet (the one whose pod template matches deployment.Spec.Template) is looked up in the ReplicaSet list, and the new revision is computed as the maximum old revision plus one. If no such ReplicaSet exists, then with createIfNotExisted == false the method simply returns nil and sync carries on without it.

It then checks whether the revision stored in the ReplicaSet's annotations (key deployment.kubernetes.io/revision) needs updating, and whether its MinReadySeconds still matches the Deployment's configuration; if either differs, the ReplicaSet is updated through the client.

Next it checks whether the Deployment's own revision annotation and Progressing condition need updating; if so, an UpdateStatus request is sent to the apiserver through the client, just as for the ReplicaSet above. Finally the up-to-date ReplicaSet is returned.
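As a hedged illustration of this revision bookkeeping: the annotation key comes from the source, but maxRevision below is a simplified stand-in for deploymentutil.MaxRevision, which reads the same annotation from the old ReplicaSets.

package main

import (
	"fmt"
	"strconv"
)

const revisionAnnotation = "deployment.kubernetes.io/revision"

// maxRevision returns the largest revision recorded on the old ReplicaSets'
// annotations; entries that are missing or unparsable are ignored.
func maxRevision(oldRSAnnotations []map[string]string) int64 {
	var max int64
	for _, ann := range oldRSAnnotations {
		if v, err := strconv.ParseInt(ann[revisionAnnotation], 10, 64); err == nil && v > max {
			max = v
		}
	}
	return max
}

func main() {
	oldRSes := []map[string]string{
		{revisionAnnotation: "1"},
		{revisionAnnotation: "3"},
		{revisionAnnotation: "2"},
	}
	newRevision := strconv.FormatInt(maxRevision(oldRSes)+1, 10)
	fmt.Println("new revision:", newRevision) // new revision: 4
}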

Back in sync, with newRS and oldRSs in hand, the scale method is called.

 

func (dc *DeploymentController) scale(ctx context.Context, deployment *apps.Deployment, newRS *apps.ReplicaSet, oldRSs []*apps.ReplicaSet) error {
    // If there is only one active replica set then we should scale that up to the full count of the
    // deployment. If there is no active replica set, then we should scale up the newest replica set.
    if activeOrLatest := deploymentutil.FindActiveOrLatest(newRS, oldRSs); activeOrLatest != nil {
        if *(activeOrLatest.Spec.Replicas) == *(deployment.Spec.Replicas) {
            return nil
        }
        _, _, err := dc.scaleReplicaSetAndRecordEvent(ctx, activeOrLatest, *(deployment.Spec.Replicas), deployment)
        return err
    }

    // If the new replica set is saturated, old replica sets should be fully scaled down.
    // This case handles replica set adoption during a saturated new replica set.
    if deploymentutil.IsSaturated(deployment, newRS) {
        for _, old := range controller.FilterActiveReplicaSets(oldRSs) {
            if _, _, err := dc.scaleReplicaSetAndRecordEvent(ctx, old, 0, deployment); err != nil {
                return err
            }
        }
        return nil
    }

    // There are old replica sets with pods and the new replica set is not saturated.
    // We need to proportionally scale all replica sets (new and old) in case of a
    // rolling deployment.
    if deploymentutil.IsRollingUpdate(deployment) {
        allRSs := controller.FilterActiveReplicaSets(append(oldRSs, newRS))
        allRSsReplicas := deploymentutil.GetReplicaCountForReplicaSets(allRSs)

        allowedSize := int32(0)
        if *(deployment.Spec.Replicas) > 0 {
            allowedSize = *(deployment.Spec.Replicas) + deploymentutil.MaxSurge(*deployment)
        }

        // Number of additional replicas that can be either added or removed from the total
        // replicas count. These replicas should be distributed proportionally to the active
        // replica sets.
        deploymentReplicasToAdd := allowedSize - allRSsReplicas

        // The additional replicas should be distributed proportionally amongst the active
        // replica sets from the larger to the smaller in size replica set. Scaling direction
        // drives what happens in case we are trying to scale replica sets of the same size.
        // In such a case when scaling up, we should scale up newer replica sets first, and
        // when scaling down, we should scale down older replica sets first.
        var scalingOperation string
        switch {
        case deploymentReplicasToAdd > 0:
            sort.Sort(controller.ReplicaSetsBySizeNewer(allRSs))
            scalingOperation = "up"

        case deploymentReplicasToAdd < 0:
            sort.Sort(controller.ReplicaSetsBySizeOlder(allRSs))
            scalingOperation = "down"
        }

        // Iterate over all active replica sets and estimate proportions for each of them.
        // The absolute value of deploymentReplicasAdded should never exceed the absolute
        // value of deploymentReplicasToAdd.
        deploymentReplicasAdded := int32(0)
        nameToSize := make(map[string]int32)
        logger := klog.FromContext(ctx)
        for i := range allRSs {
            rs := allRSs[i]

            // Estimate proportions if we have replicas to add, otherwise simply populate
            // nameToSize with the current sizes for each replica set.
            if deploymentReplicasToAdd != 0 {
                proportion := deploymentutil.GetProportion(logger, rs, *deployment, deploymentReplicasToAdd, deploymentReplicasAdded)

                nameToSize[rs.Name] = *(rs.Spec.Replicas) + proportion
                deploymentReplicasAdded += proportion
            } else {
                nameToSize[rs.Name] = *(rs.Spec.Replicas)
            }
        }

        // Update all replica sets
        for i := range allRSs {
            rs := allRSs[i]

            // Add/remove any leftovers to the largest replica set.
            if i == 0 && deploymentReplicasToAdd != 0 {
                leftover := deploymentReplicasToAdd - deploymentReplicasAdded
                nameToSize[rs.Name] = nameToSize[rs.Name] + leftover
                if nameToSize[rs.Name] < 0 {
                    nameToSize[rs.Name] = 0
                }
            }

            // TODO: Use transactions when we have them.
            if _, _, err := dc.scaleReplicaSet(ctx, rs, nameToSize[rs.Name], deployment, scalingOperation); err != nil {
                // Return as soon as we fail, the deployment is requeued
                return err
            }
        }
    }
    return nil
}

In short, scale proportionally grows or shrinks the replica counts of the new and old ReplicaSets within the allowed size (spec.replicas plus maxSurge for a rolling update), increasing the new ReplicaSet while gradually draining the old ones, until all of the Deployment's available replicas run the new template.
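A worked example of the proportional arithmetic may help. The sketch below is a simplification (the real deploymentutil.GetProportion also consults annotations recording each ReplicaSet's desired/max replicas and uses different rounding rules); it assumes spec.replicas=10, maxSurge=3, and two active ReplicaSets currently at 5 and 2 replicas.

package main

import "fmt"

func main() {
	specReplicas := int32(10)
	maxSurge := int32(3)
	rsSizes := map[string]int32{"rs-new": 5, "rs-old": 2}

	allowedSize := specReplicas + maxSurge // 13
	var current int32
	for _, n := range rsSizes {
		current += n // 7
	}
	toAdd := allowedSize - current // 13 - 7 = 6

	// Distribute toAdd proportionally to each RS's share of the current total.
	added := int32(0)
	newSizes := map[string]int32{}
	for name, n := range rsSizes {
		p := toAdd * n / current // naive proportion, no rounding correction
		newSizes[name] = n + p
		added += p
	}
	// Any leftover from integer division goes to the largest ReplicaSet (here rs-new),
	// mirroring how the real code hands the remainder to allRSs[0] after sorting.
	newSizes["rs-new"] += toAdd - added

	fmt.Println(newSizes) // map[rs-new:10 rs-old:3]
}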
