kube-scheduler 调度源码分析
调度器核心的数据结构是 Scheduler,Scheduler 对象初始化完成后就开始执行调度。Scheduler 对象的大概结构如下:
+------------------+ +------------------+ implement +------------------+
| scheduler | +------->| schedulerCache |----------------------------interface------->| Cache |
+------------------+-----------------------+ | +------------------+-----------------------+ +------------------+-----------------------+
| SchedulerCache internalcache.Cache |--------+ | assumedPods map[string]bool | | AssumePod(...) |
+------------------------------------------+ +------------------------------------------+ +------------------------------------------+
| Algorithm core.ScheduleAlgorithm |--------+ | podStates map[string]*podState | | FinishBinding(...) |
+------------------------------------------+ | +------------------------------------------+ | |
| SchedulingQueue | | | nodes map[string]*nodeInfoListItem | +------------------------------------------+
| internalqueue.SchedulingQueue |-----+ | | | | ForgetPod(...) |
+------------------------------------------+ | | +------------------------------------------+ +------------------------------------------+
| Profiles profile.Map |--+ | | | headNode *nodeInfoListItem | | AddPod(...) |
+------------------------------------------+ | | | +------------------------------------------+ +------------------------------------------+
| NextPod func() *framework.PodInfo | | | | | nodeTree *nodeTree | | AddNode(...) |
+------------------------------------------+ | | | +------------------------------------------+ +------------------------------------------+
| ...... | | | | | ...... | | ...... |
+------------------------------------------+ | | | +------------------------------------------+ +------------------------------------------+
+----------------------------------+ | |
| | |
| | |
v | | +------------------+ implement +------------------+
+------------------+ | +-------->| genericScheduler |--------------------------interface-------->|ScheduleAlgorithm |
| Profile | | +------------------+-----------------------+ +------------------+-----------------------+
+------------------+-----------+ | | cache internalcache.Cache | | Schedule(...) |
| framework.Framework |--+ | +------------------------------------------+ +------------------------------------------+
+------------------------------+ | | | schedulingQueue | | Preempt(...) |
| ...... | | | | internalqueue.SchedulingQueue | +------------------------------------------+
+------------------------------+ | | +------------------------------------------+ | Extenders(...) |
| | | extenders []SchedulerExtender | +------------------------------------------+
+------------------------+ | +------------------------------------------+
| | | nodeInfoSnapshot *internalcache.Snapshot |
v | +------------------------------------------+
+----------------------------+ | | percentageOfNodesToScore int32 |
| framework |---------------+ | +------------------------------------------+
+----------------------------+------+ | |
| registry Registry | | |
+-----------------------------------+ | | +------------------+ implement +------------------+
|queueSortPlugins []QueueSortPlugin |----+ | +----------->| PriorityQueue |--------------------------interface-------->| SchedulingQueue |
+-----------------------------------+ | | +------------------+-----------------------+ +------------------+-----------------------+
| preFilterPlugin []PreFilterPlugin | | | | activeQ *heap.Heap | | Add(pod *v1.Pod) error |
+-----------------------------------+ | | +------------------------------------------+ +------------------------------------------+
| filterPlugins []FilterPlugin | | | | podBackoffQ *heap.Heap | | Pop() (*framework.PodInfo, error) |
+-----------------------------------+ | | | | +------------------------------------------+
| preScorePlugins []PreScorePlugin | | | +------------------------------------------+ | MoveAllToActiveOrBackoffQueue(event |
+-----------------------------------+ | | | unschedulableQ *UnschedulablePodsMap | +------------------------------------------+
| scorePlugins []ScorePlugin | | | +------------------------------------------+ | Run() |
+-----------------------------------+ | | | nominatedPods *nominatedPodMap | +------------------------------------------+
| reservePlugins []ReservePlugin | | | +------------------------------------------+ | ...... |
+-----------------------------------+ | | | schedulingCycle int64 | +------------------------------------------+
| preBindPlugins []PreBindPlugin | | | +------------------------------------------+
+-----------------------------------+ | | | moveRequestCycle int64 |
| bindPlugins []BindPlugin | | | +------------------------------------------+
+-----------------------------------+ | |
| postBindPlugins []PostBindPlugin | | |
+-----------------------------------+ | | implement
|unreservePlugins []UnreservePlugin | | | interface +------------------+
+-----------------------------------+ | +------------------------------------------------------------------------------->| Framework |
| permitPlugins []PermitPlugin |--->| +------------------+----------------+
+-----------------------------------+ | | QueueSortFunc() LessFunc |
| ...... | | +-----------------------------------+
+-----------------------------------+ | | RunPreFilterPlugins(...) |
+-------------+ +-----------------------------------+
| | RunFilterPlugins(...) |
| +-----------------------------------+
v | RunPreFilterExtensionAddPod(...) |
+-----------interface----------------------------------------------------------------------------------------+ +-----------------------------------+
| +------------------+ | |RunPreFilterExtensionRemovePod(...)|
| +------------------+ | PreBindPlugin | | +-----------------------------------+
| | QueueSortPlugin | +------------------+----------------+ | | RunPreScorePlugins(...) |
| +------------------+----------------+ | PreBind(...) | | +-----------------------------------+
| | Less(*PodInfo, *PodInfo) bool | +-----------------------------------+ | | RunScorePlugins(...) |
| +-----------------------------------+ | +-----------------------------------+
| +------------------+ | | RunPreBindPlugins(...) |
| | PreFilterPlugin | +------------------+ | +-----------------------------------+
| +------------------+----------------+ | BindPlugin | | | RunPostBindPlugins(...) |
| | PreFilter(...) | +------------------+----------------+ | +-----------------------------------+
| +-----------------------------------+ | Bind(...) | | | RunReservePlugins(...) |
| | PreFilterExtensions() | +-----------------------------------+ | +-----------------------------------+
| +-----------------------------------+ | | RunUnreservePlugins(...) |
| | +-----------------------------------+
| +------------------+ +------------------+ | | RunPermitPlugins(...) |
| | PreScorePlugin | | PostBindPlugin | | +-----------------------------------+
| +------------------+----------------+ +------------------+----------------+ | | WaitOnPermit(...) |
| | PreScore(...) | | PostBind(...) | | +-----------------------------------+
| +-----------------------------------+ +-----------------------------------+ | | RunBindPlugins(...) |
| | +-----------------------------------+
| +------------------+ +------------------+ | | ...... |
| | ScorePlugin | | UnreservePlugin | | +-----------------------------------+
| +------------------+----------------+ +------------------+----------------+ |
| | Score(...) | | Unreserve(...) | |
| +-----------------------------------+ +-----------------------------------+ |
| | ScoreExtensions() | |
| +-----------------------------------+ +------------------+ |
| +------------------+ | PermitPlugin | |
| | ReservePlugin | +------------------+----------------+ |
| +------------------+----------------+ | Permit(...) | |
| | Reserve(...) | +-----------------------------------+ |
| +-----------------------------------+ |
| |
+------------------------------------------------------------------------------------------------------------+
scheduler 对象中,主要关注 4 个核心的数据成员,分别是:
- SchedulerCache:调度缓存
- Algorithm:调度算法
- SchedulingQueue:优先级队列
- Profiles:插件集
scheduler 创建完成后,开始调度,代码如下
// Run executes the scheduler based on the given configuration. It only returns on error or when context is done.
//
// This excerpt is abridged: each line consisting of "......" marks code that
// the article omits, so only the scheduler construction and start-up are shown.
func Run(ctx context.Context, cc schedulerserverconfig.CompletedConfig, outOfTreeRegistryOptions ...Option) error {
......
// Create the scheduler.
// The leading arguments are the client, informer factory and pod informer
// taken from cc, a recorder factory, and the context's done channel. The
// remaining arguments are With* options, all but the out-of-tree registry
// one built from a field of cc.ComponentConfig.
// NOTE(review): recorderFactory and outOfTreeRegistry are not defined in the
// visible excerpt; presumably they are set up in the elided code above
// (outOfTreeRegistry likely from outOfTreeRegistryOptions) — confirm.
sched, err := scheduler.New(cc.Client,
cc.InformerFactory,
cc.PodInformer,
recorderFactory,
ctx.Done(),
scheduler.WithProfiles(cc.ComponentConfig.Profiles...),
scheduler.WithAlgorithmSource(cc.ComponentConfig.AlgorithmSource),
scheduler.WithPreemptionDisabled(cc.ComponentConfig.DisablePreemption),
scheduler.WithPercentageOfNodesToScore(cc.ComponentConfig.PercentageOfNodesToScore),
scheduler.WithBindTimeoutSeconds(cc.ComponentConfig.BindTimeoutSeconds),
scheduler.WithFrameworkOutOfTreeRegistry(outOfTreeRegistry),
scheduler.WithPodMaxBackoffSeconds(cc.ComponentConfig.PodMaxBackoffSeconds),
scheduler.WithPodInitialBackoffSeconds(cc.ComponentConfig.PodInitialBackoffSeconds),
scheduler.WithExtenders(cc.ComponentConfig.Extenders...),
)
......
// Hand control to the scheduler's own Run method, passing the same context.
// NOTE(review): the handling of err and this function's return statement are
// not visible in the excerpt; presumably they are in the elided code.
sched.Run(ctx)
}
继续进入 sched.Run(ctx) 方法
// Run begins watching and scheduling. It waits for the cache to be synced, then starts scheduling and blocks until the context is done.
func (sched *Scheduler) Run(ctx context.Context) {
// Return without scheduling anything if the cache sync does not succeed.
// ctx.Done() is passed as the stop channel for the wait.
if !cache.WaitForCacheSync(ctx.Done(), sched.scheduledPodsHasSynced) {
return
}
// Start the scheduling queue before entering the scheduling loop.
sched.SchedulingQueue.Run()
// Call scheduleOne repeatedly with a zero period between calls; per the doc
// comment above, this blocks until ctx is done.
wait.UntilWithContext(ctx, sched.scheduleOne, 0)
// Reached only after the scheduling loop has returned: close the queue.
sched.SchedulingQueue.Close()
}
在 Run 方法内,先等待 watch 缓存同步完成,接着启动调度队列,待调度的 pod 都是通过调度队列获取。最后通过 wait.UntilWithContext 反复调用 sched.scheduleOne,开始调度流程。
// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
//
// The excerpt is abridged: each "......" line stands for code omitted by the
// article. Error handling and the definitions of pod, state, assumedPod,
// schedulingCycleCtx and bindingCycleCtx are not shown here.
func (sched *Scheduler) scheduleOne(ctx context.Context) {
	// Fetch the next pod awaiting scheduling from the scheduling queue via NextPod.
	podInfo := sched.NextPod()
	......
	// Use pod.Spec.SchedulerName to look up the registered plugin set in the Profiles map.
	// NOTE(review): pod is presumably taken from podInfo in the elided code above — confirm.
	prof, err := sched.profileForPod(pod)
	......
	// The core step: call Schedule on the Algorithm member, passing prof and pod.
	// When it completes it returns the scheduling result.
	scheduleResult, err := sched.Algorithm.Schedule(schedulingCycleCtx, prof, state, pod)
	......
	// Run the Reserve plugins.
	prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
	......
	// Set pod.NodeName to the best node chosen by the scheduling algorithm, and
	// record the pod and node information in the SchedulerCache.
	sched.assume(assumedPod, scheduleResult.SuggestedHost)
	......
	// Run the Permit plugins, which perform a check before the actual binding.
	// If they report failure, scheduling fails and the pod is removed from the
	// SchedulerCache.
	prof.RunPermitPlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
	......
	// Start a goroutine so that the pod binding runs concurrently with the next
	// scheduling cycle.
	go func() {
		......
		// Run the PreBind plugins; continue only if they succeed.
		preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
		......
		// Call bind to bind the pod to the node; once binding succeeds the pod
		// has been scheduled successfully.
		err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
		......
		// After binding completes, run the PostBind plugins.
		prof.RunPostBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
	}() // the function literal must be invoked: a go statement requires a call expression
}