k8s随笔--descheduler源码简读
启动:
折叠源码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
// 通过cobra生成启动命令 cmd := &cobra.Command{ Use: "descheduler" , Short: "descheduler" , Long: `The descheduler evicts pods which may be bound to less desired nodes`, Run: func(cmd *cobra.Command, args []string) { // s.Logs.Config.Format = s.Logging.Format // LoopbackClientConfig is a config for a privileged loopback connection var LoopbackClientConfig *restclient.Config var SecureServing *apiserver.SecureServingInfo if err := s.SecureServing.ApplyTo(&SecureServing, &LoopbackClientConfig); err != nil { klog.ErrorS(err, "failed to apply secure server configuration" ) return } factory, _ := registry.LogRegistry.Get(s.Logging.Format) if factory == nil { klog.ClearLogger() } else { log, logrFlush := factory.Create(config.FormatOptions{}) defer logrFlush() klog.SetLogger(log) } ctx, done := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer done() pathRecorderMux := mux.NewPathRecorderMux( "descheduler" ) // 可配置是否开启监控 if !s.DisableMetrics { pathRecorderMux.Handle( "/metrics" , legacyregistry.HandlerWithReset()) } healthz.InstallHandler(pathRecorderMux, healthz.NamedCheck( "Descheduler" , healthz.PingHealthz.Check)) if _, err := SecureServing.Serve(pathRecorderMux, 0 , ctx.Done()); err != nil { klog.Fatalf( "failed to start secure server: %v" , err) return } err := Run(ctx, s) if err != nil { klog.ErrorS(err, "descheduler server" ) } }, } |
初始化及声明运行周期等部分:
折叠源码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer, deschedulerPolicy *api.DeschedulerPolicy, evictionPolicyGroupVersion string, stopChannel chan struct{}) error { // 获取进行二次调度需要使用的一些相关信息 sharedInformerFactory := informers.NewSharedInformerFactory(rs.Client, 0 ) nodeInformer := sharedInformerFactory.Core().V1().Nodes() podInformer := sharedInformerFactory.Core().V1().Pods() namespaceInformer := sharedInformerFactory.Core().V1().Namespaces() priorityClassInformer := sharedInformerFactory.Scheduling().V1().PriorityClasses() // create the informers namespaceInformer.Informer() priorityClassInformer.Informer() // 注意这里,这里其实build了一个func,这个func接收nodeName和podFilter方法,返回该node下通过filter的所有pod getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer) if err != nil { return fmt.Errorf( "build get pods assigned to node function error: %v" , err) } sharedInformerFactory.Start(stopChannel) sharedInformerFactory.WaitForCacheSync(stopChannel) // 可能使用到的(descheduler预定义的)所有重调度策略,value是实现各自策略的方法 // 每个策略具体执行的实现都是这个类型的 // type strategyFunction func(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) strategyFuncs := map[api.StrategyName]strategyFunction{ // 移除同一node下的重复pod "RemoveDuplicates" : strategies.RemoveDuplicatePods, // node低资源使用率 "LowNodeUtilization" : nodeutilization.LowNodeUtilization, // node高资源使用率 "HighNodeUtilization" : nodeutilization.HighNodeUtilization, "RemovePodsViolatingInterPodAntiAffinity" : strategies.RemovePodsViolatingInterPodAntiAffinity, "RemovePodsViolatingNodeAffinity" : strategies.RemovePodsViolatingNodeAffinity, "RemovePodsViolatingNodeTaints" : strategies.RemovePodsViolatingNodeTaints, "RemovePodsHavingTooManyRestarts" : strategies.RemovePodsHavingTooManyRestarts, "PodLifeTime" : strategies.PodLifeTime, "RemovePodsViolatingTopologySpreadConstraint" : strategies.RemovePodsViolatingTopologySpreadConstraint, "RemoveFailedPods" : strategies.RemoveFailedPods, } // 下面准备一些相关的配置 var nodeSelector string if deschedulerPolicy.NodeSelector != nil { nodeSelector = *deschedulerPolicy.NodeSelector } var evictLocalStoragePods bool if deschedulerPolicy.EvictLocalStoragePods != nil { evictLocalStoragePods = *deschedulerPolicy.EvictLocalStoragePods } evictBarePods := false if deschedulerPolicy.EvictFailedBarePods != nil { evictBarePods = *deschedulerPolicy.EvictFailedBarePods if evictBarePods { klog.V( 1 ).InfoS( "Warning: EvictFailedBarePods is set to True. This could cause eviction of pods without ownerReferences." ) } } evictSystemCriticalPods := false if deschedulerPolicy.EvictSystemCriticalPods != nil { evictSystemCriticalPods = *deschedulerPolicy.EvictSystemCriticalPods if evictSystemCriticalPods { klog.V( 1 ).InfoS( "Warning: EvictSystemCriticalPods is set to True. This could cause eviction of Kubernetes system pods." ) } } ignorePvcPods := false if deschedulerPolicy.IgnorePVCPods != nil { ignorePvcPods = *deschedulerPolicy.IgnorePVCPods } // 通过NonSlidingUntil来周期性的执行descheduler的监控及驱逐策略,执行周期为 rs.DeschedulingInterval wait.NonSlidingUntil(func() { // 查找node集合 nodes, err := nodeutil.ReadyNodes(ctx, rs.Client, nodeInformer, nodeSelector) if err != nil { klog.V( 1 ).InfoS( "Unable to get ready nodes" , "err" , err) close(stopChannel) return } //node数量<=1,应当终止 if len(nodes) <= 1 { klog.V( 1 ).InfoS( "The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting.." ) close(stopChannel) return } var podEvictorClient clientset.Interface // When the dry mode is enable, collect all the relevant objects (mostly pods) under a fake client. // So when evicting pods while running multiple strategies in a row have the cummulative effect // as is when evicting pods for real. // DryRun模式下,会构造虚拟的podEvictorClient,不会真正的进行驱逐 if rs.DryRun { klog.V( 3 ).Infof( "Building a cached client from the cluster for the dry run" ) // Create a new cache so we start from scratch without any leftovers fakeClient, err := cachedClient(rs.Client, podInformer, nodeInformer, namespaceInformer, priorityClassInformer) if err != nil { klog.Error(err) return } fakeSharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0 ) getPodsAssignedToNode, err = podutil.BuildGetPodsAssignedToNodeFunc(fakeSharedInformerFactory.Core().V1().Pods()) if err != nil { klog.Errorf( "build get pods assigned to node function error: %v" , err) return } fakeCtx, cncl := context.WithCancel(context.TODO()) defer cncl() fakeSharedInformerFactory.Start(fakeCtx.Done()) fakeSharedInformerFactory.WaitForCacheSync(fakeCtx.Done()) podEvictorClient = fakeClient } else { podEvictorClient = rs.Client } klog.V( 3 ).Infof( "Building a pod evictor" ) // 构造驱逐器 podEvictor := evictions.NewPodEvictor( // client, dryRun模式下是fakeClient podEvictorClient, // 集群支持的驱逐器所需要使用的子资源的版本 evictionPolicyGroupVersion, // 是否为dryRun rs.DryRun, // 每个node最多驱逐的pod限制,可配置 deschedulerPolicy.MaxNoOfPodsToEvictPerNode, // 每个node最多驱逐的pod限制,可配置 deschedulerPolicy.MaxNoOfPodsToEvictPerNamespace, // 所有node的集合 nodes, // 是否驱逐LocalStoragePod,可配置 evictLocalStoragePods, // 是否驱逐SystemCritical,可配置 evictSystemCriticalPods, // 是否忽略PvcPods,可配置 ignorePvcPods, evictBarePods, !rs.DisableMetrics, ) // 遍历用户配置中指定的驱逐策略,如果descheduler支持当前策略,就通过 f(ctx, rs.Client, strategy, nodes, podEvictor, getPodsAssignedToNode) 执行. for name, strategy := range deschedulerPolicy.Strategies { if f, ok := strategyFuncs[name]; ok { if strategy.Enabled { f(ctx, rs.Client, strategy, nodes, podEvictor, getPodsAssignedToNode) } } else { klog.ErrorS(fmt.Errorf( "unknown strategy name" ), "skipping strategy" , "strategy" , name) } } klog.V( 1 ).InfoS( "Number of evicted pods" , "totalEvicted" , podEvictor.TotalEvicted()) // If there was no interval specified, send a signal to the stopChannel to end the wait.Until loop after 1 iteration if rs.DeschedulingInterval.Seconds() == 0 { close(stopChannel) } }, rs.DeschedulingInterval, stopChannel) return nil } |
移除同一个Node下重复Pod策略实现:
折叠源码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
// RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same // namespace, and have at least one container with the same image. // As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. func RemoveDuplicatePods( ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, ) { // 一些关于策略配置方面的基础校验 if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil { klog.ErrorS(err, "Invalid RemoveDuplicatePods parameters" ) return } thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) if err != nil { klog.ErrorS(err, "Failed to get threshold priority from strategy's params" ) return } var includedNamespaces, excludedNamespaces sets.String if strategy.Params != nil && strategy.Params.Namespaces != nil { includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...) excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...) } nodeFit := false if strategy.Params != nil { nodeFit = strategy.Params.NodeFit } evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) duplicatePods := make(map[podOwner]map[string][]*v1.Pod) ownerKeyOccurence := make(map[podOwner]int32) nodeCount := 0 nodeMap := make(map[string]*v1.Node) // 构造podFilter podFilter, err := podutil.NewOptions(). WithFilter(evictable.IsEvictable). WithNamespaces(includedNamespaces). WithoutNamespaces(excludedNamespaces). BuildFilterFunc() if err != nil { klog.ErrorS(err, "Error initializing pod filter function" ) return } // 遍历所有node for _, node := range nodes { klog.V( 1 ).InfoS( "Processing node" , "node" , klog.KObj(node)) // 注意这里的 getPodsAssignedToNode,就是初始化部分提到的 getPodsAssignedToNode, 接收节点名与podFitler方法,返回这个node下所有通过filter的pod pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter) if err != nil { klog.ErrorS(err, "Error listing evictable pods on node" , "node" , klog.KObj(node)) continue } nodeMap[node.Name] = node nodeCount++ // Each pod has a list of owners and a list of containers, and each container has 1 image spec. // For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string. // All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness. // This list of keys representing a single pod is then sorted alphabetically. // If any other pod has a list that matches that pod's list, those pods are undeniably duplicates for the following reasons: // - The 2 pods have the exact same ownerrefs // - The 2 pods have the exact same container images // // duplicateKeysMap maps the first Namespace/Kind/Name/Image in a pod's list to a 2D-slice of all the other lists where that is the first key // (Since we sort each pod's list, we only need to key the map on the first entry in each list. Any pod that doesn't have // the same first entry is clearly not a duplicate. This makes lookup quick and minimizes storage needed). // If any of the existing lists for that first key matches the current pod's list, the current pod is a duplicate. // If not, then we add this pod's list to the list of lists for that key. // 以下是筛选重复pod的关键逻辑,descheduler对于这部分做了特别的设计,尝试解释一下,也可以看看上面的英文 // 首先是duplicateKeysMap,key是由当前pod的 Namespace/Kind/Name/Image 拼接而成 value则是一个二维string切片 [][]string duplicateKeysMap := map[string][][]string{} for _, pod := range pods { ownerRefList := podutil.OwnerRef(pod) if hasExcludedOwnerRefKind(ownerRefList, strategy) || len(ownerRefList) == 0 { continue } // 这里定义的podContainerKeys预先指定了容量,可以避免slice的扩容. M个ownerRet与N个containers M * N podContainerKeys := make([]string, 0 , len(ownerRefList)*len(pod.Spec.Containers)) imageList := []string{} for _, container := range pod.Spec.Containers { imageList = append(imageList, container.Image) } // 对pod下的image排序,保证image出现的次序 sort.Strings(imageList) imagesHash := strings.Join(imageList, "#" ) for _, ownerRef := range ownerRefList { ownerKey := podOwner{ namespace: pod.ObjectMeta.Namespace, kind: ownerRef.Kind, name: ownerRef.Name, imagesHash: imagesHash, } ownerKeyOccurence[ownerKey] = ownerKeyOccurence[ownerKey] + 1 for _, image := range imageList { // Namespace/Kind/Name should be unique for the cluster. // We also consider the image, as 2 pods could have the same owner but serve different purposes // So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod. // 这里使用了更严格的策略, 增加了image s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, image}, "/" ) podContainerKeys = append(podContainerKeys, s) } } // 对podContainerKey再次进行排序 sort.Strings(podContainerKeys) // If there have been any other pods with the same first "key", look through all the lists to see if any match // 基于上面的排序操作,如果真的存在相同的key,任何podContainerKeys 0号 索引不同的pod被认为是不重复的 if existing, ok := duplicateKeysMap[podContainerKeys[ 0 ]]; ok { matched := false for _, keys := range existing { if reflect.DeepEqual(keys, podContainerKeys) { matched = true klog.V( 3 ).InfoS( "Duplicate found" , "pod" , klog.KObj(pod)) for _, ownerRef := range ownerRefList { ownerKey := podOwner{ namespace: pod.ObjectMeta.Namespace, kind: ownerRef.Kind, name: ownerRef.Name, imagesHash: imagesHash, } // 添加到指定的duplicatePods[ownerKey]中去,key是ownerKey, value是要驱逐的pod的切片 if _, ok := duplicatePods[ownerKey]; !ok { duplicatePods[ownerKey] = make(map[string][]*v1.Pod) } duplicatePods[ownerKey][node.Name] = append(duplicatePods[ownerKey][node.Name], pod) } break } } if !matched { // Found no matches, add this list of keys to the list of lists that have the same first key duplicateKeysMap[podContainerKeys[ 0 ]] = append(duplicateKeysMap[podContainerKeys[ 0 ]], podContainerKeys) } } else { // This is the first pod we've seen that has this first "key" entry // 当Map中不存在某个Pod生成的重复性校验的key时(例如,这组pods中的第一个pod),就把它添加进去 duplicateKeysMap[podContainerKeys[ 0 ]] = [][]string{podContainerKeys} } } } // 1. how many pods can be evicted to respect uniform placement of pods among viable nodes? for ownerKey, podNodes := range duplicatePods { // 查看将要被驱逐的节点还可以再哪些节点上运行 targetNodes := getTargetNodes(podNodes, nodes) klog.V( 2 ).InfoS( "Adjusting feasible nodes" , "owner" , ownerKey, "from" , nodeCount, "to" , len(targetNodes)) // 如果要被驱逐的pod可选用的node数量不足,就不进行本次的驱逐操作 if len(targetNodes) < 2 { klog.V( 1 ).InfoS( "Less than two feasible nodes for duplicates to land, skipping eviction" , "owner" , ownerKey) continue } upperAvg := int (math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes)))) for nodeName, pods := range podNodes { klog.V( 2 ).InfoS( "Average occurrence per node" , "node" , klog.KObj(nodeMap[nodeName]), "ownerKey" , ownerKey, "avg" , upperAvg) // list of duplicated pods does not contain the original referential pod if len(pods)+ 1 > upperAvg { // It's assumed all duplicated pods are in the same priority class // TODO(jchaloup): check if the pod has a different node to lend to for _, pod := range pods[upperAvg- 1 :] { if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods" ); err != nil { klog.ErrorS(err, "Error evicting pod" , "pod" , klog.KObj(pod)) break } } } } } } |