storm源码分析之任务分配--task assignment

在"storm源码分析之topology提交过程"一文最后,submitTopologyWithOpts函数调用了mk-assignments函数。该函数的主要功能就是进行topology的任务分配(task assignment)。mk-assignments函数定义如下:

;; get existing assignment (just the executor->node+port map) -> default to {}
;; filter out ones which have a executor timeout
;; figure out available slots on cluster. add to that the used valid slots to get total slots. figure out how many executors should be in each slot (e.g., 4, 4, 4, 5)
;; only keep existing slots that satisfy one of those slots. for rest, reassign them across remaining slots
;; edge case for slots with no executor timeout but with supervisor timeout... just treat these as valid slots that can be reassigned to. worst comes to worse the executor will timeout and won't assign here next time around
(defnk mk-assignments [nimbus :scratch-topology-id nil]
 ;; Recomputes the executor -> [node port] assignment for the topologies returned by
 ;; .active-storms, stores every assignment that differs from the existing one via
 ;; .set-assignment!, and finally hands the newly added slots to INimbus.assignSlots.
 ;; scratch-topology-id (optional keyword arg): a topology whose existing assignment is
 ;; left out of existing-assignments so that it is scheduled from scratch.
 (let [conf (:conf nimbus)
       storm-cluster-state (:storm-cluster-state nimbus)
       ^INimbus inimbus (:inimbus nimbus)
       ;; read all the topologies
       topology-ids (.active-storms storm-cluster-state)
       ;; topologies: map of topology id -> value returned by read-topology-details
       topologies (into {} (for [tid topology-ids]
                             {tid (read-topology-details nimbus tid)}))
       ;; rebind topologies to a Topologies object wrapping that map
       topologies (Topologies. topologies)
       ;; read all the assignments
       ;; assigned-topology-ids: the ids returned by .assignments
       ;; (presumably the children of /assignments in zookeeper -- confirm in cluster.clj)
       assigned-topology-ids (.assignments storm-cluster-state nil)
       ;; existing-assignments: topology id -> assignment currently stored for it
       existing-assignments (into {} (for [tid assigned-topology-ids]
                                       ;; for the topology which wants rebalance (specified by the scratch-topology-id)
                                       ;; we exclude its assignment, meaning that all the slots occupied by its assignment
                                       ;; will be treated as free slot in the scheduler code.
                                       (when (or (nil? scratch-topology-id) (not= tid scratch-topology-id))
                                         ;; .assignment-info returns the stored assignment for tid; the keys read
                                         ;; from it below match
                                         ;; (defrecord Assignment [master-code-dir node->host executor->node+port executor->start-time-secs])
                                         {tid (.assignment-info storm-cluster-state tid nil)})))
       ;; make the new assignments for topologies
       ;; topology->executor->node+port: {topology-id -> {executor [node port]}} as
       ;; returned by compute-new-topology->executor->node+port
       topology->executor->node+port (compute-new-topology->executor->node+port
                                      nimbus
                                      existing-assignments
                                      topologies
                                      scratch-topology-id)

       ;; now-secs: current time, used as the start time of (re)assigned executors
       now-secs (current-time-secs)
       ;; NOTE: this local shadows the basic-supervisor-details-map function it calls;
       ;; presumably a map of supervisor id -> SupervisorDetails -- confirm
       basic-supervisor-details-map (basic-supervisor-details-map storm-cluster-state)

       ;; construct the final Assignments by adding start-times etc into it
       ;; new-assignments: topology-id -> Assignment
       new-assignments (into {} (for [[topology-id executor->node+port] topology->executor->node+port
                                       ;; existing-assignment: the stored assignment for topology-id;
                                       ;; nil when the id is absent from existing-assignments
                                       ;; (e.g. a topology that has never been assigned)
                                       :let [existing-assignment (get existing-assignments topology-id)
                                             ;; all-nodes: set of node ids used by the new assignment, e.g. #{node1 node2}
                                             all-nodes (->> executor->node+port vals (map first) set)
                                             ;; node->host: node id -> host name for every node INimbus can
                                             ;; resolve; nodes for which .getHostName returns nil are dropped
                                             node->host (->> all-nodes
                                                             (mapcat (fn [node]
                                                                       (if-let [host (.getHostName inimbus basic-supervisor-details-map node)]
                                                                         [[node host]]
                                                                         )))
                                                             (into {}))
                                             ;; all-node->host: previously stored node->host merged with the freshly
                                             ;; resolved one (fresh entries win), so hosts of nodes that could not be
                                             ;; resolved right now are still available from the old assignment
                                             all-node->host (merge (:node->host existing-assignment) node->host)
                                             ;; reassign-executors: result of changed-executors on the old and new
                                             ;; executor->node+port maps (presumably the executors whose slot changed)
                                             reassign-executors (changed-executors (:executor->node+port existing-assignment) executor->node+port)
                                             ;; start-times: old start times overlaid with now-secs for every
                                             ;; reassigned executor
                         start-times (merge (:executor->start-time-secs existing-assignment)
                                                               (into {}
                                                                     (for [id reassign-executors]
                                                                       [id now-secs]
                                                                       )))]]
                  ;; emit one {topology-id Assignment} entry; constructor argument order follows
                  ;; (defrecord Assignment [master-code-dir node->host executor->node+port executor->start-time-secs])
                  {topology-id (Assignment.
                                                ;; master-stormdist-root: presumably the topology's code directory on nimbus
                                                (master-stormdist-root conf topology-id)
                                                ;; keep only the host entries of nodes used by the new assignment
                                                (select-keys all-node->host all-nodes)
                                                executor->node+port
                                                start-times)}))]

   ;; tasks figure out what tasks to talk to by looking at topology at runtime
   ;; only log/set when there's been a change to the assignment
   (doseq [[topology-id assignment] new-assignments
           ;; NOTE(review): topology-details is bound here but not used in the loop body
           :let [existing-assignment (get existing-assignments topology-id)
                 topology-details (.getById topologies topology-id)]]
     ;; write the assignment back through storm-cluster-state only when it differs
     ;; from the stored one
     (if (= existing-assignment assignment)
       (log-debug "Assignment for " topology-id " hasn't changed")
       (do
         (log-message "Setting new assignment for topology id " topology-id ": " (pr-str assignment))
         (.set-assignment! storm-cluster-state topology-id assignment)
         )))
   (->> new-assignments
         (map (fn [[topology-id assignment]]
           (let [existing-assignment (get existing-assignments topology-id)]
             ;; newly-added-slots: presumably the [node port] pairs present in the new assignment
             ;; but not in the old one; each pair is converted with to-worker-slot
             [topology-id (map to-worker-slot (newly-added-slots existing-assignment assignment))]
             )))
         ;; collect into a map of topology-id -> seq of converted slots
         (into {})
         ;; hand the newly added slots to the INimbus implementation
         ;; (the article notes assignSlots was an empty implementation in the version it analysed)
         (.assignSlots inimbus topologies))
   ))

函数的注释阐述了该函数的主要作用。conf绑定nimbus的配置信息,inimbus绑定INimbus接口的实现对象,topology-ids绑定所有状态为"active"的topology id集合。active-storms函数获取zookeeper中/storms/的所有children(即所有topology id),/storms/{topology-id}中存放当前正在运行的topology信息,保存的内容参考common.clj中的记录类型StormBase(defrecord StormBase [storm-name launch-time-secs status num-workers component->executors])。topologies先绑定topology id->TopologyDetails对象键值对的map,随后被重新绑定为包装该map的Topologies对象。read-topology-details函数根据每个topology id生成一个对应的TopologyDetails对象。TopologyDetails类如下:

/**
 * Holds the data kept about one topology in this excerpt: its id, configuration,
 * StormTopology structure, executor -> component id mapping and worker count.
 * Accessors and constructors are elided in this excerpt.
 */
public class TopologyDetails {
   String topologyId;
   Map topologyConf;
   StormTopology topology;
   // maps each executor to the id of the component it runs
   Map<ExecutorDetails, String> executorToComponent;
   int numWorkers;
   
       // getters and setters (elided in this excerpt)
       ... ...
       // constructors (elided in this excerpt)
       ... ...
       
   /**
    * Returns the executor -> component id entries for the given executors.
    * Executors that have no entry in executorToComponent are left out of the result.
    *
    * @param executors executors to look up
    * @return a new map containing only the executors that are known to this topology
    */
   public Map<ExecutorDetails, String> selectExecutorToComponent(Collection<ExecutorDetails> executors) {
       Map<ExecutorDetails, String> ret = new HashMap<ExecutorDetails, String>(executors.size());
       for (ExecutorDetails executor : executors) {
           String compId = this.executorToComponent.get(executor);
           if (compId != null) {
               ret.put(executor, compId);
           }
       }
       
       return ret;
   }
   
   /**
    * @return the key set of executorToComponent, i.e. every executor of this topology
    *         (a view backed by the internal map, not a copy)
    */
   public Collection<ExecutorDetails> getExecutors() {
       return this.executorToComponent.keySet();
   }
}

Topologies类定义如下:

public class Topologies {
   Map<String, TopologyDetails> topologies;
   Map<String, String> nameToId;
   
   public Topologies(Map<String, TopologyDetails> topologies) {
       if(topologies==null) topologies = new HashMap();
       this.topologies = new HashMap<String, TopologyDetails>(topologies.size());
       this.topologies.putAll(topologies);
       this.nameToId = new HashMap<String, String>(topologies.size());
       
       for (String topologyId : topologies.keySet()) {
           TopologyDetails topology = topologies.get(topologyId);
           this.nameToId.put(topology.getName(), topologyId);
       }
   }
   
   public TopologyDetails getById(String topologyId) {
       return this.topologies.get(topologyId);
   }
   
   public TopologyDetails getByName(String topologyName) {
       String topologyId = this.nameToId.get(topologyName);
       
       if (topologyId == null) {
           return null;
       } else {
           return this.getById(topologyId);
       }
   }
   
   public Collection<TopologyDetails> getTopologies() {
       return this.topologies.values();
   }
}

read-topology-details函数定义如下:

(defn read-topology-details
  "Builds the TopologyDetails object for storm-id from the stored StormBase,
  the topology configuration, the topology structure and the
  executor -> component mapping."
  [nimbus storm-id]
  (let [nimbus-conf (:conf nimbus)
        base (.storm-base (:storm-cluster-state nimbus) storm-id nil)
        merged-conf (read-storm-conf nimbus-conf storm-id)
        user-topology (read-storm-topology nimbus-conf storm-id)
        ;; turns an executor id [start-task end-task] into an ExecutorDetails key
        ->executor-details (fn [[start-task end-task]]
                             (ExecutorDetails. (int start-task) (int end-task)))
        details->component (map-key ->executor-details
                                    (compute-executor->component nimbus storm-id))]
    (TopologyDetails. storm-id
                      merged-conf
                      user-topology
                      (:num-workers base)
                      details->component)))

storm-base绑定topology id所对应的StormBase对象。storm-base函数定义在cluster.clj文件的StormClusterState协议中,定义如下:

(storm-base
        [this storm-id callback]
        ;; remember the callback for this topology, if one was supplied
        (when callback
          (swap! storm-base-callback assoc storm-id callback))
        ;; fetch the data stored under the topology's storm path (the last argument is
        ;; true only when a callback was given) and pass it through maybe-deserialize
        (-> cluster-state
            (get-data (storm-path storm-id) (not-nil? callback))
            maybe-deserialize))

maybe-deserialize函数将get-data函数从zookeeper的/storms/{topology id}获取的二进制数据反序列化成StormBase对象。topology-conf绑定topology的配置信息,read-storm-conf函数从/nimbus/stormdist/{topology id}读取stormconf.ser信息并合并到conf中返回,read-storm-conf函数定义如下:

(defn- read-storm-conf
  "Deserializes the topology's stored configuration file and merges it on top
  of conf (topology entries override conf entries)."
  [conf storm-id]
  (let [conf-file (File. (master-stormconf-path (master-stormdist-root conf storm-id)))
        topology-overrides (Utils/deserialize (FileUtils/readFileToByteArray conf-file))]
    (merge conf topology-overrides)))

topology绑定read-storm-topology函数获取的topology对象,read-storm-topology函数从/nimbus/stormdist/{topology id}读取stormcode.ser信息并反序列化成一个topology对象。定义如下:

(defn- read-storm-topology
  "Reads the topology's stored code file from its stormdist directory and
  deserializes it."
  [conf storm-id]
  (-> (master-stormdist-root conf storm-id)
      master-stormcode-path
      File.
      FileUtils/readFileToByteArray
      Utils/deserialize))

compute-executor->component函数定义如下:

(defn- compute-executor->component
  "Returns a map of executor id -> component id for storm-id,
  e.g. {[1 2] \"boltA\" [3 4] \"boltA\" [5 5] \"boltB\" [6 6] \"boltB\"}.
  An executor's component is looked up through its first task id."
  [nimbus storm-id]
  (let [conf (:conf nimbus)
        ;; executor ids as returned by compute-executors, e.g. ([1 2] [3 4] [5 5] [6 6])
        executors (compute-executors nimbus storm-id)
        topology (read-storm-topology conf storm-id)
        storm-conf (read-storm-conf conf storm-id)
        ;; task id -> component id, e.g. {1 "boltA", 2 "boltA", 5 "boltB"}
        task->component (storm-task-info topology storm-conf)
        executor->component (into {} (for [executor executors
                                           :let [start-task (first executor)
                                                 component (task->component start-task)]]
                                       {executor component}))]
    ;; The let previously had no body, so the function always returned nil;
    ;; return the computed map.
    executor->component))

compute-executors函数定义如下:

(defn- compute-executors
  "Returns the executor ids of storm-id as a seq of [start-task end-task] pairs,
  e.g. ([1 2] [3 4] [5 5] [6 6]). The tasks of each component are split into as
  many groups as the component has executors in the stored StormBase."
  [nimbus storm-id]
  (let [conf (:conf nimbus)
        ;; StormBase stored for this topology
        storm-base (.storm-base (:storm-cluster-state nimbus) storm-id nil)
        ;; component id -> number of executors, e.g. {"boltA" 2, "boltB" 2}
        component->executors (:component->executors storm-base)
        storm-conf (read-storm-conf conf storm-id)
        topology (read-storm-topology conf storm-id)
        ;; task id -> component id, e.g. {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA", 5 "boltB", 6 "boltB"}
        task->component (storm-task-info topology storm-conf)]
    ;; Reuse the task->component binding instead of calling storm-task-info a
    ;; second time with the same arguments.
    (->> task->component
         ;; group task ids by component, e.g. {"boltA" [3 4 2 1], "boltB" [5 6]}
         ;; (reverse-map is a helper from util.clj per the article)
         reverse-map
         ;; sort each component's task ids, e.g. {"boltA" (1 2 3 4), "boltB" (5 6)}
         (map-val sort)
         ;; pair each component's executor count with its task ids,
         ;; e.g. {"boltA" (2 [1 2 3 4]), "boltB" (2 [5 6])} (article's example)
         (join-maps component->executors)
         ;; split the task ids into that many groups,
         ;; e.g. {"boltA" [(1 2) (3 4)] "boltB" [(5) (6)]}
         (map-val (partial apply partition-fixed))
         ;; flatten to one seq of task groups: ((1 2) (3 4) (5) (6))
         (mapcat second)
         ;; convert each group to an executor id: ([1 2] [3 4] [5 5] [6 6])
         (map to-executor-id))))

storm-task-info函数定义如下:

(defn storm-task-info
  "Returns a map of task id -> component id. Task ids start at 1 and are handed
  out to components in order of component id, one id per configured task."
  [^StormTopology user-topology storm-conf]
  (let [;; reads the TOPOLOGY-TASKS entry from a component's conf
        num-tasks-of (comp #(get % TOPOLOGY-TASKS) component-conf)
        ;; component id -> number of tasks, e.g. {"boltA" 4, "boltB" 2}
        component->num-tasks (map-val num-tasks-of
                                      (all-components (system-topology! storm-conf user-topology)))
        ;; component ids repeated once per task, ordered by component id,
        ;; e.g. ("boltA" "boltA" "boltA" "boltA" "boltB" "boltB")
        component-per-task (mapcat (fn [[component-id num-tasks]]
                                     (repeat num-tasks component-id))
                                   (sort-by first component->num-tasks))
        ;; 1, 2, 3, ... kept as ints
        task-ids (iterate (comp int inc) (int 1))]
    ;; e.g. {1 "boltA", 2 "boltA", 3 "boltA", 4 "boltA", 5 "boltB", 6 "boltB"}
    (into {} (map vector task-ids component-per-task))))

compute-new-topology->executor->node+port函数定义如下:

(defn compute-new-topology->executor->node+port [nimbus existing-assignments topologies scratch-topology-id]
 ;; Refreshes the heartbeat cache, works out which executors are alive and which
 ;; slots are dead, wraps the surviving assignments in a Cluster object, runs the
 ;; configured scheduler on it and returns the resulting
 ;; {topology-id -> {executor [node port]}} map.
 (let [conf (:conf nimbus)
       storm-cluster-state (:storm-cluster-state nimbus)
       ;; topology->executors: topology id -> executor ids, computed only for topologies that
       ;; already have an assignment, e.g. {"topology-id-1" #{[1 2] [3 4] [5 5] [6 6]}}
       ;; (compute-topology->executors is defined elsewhere; per the article it calls
       ;; compute-executors for each id)
       topology->executors (compute-topology->executors nimbus (keys existing-assignments))
       ;; update the executors heartbeats first.
       ;; (pushes the executor heartbeats into nimbus' heartbeat cache)
       _ (update-all-heartbeats! nimbus existing-assignments topology->executors)
       ;; topology->alive-executors: topology id -> alive executor ids
       topology->alive-executors (compute-topology->alive-executors nimbus
                                                                    existing-assignments
                                                                    topologies
                                                                    topology->executors
                                                                    scratch-topology-id)
       ;; supervisor->dead-ports: supervisor id -> set of ports whose executors are dead
       supervisor->dead-ports (compute-supervisor->dead-ports nimbus
                                                              existing-assignments
                                                              topology->executors
                                                              topology->alive-executors)
       ;; topology->scheduler-assignment: topology id -> SchedulerAssignmentImpl built from
       ;; the alive executors of the existing assignments
       topology->scheduler-assignment (compute-topology->scheduler-assignment nimbus
                                                                              existing-assignments
                                                                              topology->alive-executors)
       ;; missing-assignment-topologies: ids of topologies that need (re)scheduling
       ;; (topologies is a Topologies object)
       missing-assignment-topologies (->> topologies
                                          ;; all TopologyDetails objects held by the Topologies object
                                          .getTopologies
                                          ;; memfn wraps the Java method getId as a Clojure fn; yields the topology ids
                                          (map (memfn getId))
                                          (filter (fn [t]
                                                     ;; alle: all executors known for topology t (nil when t has
                                                     ;; no existing assignment, since topology->executors only
                                                     ;; covers assigned topologies)
                                                     (let [alle (get topology->executors t)
                                                           ;; alivee: the alive executors of topology t
                                                           alivee (get topology->alive-executors t)]
                                                           ;; t is kept when any of these holds:
                                                           ;;  - it has no known executors (nothing assigned yet)
                                                           ;;  - some of its executors are not alive
                                                           ;;  - it uses fewer workers than its configured number of workers
                                                           (or (empty? alle)
                                                               (not= alle alivee)
                                                               (< (-> topology->scheduler-assignment
                                                                      (get t)
                                                                      num-used-workers )
                                                                  (-> topologies (.getById t) .getNumWorkers)
                                                                  ))
                                                           ))))
       ;; all-scheduling-slots: supervisor id -> set of ports, e.g. {node1 #{port1 port2} node2 #{port1}}.
       ;; The all-scheduling-slots function (shadowed by this local) yields [node port] pairs;
       ;; each pair becomes {node #{port}} and the maps are merged with set/union.
       ;; NOTE: the result is nil when there are no pairs at all.
       all-scheduling-slots (->> (all-scheduling-slots nimbus topologies missing-assignment-topologies)
                                 (map (fn [[node-id port]] {node-id #{port}}))
                                 (apply merge-with set/union))
       ;; supervisors: presumably supervisor id -> SupervisorDetails (read-all-supervisor-details
       ;; is defined elsewhere -- confirm)
       supervisors (read-all-supervisor-details nimbus all-scheduling-slots supervisor->dead-ports)
       ;; cluster: Cluster object holding the supervisors and the surviving assignments
       cluster (Cluster. (:inimbus nimbus) supervisors topology->scheduler-assignment)

       ;; call scheduler.schedule to schedule all the topologies
       ;; the new assignments for all the topologies are in the cluster object.
       ;; topologies covers every topology read from the cluster state, while cluster only
       ;; carries assignments of topologies that were already assigned.
       ;; (Per the article, (:scheduler nimbus) is created by mk-scheduler and is
       ;; DefaultScheduler unless a custom scheduler is configured.)
       _ (.schedule (:scheduler nimbus) topologies cluster)
       ;; new-scheduler-assignments: assignments held by cluster after scheduling
       new-scheduler-assignments (.getAssignments cluster)
       ;; add more information to convert SchedulerAssignment to Assignment
       ;; new-topology->executor->node+port: {topology-id -> {executor [node port]}}
       new-topology->executor->node+port (compute-topology->executor->node+port new-scheduler-assignments)]
   ;; print some useful information.
   (doseq [[topology-id executor->node+port] new-topology->executor->node+port
           ;; old-executor->node+port: the mapping stored before this scheduling round
           :let [old-executor->node+port (-> topology-id
                                         existing-assignments
                                         :executor->node+port)
                 ;; reassignment: entries of the new mapping whose executor existed before
                 ;; but now sits on a different [node port]
                 reassignment (filter (fn [[executor node+port]]
                                        (and (contains? old-executor->node+port executor)
                                             (not (= node+port (old-executor->node+port executor)))))
                                      executor->node+port)]]
     ;; log only when at least one executor moved; new-slots-cnt is the number of distinct
     ;; slots used by the whole new assignment (not just the newly added ones)
     (when-not (empty? reassignment)
       (let [new-slots-cnt (count (set (vals executor->node+port)))
             reassign-executors (keys reassignment)]
         (log-message "Reassigning " topology-id " to " new-slots-cnt " slots")
         (log-message "Reassign executors: " (vec reassign-executors)))))
   ;; return the new assignment map
   new-topology->executor->node+port))

update-all-heartbeats!函数定义如下:

(defn- update-all-heartbeats!
  "Updates the heartbeat cache for the executors of every topology in
  existing-assignments.

  existing-assignments: topology id -> assignment
  topology->executors:  topology id -> executor ids of that topology"
  ;; The docstring used to sit after the argument vector, where it was just an
  ;; evaluated-and-discarded string; it now precedes the arguments so it is
  ;; attached as :doc metadata.
  [nimbus existing-assignments topology->executors]
  (doseq [[tid assignment] existing-assignments
          :let [all-executors (topology->executors tid)]]
    (update-heartbeats! nimbus tid all-executors assignment)))

update-heartbeats!函数定义如下:

(defn update-heartbeats!
  "Reads the executor heartbeats of storm-id for the executors of
  existing-assignment and stores the refreshed per-executor cache entry under
  storm-id in nimbus' :heartbeats-cache atom."
  [nimbus storm-id all-executors existing-assignment]
  (log-debug "Updating heartbeats for " storm-id " " (pr-str all-executors))
  (let [cache-atom (:heartbeats-cache nimbus)
        ;; heartbeats of the executors that are actually assigned
        beats (.executor-beats (:storm-cluster-state nimbus)
                               storm-id
                               (:executor->node+port existing-assignment))
        timeout-secs ((:conf nimbus) NIMBUS-TASK-TIMEOUT-SECS)
        ;; cache entry of this topology, rebuilt from the previous entry and the new beats
        refreshed (update-heartbeat-cache (@cache-atom storm-id)
                                          beats
                                          all-executors
                                          timeout-secs)]
    (swap! cache-atom assoc storm-id refreshed)))

executor-beats函数定义如下:

(executor-beats
        [this storm-id executor->node+port]
        ;; need to take executor->node+port in explicitly so that we don't run into a situation where a
        ;; long dead worker with a skewed clock overrides all the timestamps. By only checking heartbeats
        ;; with an assigned node+port, and only reading executors from that heartbeat that are actually assigned,
        ;; we avoid situations like that
        (->> (reverse-map executor->node+port)
             ;; one entry per assigned slot: [[node port] executors-on-that-slot]
             (map (fn [[[node port] executors]]
                    ;; read that worker's heartbeat and keep only the beats of the
                    ;; executors assigned to it
                    (convert-executor-beats executors
                                            (get-worker-heartbeat this storm-id node port))))
             ;; merge the per-worker maps into a single executor -> heartbeat map
             (apply merge)))

convert-executor-beats函数定义如下:

(defn convert-executor-beats
  "Extracts per-executor heartbeats from one worker heartbeat. Only executors
  that appear both in executors and in the worker's :executor-stats are
  returned; each gets the worker's :time-secs and :uptime plus its own stats."
  [executors worker-hb]
  (let [stats-by-executor (:executor-stats worker-hb)]
    (into {}
          (for [executor executors
                ;; skip executors this worker did not report on
                :when (contains? stats-by-executor executor)]
            [executor {:time-secs (:time-secs worker-hb)
                       :uptime (:uptime worker-hb)
                       :stats (get stats-by-executor executor)}]))))

update-heartbeat-cache函数定义如下:

(defn update-heartbeat-cache
  "Returns a fresh cache map with exactly one entry per executor in
  all-executors. Each entry is computed by update-executor-cache from the
  executor's previous entry (if any) and its latest heartbeat (if any)."
  [cache executor-beats all-executors timeout]
  (let [previous (select-keys cache all-executors)
        refresh (fn [executor]
                  [executor (update-executor-cache (previous executor)
                                                   (get executor-beats executor)
                                                   timeout)])]
    (into {} (map refresh all-executors))))

该函数主要调用了update-executor-cache函数,update-executor-cache函数定义如下:

(defn- update-executor-cache
  "Computes the new cache entry of a single executor.

  curr:    previous entry ({:nimbus-time .. :executor-reported-time ..}) or nil
  hb:      latest heartbeat of the executor or nil
  timeout: number of seconds without a new report after which the executor
           counts as timed out

  Returns {:is-timed-out .. :nimbus-time .. :executor-reported-time ..}."
  [curr hb timeout]
  (let [last-nimbus-time (:nimbus-time curr)
        last-reported-time (:executor-reported-time curr)
        ;; time reported by the executor; falls back to the last known report, then to 0
        reported-time (or (:time-secs hb) last-reported-time 0)
        ;; keep the old nimbus time only while the reported time has not changed;
        ;; a first entry or a new report resets it to the current time
        nimbus-time (if (and last-nimbus-time
                             (= last-reported-time reported-time))
                      last-nimbus-time
                      (current-time-secs))]
    ;; timed out once the time elapsed since nimbus-time reaches timeout
    {:is-timed-out (and
                     nimbus-time
                     (>= (time-delta nimbus-time) timeout))
     :nimbus-time nimbus-time
     :executor-reported-time reported-time}))

compute-topology->alive-executors函数定义如下:

(defn- compute-topology->alive-executors
  "Computes a topology-id -> alive executors map for every topology in
  existing-assignments. For the topology named by scratch-topology-id all
  executors are returned unfiltered; for every other topology the result is
  the set produced by the alive-executors function."
  ;; The docstring used to sit after the argument vector, where it was only an
  ;; evaluated-and-discarded string; it now precedes the arguments so it is
  ;; attached as :doc metadata.
  [nimbus existing-assignments topologies topology->executors scratch-topology-id]
  (into {} (for [[tid assignment] existing-assignments
                 :let [topology-details (.getById topologies tid)
                       all-executors (topology->executors tid)
                       ;; named `alive` so the local no longer shadows the alive-executors function
                       alive (if (and scratch-topology-id (= scratch-topology-id tid))
                               all-executors
                               (set (alive-executors nimbus topology-details all-executors assignment)))]]
             {tid alive})))

alive-executors函数定义如下:

(defn- alive-executors
  "Returns the executors of all-executors that are considered alive. An
  executor is alive when it has a start time in existing-assignment and either
  it started less than NIMBUS-TASK-LAUNCH-SECS ago or its heartbeat cache entry
  is not marked as timed out. Every executor found not alive is logged."
  [nimbus ^TopologyDetails topology-details all-executors existing-assignment]
  (log-debug "Computing alive executors for " (.getId topology-details) "\n"
             "Executors: " (pr-str all-executors) "\n"
             "Assignment: " (pr-str existing-assignment) "\n"
             "Heartbeat cache: " (pr-str (@(:heartbeats-cache nimbus) (.getId topology-details))))
  ;; TODO: need to consider all executors associated with a dead executor (in same slot) dead as well,
  ;; don't just rely on heartbeat being the same
  (let [conf (:conf nimbus)
        storm-id (.getId topology-details)
        ;; executor -> start time taken from the existing assignment
        start-times (:executor->start-time-secs existing-assignment)
        ;; heartbeat cache entry of this topology
        topology-cache (@(:heartbeats-cache nimbus) storm-id)
        alive? (fn [executor]
                 (let [start-time (get start-times executor)
                       timed-out (-> topology-cache (get executor) :is-timed-out)
                       ;; the launch grace period and the heartbeat timeout are alternatives:
                       ;; within the grace period the timeout flag is ignored
                       alive (and start-time
                                  (or (< (time-delta start-time)
                                         (conf NIMBUS-TASK-LAUNCH-SECS))
                                      (not timed-out)))]
                   (when-not alive
                     (log-message "Executor " storm-id ":" executor " not alive"))
                   (boolean alive)))]
    ;; doall forces the lazy filter so the logging happens here
    (doall (filter alive? all-executors))))

compute-supervisor->dead-ports函数定义如下:

(defn- compute-supervisor->dead-ports
  "Returns a map of supervisor id -> set of ports that host at least one dead
  executor (an executor in topology->executors but not in
  topology->alive-executors). Returns {} when there are no dead executors."
  [nimbus existing-assignments topology->executors topology->alive-executors]
  (let [;; [node port] of every dead executor over all topologies; may contain duplicates
        dead-slots (mapcat (fn [[tid assignment]]
                             (let [dead-executors (set/difference (topology->executors tid)
                                                                  (topology->alive-executors tid))]
                               (for [[executor node+port] (:executor->node+port assignment)
                                     :when (contains? dead-executors executor)]
                                 node+port)))
                           existing-assignments)
        ;; ({node1 #{port1}} {node1 #{port2}} ...) merged into {node1 #{port1 port2} ...};
        ;; merging with set/union also removes the duplicates
        supervisor->dead-ports (apply merge-with set/union
                                      (map (fn [[sid port]] {sid #{port}}) dead-slots))]
    ;; merge-with yields nil when there is nothing to merge
    (or supervisor->dead-ports {})))

compute-topology->scheduler-assignment函数定义如下:

(defn- compute-topology->scheduler-assignment
  "convert assignment information in zk to SchedulerAssignment, so it can be used by scheduler api.

  Returns a map of topology id -> SchedulerAssignmentImpl that contains only
  the executors listed in topology->alive-executors for that topology."
  ;; The docstring used to sit after the argument vector, where it was only an
  ;; evaluated-and-discarded string; it now precedes the arguments so it is
  ;; attached as :doc metadata.
  [nimbus existing-assignments topology->alive-executors]
  (into {} (for [[tid assignment] existing-assignments
                 :let [alive-executors (topology->alive-executors tid)
                       executor->node+port (:executor->node+port assignment)
                       ;; ExecutorDetails -> WorkerSlot for every alive executor
                       executor->slot (into {} (for [[executor [node port]] executor->node+port]
                                                 ;; filter out the dead executors
                                                 (if (contains? alive-executors executor)
                                                   {(ExecutorDetails. (first executor)
                                                                      (second executor))
                                                    (WorkerSlot. node port)}
                                                   {})))]]
             {tid (SchedulerAssignmentImpl. tid executor->slot)})))

SchedulerAssignmentImpl.java定义如下:

public class SchedulerAssignmentImpl implements SchedulerAssignment {
   /**
    * topology-id this assignment is for.
    */
   String topologyId;
   /**
    * assignment detail, a mapping from executor to <code>WorkerSlot</code>
    */
   Map<ExecutorDetails, WorkerSlot> executorToSlot;
   
   public SchedulerAssignmentImpl(String topologyId, Map<ExecutorDetails, WorkerSlot> executorToSlots) {
       this.topologyId = topologyId;
       this.executorToSlot = new HashMap<ExecutorDetails, WorkerSlot>(0);
       if (executorToSlots != null) {
           this.executorToSlot.putAll(executorToSlots);
       }
   }

   @Override
   public Set<WorkerSlot> getSlots() {
       return new HashSet(executorToSlot.values());
   }    
   
   /**
    * Assign the slot to executors.
    * @param slot
    * @param executors
    */
   public void assign(WorkerSlot slot, Collection<ExecutorDetails> executors) {
       for (ExecutorDetails executor : executors) {
           this.executorToSlot.put(executor, slot);
       }
   }
   
   /**
    * Release the slot occupied by this assignment.
    * @param slot
    */
   public void unassignBySlot(WorkerSlot slot) {
       List<ExecutorDetails> executors = new ArrayList<ExecutorDetails>();
       for (ExecutorDetails executor : this.executorToSlot.keySet()) {
           WorkerSlot ws = this.executorToSlot.get(executor);
           if (ws.equals(slot)) {
               executors.add(executor);
           }
       }
       
       // remove
       for (ExecutorDetails executor : executors) {
           this.executorToSlot.remove(executor);
       }
   }

   /**
    * Does this slot occupied by this assignment?
    * @param slot
    * @return
    */
   public boolean isSlotOccupied(WorkerSlot slot) {
       return this.executorToSlot.containsValue(slot);
   }

   public boolean isExecutorAssigned(ExecutorDetails executor) {
       return this.executorToSlot.containsKey(executor);
   }
   
   public String getTopologyId() {
       return this.topologyId;
   }

   public Map<ExecutorDetails, WorkerSlot> getExecutorToSlot() {
       return this.executorToSlot;
   }

   /**
    * Return the executors covered by this assignments
    * @return
    */
   public Set<ExecutorDetails> getExecutors() {
       return this.executorToSlot.keySet();
   }
}

all-scheduling-slots函数定义如下:

(defn- all-scheduling-slots
  [nimbus topologies missing-assignment-topologies]
  ;; Asks the INimbus implementation which worker slots may be scheduled and
  ;; returns them as a collection of [node-id port] pairs.
  (let [cluster-state (:storm-cluster-state nimbus)
        ^INimbus scheduling-hook (:inimbus nimbus)
        ;; supervisor id -> supervisor info, as returned by all-supervisor-info
        id->info (all-supervisor-info cluster-state nil)
        ;; One SupervisorDetails per supervisor, built from its id and the :meta
        ;; entry of its info (presumably the supervisor's port list; verify
        ;; against the supervisor code).
        details (dofor [[sup-id sup-info] id->info]
                  (SupervisorDetails. sup-id (:meta sup-info)))
        slots (.allSlotsAvailableForScheduling scheduling-hook
                                               details
                                               topologies
                                               (set missing-assignment-topologies))]
    ;; Unpack each WorkerSlot into a plain [node-id port] vector.
    (dofor [^WorkerSlot ws slots]
      [(.getNodeId ws) (.getPort ws)])))

SupervisorDetails类定义如下:

// Excerpt of the SupervisorDetails class: only the fields are shown, the
// constructors and accessors are elided by the article.
public class SupervisorDetails {

   // identifier of the supervisor
   String id;
   /**
    * hostname of this supervisor
    */
   String host;
   // NOTE(review): not documented in this excerpt; all-scheduling-slots passes the
   // supervisor info's :meta value here. Confirm its meaning against the callers.
   Object meta;
   /**
    * meta data configured for this supervisor
    */
   Object schedulerMeta;
   /**
    * all the ports of the supervisor
    */
   Set<Integer> allPorts;
   
   // constructors (elided in this excerpt)
   ... ...
   // getters and setters (elided in this excerpt)
   ... ...
}

read-all-supervisor-details函数定义如下:

(defn- read-all-supervisor-details
  "Return a map: {supervisor-id SupervisorDetails}.

  all-scheduling-slots   - map of supervisor id -> ports of that supervisor; it is
                           read with get and combined with set/difference.
  supervisor->dead-ports - invoked with a supervisor id to obtain the ports that
                           must be hidden from scheduling.

  Supervisors known to the cluster state get a SupervisorDetails built from their
  hostname, scheduler meta and remaining ports. Supervisor ids that appear only in
  all-scheduling-slots get a SupervisorDetails built from the id and their ports."
  [nimbus all-scheduling-slots supervisor->dead-ports]
  (let [storm-cluster-state (:storm-cluster-state nimbus)
        ;; supervisor id -> supervisor info, read from the cluster state
        supervisor-infos (all-supervisor-info storm-cluster-state)
        ;; entries of all-scheduling-slots whose supervisor id has no supervisor info
        ;; NOTE(review): the article expects this to be empty in practice - confirm
        nonexistent-supervisor-slots (apply dissoc all-scheduling-slots (keys supervisor-infos))
        all-supervisor-details (into {} (for [[sid supervisor-info] supervisor-infos
                                              :let [hostname (:hostname supervisor-info)
                                                    scheduler-meta (:scheduler-meta supervisor-info)
                                                    dead-ports (supervisor->dead-ports sid)
                                                    ;; hide the dead-ports from the all-ports
                                                    ;; these dead-ports can be reused in next round of assignments
                                                    all-ports (-> (get all-scheduling-slots sid)
                                                                  (set/difference dead-ports)
                                                                  ((fn [ports] (map int ports))))
                                                    supervisor-details (SupervisorDetails. sid hostname scheduler-meta all-ports)]]
                                          {sid supervisor-details}))]
    ;; Entries from nonexistent-supervisor-slots are merged last.
    (merge all-supervisor-details
           (into {}
                 (for [[sid ports] nonexistent-supervisor-slots]
                   [sid (SupervisorDetails. sid nil ports)])))))

Cluster类定义如下:

// Excerpt of the Cluster class: only the fields are shown, the methods are
// elided by the article.
public class Cluster {

   /**
    * key: supervisor id, value: supervisor details
    */
   private Map<String, SupervisorDetails>   supervisors;
   /**
    * key: topologyId, value: topology's current assignments.
    */
   private Map<String, SchedulerAssignmentImpl> assignments;

   /**
    * a map from hostname to the list of supervisor ids.
    */
   private Map<String, List<String>>        hostToId;
   
   // NOTE(review): presumably hosts excluded from scheduling - confirm in the elided methods
   private Set<String> blackListedHosts = new HashSet<String>();
   private INimbus inimbus;
   
   // other methods (elided in this excerpt)
   ... ...
   }

mk-scheduler函数定义如下:

(defn mk-scheduler [conf inimbus]
  ;; Chooses the scheduler, calls .prepare on it with conf and returns it.
  ;; Precedence: scheduler forced by INimbus, then the class configured under
  ;; STORM-SCHEDULER, then DefaultScheduler.
  (let [forced (.getForcedScheduler inimbus)
        chosen (if forced
                 (do (log-message "Using forced scheduler from INimbus " (class forced))
                     forced)
                 (if (conf STORM-SCHEDULER)
                   (do (log-message "Using custom scheduler: " (conf STORM-SCHEDULER))
                       (new-instance (conf STORM-SCHEDULER)))
                   (do (log-message "Using default scheduler")
                       (DefaultScheduler.))))]
    ;; prepare must run before the scheduler is handed to the caller
    (.prepare chosen conf)
    chosen))

DefaultScheduler.clj中的schedule方法定义如下:只是简单调用default-schedule函数

;; schedule method of DefaultScheduler: ignores `this` and forwards the
;; topologies and cluster to default-schedule, returning whatever it returns.
(defn -schedule [this ^Topologies topologies ^Cluster cluster]
 (default-schedule topologies cluster))

DefaultScheduler.clj中的default-schedule函数定义如下:

;; For every topology the cluster reports as needing scheduling: work out which of
;; its currently used slots should be given up, free them on the cluster, then let
;; EvenScheduler place the topology's executors.
(defn default-schedule [^Topologies topologies ^Cluster cluster]
 ;; topologies for which Cluster.needsSchedulingTopologies answers true
 (let [needs-scheduling-topologies (.needsSchedulingTopologies cluster topologies)]
   (doseq [^TopologyDetails topology needs-scheduling-topologies
           :let [topology-id (.getId topology)
                 ;; slots the cluster reports as available, as [node-id port] pairs
                 available-slots (->> (.getAvailableSlots cluster)
                                      (map #(vector (.getNodeId %) (.getPort %))))
                 ;; every executor of the topology as a [start-task end-task] pair, e.g. #{[1 2] [3 4]}
                 all-executors (->> topology
                                    .getExecutors
                                    (map #(vector (.getStartTask %) (.getEndTask %)))
                                    set)
                 ;; current placement grouped by slot, e.g. {[node1 port1] [[1 2] [3 4]]};
                 ;; empty when the cluster holds no assignment for this topology
                 alive-assigned (EvenScheduler/get-alive-assigned-node+port->executors cluster topology-id)
                 ;; set of all executors that appear in alive-assigned
                 alive-executors (->> alive-assigned vals (apply concat) set)
                 ;; the currently used slots that slots-can-reassign accepts
                 ;; NOTE(review): per the article this rejects blacklisted nodes and
                 ;; ports missing from the supervisor's port list - confirm in slots-can-reassign
                 can-reassign-slots (slots-can-reassign cluster (keys alive-assigned))
                 ;; slot budget: the requested worker count, capped by reusable + available slots
                 total-slots-to-use (min (.getNumWorkers topology)
                                         (+ (count can-reassign-slots) (count available-slots)))
                 ;; Slots to give up. Only computed when the budget exceeds the number of
                 ;; slots in use or some executor is not in alive-executors; otherwise
                 ;; nothing is freed. With an empty alive-assigned the bad-slots function
                 ;; has nothing to return. Note this local shadows the bad-slots function
                 ;; for the body below.
                 bad-slots (if (or (> total-slots-to-use (count alive-assigned))
                                   (not= alive-executors all-executors))
                               (bad-slots alive-assigned (count all-executors) total-slots-to-use)
                               [])]]
     ;; release the rejected slots on the cluster before placing executors
     (.freeSlots cluster bad-slots)
     ;; schedule just this topology, wrapped in a single-entry Topologies
     (EvenScheduler/schedule-topologies-evenly (Topologies. {topology-id topology}) cluster))))

Cluster类的needsSchedulingTopologies方法定义如下:参数topologies绑定了storm集群上所有的topology

public List<TopologyDetails> needsSchedulingTopologies(Topologies topologies) {
       List<TopologyDetails> ret = new ArrayList<TopologyDetails>();
       for (TopologyDetails topology : topologies.getTopologies()) {
           // needsScheduling方法判断topology是否需要assign,needsScheduling方法参见定义部分
           if (needsScheduling(topology)) {
               ret.add(topology);
           }
       }

       return ret;
   }

Cluster类的needsScheduling方法定义如下:

/**
* Does the topology need scheduling?
*
* A topology needs scheduling if one of the following conditions holds:
* <ul>
*       如果topology已经assign了,但是分配的线程少于期望的线程("死亡"slot或者可用slot不够),则该topology需要assign
*   <li>Although the topology is assigned slots, but is squeezed. i.e. the topology is assigned less slots than desired.</li>
       该topology没有分配,则需要assign
*   <li>There are unassigned executors in this topology</li>
* </ul>
*/
public boolean needsScheduling(TopologyDetails topology) {
   int desiredNumWorkers = topology.getNumWorkers();
   // this指向当前Cluster对象,Cluster对象存放了当前storm集群运行的topology的assignment信息,getAssignedNumWorkers方法获取指定topology的实际进程数
   int assignedNumWorkers = this.getAssignedNumWorkers(topology);

   if (desiredNumWorkers > assignedNumWorkers) {
       return true;
   }
   // getUnassignedExecutors方法获取指定topology的未assign的Executors的集合
   // 对于我们刚刚提交的topology,该条件一定为true;还有一种情况当前集群上正在运行的某个topology的某个executor因为"心跳"超时,也会使该条件为true
   // getUnassignedExecutors方法参见其定义部分
   return this.getUnassignedExecutors(topology).size() > 0;
}

Cluster类的getUnassignedExecutors方法定义如下:

public Collection<ExecutorDetails> getUnassignedExecutors(TopologyDetails topology) {
       if (topology == null) {
           return new ArrayList<ExecutorDetails>(0);
       }
       // ret暂存topology定义中需要的executor
       Collection<ExecutorDetails> ret = new HashSet(topology.getExecutors());
       // 获取指定topology的SchedulerAssignment
       SchedulerAssignment assignment = this.getAssignmentById(topology.getId());
       if (assignment != null) {
           // 获取storm集群实际分配给该topology的executor
           Set<ExecutorDetails> assignedExecutors = assignment.getExecutors();
           // ret删除实际分配的executor,保留未分配的executor
           ret.removeAll(assignedExecutors);
       }
       
       return ret;
   }

EvenScheduler.clj中的get-alive-assigned-node+port->executors函数定义如下:

(defn get-alive-assigned-node+port->executors [cluster topology-id]
  ;; Reads the topology's assignment from the cluster and returns it grouped by
  ;; slot, as produced by reverse-map, e.g. {[node1 port1] [[1 2] [3 4]]}.
  ;; A topology without an assignment is treated as having no executors placed.
  (let [assignment (.getAssignmentById cluster topology-id)
        slot-by-executor (if assignment
                           (.getExecutorToSlot assignment)
                           {})
        ;; [start-task end-task] -> [node-id port]
        executor->node+port (into {}
                                  (for [[^ExecutorDetails ed ^WorkerSlot ws] slot-by-executor]
                                    {[(.getStartTask ed) (.getEndTask ed)]
                                     [(.getNodeId ws) (.getPort ws)]}))]
    (reverse-map executor->node+port)))

bad-slots函数定义如下:

;; Returns, as WorkerSlot objects, the entries of existing-slots that do not fit
;; the target distribution.
;;   existing-slots - map of [node port] -> list of executors currently on that slot
;;   num-executors  - executor count handed to integer-divided
;;   num-workers    - slot count handed to integer-divided; 0 yields an empty list
;; An empty existing-slots always yields an empty result.
(defn- bad-slots [existing-slots num-executors num-workers]
 (if (= 0 num-workers)
   '()
   ;; distribution: executors-per-slot -> number of slots still allowed at that size.
   ;; Per the article, (integer-divided 5 2) gives {2 1, 3 1} - confirm against integer-divided.
   (let [distribution (atom (integer-divided num-executors num-workers))
         ;; slots whose executor count matched an open entry of the distribution
         keepers (atom {})]
     ;; Keep a slot when its executor count still has quota, and consume that quota.
     ;; Which of several equally sized slots is kept depends on iteration order.
     (doseq [[node+port executor-list] existing-slots :let [executor-count (count executor-list)]]
       (when (pos? (get @distribution executor-count 0))
         (swap! keepers assoc node+port executor-list)
         (swap! distribution update-in [executor-count] dec)
         ))
     ;; Everything that was not kept is returned as a WorkerSlot.
     (->> @keepers
          keys
          (apply dissoc existing-slots)
          keys
          (map (fn [[node port]]
                 (WorkerSlot. node port)))))))

Cluster类中的freeSlots方法定义如下:循环调用了freeSlot方法

public void freeSlots(Collection<WorkerSlot> slots) {
       if(slots!=null) {
           for (WorkerSlot slot : slots) {
               this.freeSlot(slot);
           }
       }
   }

Cluster类中的freeSlot方法定义如下:判断该slot是否被占用(cluster中SchedulerAssignmentImpl的executorToSlot是否包含该slot),如果占用(executorToSlot包含slot)则释放占用,所谓释放占用就是从SchedulerAssignmentImpl的executorToSlot中删除该slot对应的键值对。

public void freeSlot(WorkerSlot slot) {
       // remove the slot from the existing assignments
       for (SchedulerAssignmentImpl assignment : this.assignments.values()) {
           if (assignment.isSlotOccupied(slot)) {
               assignment.unassignBySlot(slot);
           }
       }
   }

SchedulerAssignmentImpl类中的unassignBySlot方法定义如下(freeSlot方法正是通过SchedulerAssignmentImpl对象调用它的):

public void unassignBySlot(WorkerSlot slot) {
       List<ExecutorDetails> executors = new ArrayList<ExecutorDetails>();
       for (ExecutorDetails executor : this.executorToSlot.keySet()) {
           WorkerSlot ws = this.executorToSlot.get(executor);
           if (ws.equals(slot)) {
               executors.add(executor);
           }
       }
       
       // remove
       for (ExecutorDetails executor : executors) {
           this.executorToSlot.remove(executor);
       }
   }

EvenScheduler.clj中的schedule-topologies-evenly函数定义如下:

;; For each topology the cluster still reports as needing scheduling, computes the
;; additional placement with schedule-topology and records it on the cluster.
;; NOTE(review): the article explains the repeated needsSchedulingTopologies call by
;; default-schedule having freed slots on the cluster beforehand - confirm.
(defn schedule-topologies-evenly [^Topologies topologies ^Cluster cluster]
  (doseq [^TopologyDetails topology (.needsSchedulingTopologies cluster topologies)]
    (let [topology-id (.getId topology)
          ;; newly placed executors, grouped by their [node port] slot
          slot->executors (reverse-map (schedule-topology topology cluster))]
      (doseq [[node+port executor-pairs] slot->executors]
        (let [^WorkerSlot slot (WorkerSlot. (first node+port) (last node+port))
              ;; [start-task end-task] pairs become ExecutorDetails objects
              executors (map (fn [[start-task end-task]]
                               (ExecutorDetails. start-task end-task))
                             executor-pairs)]
          ;; register the additional placement on the cluster
          (.assign cluster slot topology-id executors))))))

schedule-topology函数定义如下:

(defn- schedule-topology [^TopologyDetails topology ^Cluster cluster]
  ;; Computes the additional placement for one topology and returns it as a map
  ;; of [start-task end-task] -> [node port]. Nothing is written to the cluster.
  (let [topology-id (.getId topology)
        ;; slots the cluster reports as available, as [node-id port] pairs
        free-slots (->> (.getAvailableSlots cluster)
                        (map (fn [s] (vector (.getNodeId s) (.getPort s)))))
        ;; every executor of the topology as a [start-task end-task] pair
        wanted-executors (->> topology
                              .getExecutors
                              (map (fn [e] (vector (.getStartTask e) (.getEndTask e))))
                              set)
        ;; current placement grouped by slot; empty when nothing is assigned yet
        slot->alive (get-alive-assigned-node+port->executors cluster topology-id)
        ;; slot budget: the requested worker count, capped by free + already used slots
        slot-budget (min (.getNumWorkers topology)
                         (+ (count free-slots) (count slot->alive)))
        ;; additional slots to hand out, taken from the front of (sort-slots free-slots)
        ;; NOTE(review): per the article sort-slots interleaves slots across nodes - confirm
        extra-slots (take (- slot-budget (count slot->alive))
                          (sort-slots free-slots))
        placed-executors (set (apply concat (vals slot->alive)))
        ;; executors without a slot, in sorted order
        pending-executors (sort (set/difference wanted-executors placed-executors))
        ;; Pair each pending executor with a slot from the repeated slot sequence.
        ;; for some reason it goes into infinite loop without limiting the repeat-seq
        reassignment (into {}
                           (map vector
                                pending-executors
                                (repeat-seq (count pending-executors) extra-slots)))]
    (when (seq reassignment)
      (log-message "Available slots: " (pr-str free-slots)))
    reassignment))

Cluster类中的assign方法定义如下:该方法主要功能就是将新增分配信息添加到cluster的SchedulerAssignmentImpl的executorToSlot中。通过调用assign方法,我们刚刚提交的topology的分配信息也添加到了cluster中。

public void assign(WorkerSlot slot, String topologyId, Collection<ExecutorDetails> executors) {
       if (this.isSlotOccupied(slot)) {
           throw new RuntimeException("slot: [" + slot.getNodeId() + ", " + slot.getPort() + "] is already occupied.");
       }
       
       SchedulerAssignmentImpl assignment = (SchedulerAssignmentImpl)this.getAssignmentById(topologyId);
       if (assignment == null) {
           assignment = new SchedulerAssignmentImpl(topologyId, new HashMap<ExecutorDetails, WorkerSlot>());
           this.assignments.put(topologyId, assignment);
       } else {
           for (ExecutorDetails executor : executors) {
                if (assignment.isExecutorAssigned(executor)) {
                    throw new RuntimeException("the executor is already assigned, you should unassign it before assign it to another slot.");
                }
           }
       }

       assignment.assign(slot, executors);
   }

changed-executors函数定义如下:

(defn changed-executors [executor->node+port new-executor->node+port]
  ;; Groups the old and the new executor placement by slot with reverse-map,
  ;; e.g. {[node1 port1] [[1 2] [3 4]]}, diffs the two groupings with map-diff
  ;; and returns the executors of the resulting slots as one flat sequence,
  ;; e.g. ([1 2] [3 4] [5 6]).
  ;; NOTE(review): per the article map-diff keeps the slots of the new grouping
  ;; that are absent from the old one - confirm against map-diff.
  (let [old-layout (reverse-map executor->node+port)
        new-layout (reverse-map new-executor->node+port)]
    (->> (map-diff old-layout new-layout)
         vals
         (apply concat))))

 

posted @ 2015-04-02 11:43  不懂0604  阅读(2469)  评论(0编辑  收藏  举报