Flink – JobManager.submitJob
The JobManager is an actor; a job submission arrives as a SubmitJob message:
case SubmitJob(jobGraph, listeningBehaviour) =>
  val client = sender()

  val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
    jobGraph.getSessionTimeout)

  submitJob(jobGraph, jobInfo)
submitJob does three things (a minimal outline sketch follows the list):
Build the ExecutionGraph from the JobGraph
Restore the checkpointed state (CheckpointedState), or a Savepoint
Hand the ExecutionGraph to the Scheduler for scheduling
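A minimal, hypothetical sketch of that control flow (all types and helpers here are simplified stand-ins for illustration, not Flink's actual code):

final class SubmitJobFlow {
    // stand-in types, for illustration only
    interface Scheduler {}
    static final class JobGraph {}

    static final class ExecutionGraph {
        void restoreState() { /* step 2: restore checkpoint/savepoint state */ }
        void scheduleForExecution(Scheduler s) { /* step 3: kick off scheduling */ }
    }

    // step 1: build (or reuse) the ExecutionGraph for this job
    ExecutionGraph buildGraph(JobGraph jobGraph) {
        return new ExecutionGraph();
    }

    void submitJob(JobGraph jobGraph, Scheduler scheduler) {
        ExecutionGraph executionGraph = buildGraph(jobGraph); // step 1
        executionGraph.restoreState();                        // step 2
        executionGraph.scheduleForExecution(scheduler);       // step 3
    }
}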
ExecutionGraph
executionGraph = ExecutionGraphBuilder.buildGraph(
    executionGraph,            // currentJobs.get(jobGraph.getJobID): the existing ExecutionGraph for this job id, if any
    jobGraph,
    flinkConfiguration,        // the Flink configuration
    futureExecutor,            // Executors.newFixedThreadPool(numberProcessors, new NamedThreadFactory("jobmanager-future-", "-thread-")), a thread pool sized by the number of CPU cores
    ioExecutor,                // Executors.newFixedThreadPool(numberProcessors, new NamedThreadFactory("jobmanager-io-", "-thread-"))
    userCodeLoader,            // libraryCacheManager.getClassLoader(jobGraph.getJobID), loads classes from the job's jars
    checkpointRecoveryFactory, // used for createCheckpointStore and createCheckpointIDCounter; comes in standalone and ZooKeeper flavors
    Time.of(timeout.length, timeout.unit),
    restartStrategy,           // the job restart strategy
    jobMetrics,
    numSlots,                  // scheduler.getTotalNumberOfSlots(), the total number of slots across all instances registered with this JobManager
    log.logger)
ExecutionGraphBuilder.buildGraph
Create a new ExecutionGraph if none exists yet:
// create a new execution graph, if none exists so far
final ExecutionGraph executionGraph;

try {
    executionGraph = (prior != null) ? prior :
        new ExecutionGraph(
            futureExecutor,
            ioExecutor,
            jobId,
            jobName,
            jobGraph.getJobConfiguration(),
            jobGraph.getSerializedExecutionConfig(),
            timeout,
            restartStrategy,
            jobGraph.getUserJarBlobKeys(),
            jobGraph.getClasspaths(),
            classLoader,
            metrics);
}
catch (IOException e) {
    throw new JobException("Could not create the execution graph.", e);
}
attachJobGraph generates the graph's nodes and edges:
// topologically sort the job vertices and attach the graph to the existing one
List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
executionGraph.attachJobGraph(sortedTopology);
ExecutionGraph.attachJobGraph
for (JobVertex jobVertex : topologiallySorted) {

    // create the execution job vertex and attach it to the graph
    ExecutionJobVertex ejv = new ExecutionJobVertex(this, jobVertex, 1, timeout, createTimestamp);
    ejv.connectToPredecessors(this.intermediateResults);

    // All job vertices that are part of this graph
    // ConcurrentHashMap<JobVertexID, ExecutionJobVertex> tasks
    ExecutionJobVertex previousTask = this.tasks.putIfAbsent(jobVertex.getID(), ejv);

    for (IntermediateResult res : ejv.getProducedDataSets()) {
        // All intermediate results that are part of this graph
        // ConcurrentHashMap<IntermediateDataSetID, IntermediateResult> intermediateResults
        IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res);
    }

    // All vertices, in the order in which they were created
    // List<ExecutionJobVertex> verticesInCreationOrder
    this.verticesInCreationOrder.add(ejv);
}
Each JobVertex is wrapped into an ExecutionJobVertex.
This creates, in order, the ExecutionJobVertex, its ExecutionVertex instances and their Executions, as well as the IntermediateResults and IntermediateResultPartitions; the fan-out is illustrated below.
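As a back-of-the-envelope illustration of that fan-out (plain Java, not Flink code): for one JobVertex with parallelism p there is one ExecutionJobVertex, p ExecutionVertex instances each holding one current Execution, and, per produced IntermediateResult, one IntermediateResultPartition per subtask:

public class GraphFanOut {
    public static void main(String[] args) {
        int parallelism = 4;      // jobVertex.getParallelism()
        int producedDataSets = 2; // jobVertex.getNumberOfProducedIntermediateDataSets()

        int executionJobVertices = 1;                    // one per JobVertex
        int executionVertices = parallelism;             // taskVertices.length
        int executions = executionVertices;              // one currentExecution per ExecutionVertex
        int intermediateResults = producedDataSets;      // one per IntermediateDataSet
        int partitions = producedDataSets * parallelism; // one partition per subtask per result

        System.out.printf("EJV=%d, EV=%d, Execution=%d, IR=%d, IRP=%d%n",
                executionJobVertices, executionVertices, executions,
                intermediateResults, partitions);
    }
}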
ExecutionJobVertex
public ExecutionJobVertex(
        ExecutionGraph graph,
        JobVertex jobVertex,
        int defaultParallelism,
        Time timeout,
        long createTimestamp) throws JobException {

    if (graph == null || jobVertex == null) {
        throw new NullPointerException();
    }

    // the parallelism decides how many ExecutionVertex instances there are
    int vertexParallelism = jobVertex.getParallelism();
    int numTaskVertices = vertexParallelism > 0 ? vertexParallelism : defaultParallelism;

    // the ExecutionVertex instances are created below
    this.taskVertices = new ExecutionVertex[numTaskVertices];

    this.inputs = new ArrayList<>(jobVertex.getInputs().size());

    // take the sharing group
    this.slotSharingGroup = jobVertex.getSlotSharingGroup();
    this.coLocationGroup = jobVertex.getCoLocationGroup();

    // create the intermediate results
    this.producedDataSets = new IntermediateResult[jobVertex.getNumberOfProducedIntermediateDataSets()];

    // create the IntermediateResults that hold the intermediate data
    for (int i = 0; i < jobVertex.getProducedDataSets().size(); i++) {
        final IntermediateDataSet result = jobVertex.getProducedDataSets().get(i);

        // wrap the JobGraph's IntermediateDataSet into an IntermediateResult
        this.producedDataSets[i] = new IntermediateResult(
            result.getId(),
            this,
            numTaskVertices,
            result.getResultType());
    }

    // create all task vertices
    for (int i = 0; i < numTaskVertices; i++) {
        // initialize the ExecutionVertex
        ExecutionVertex vertex = new ExecutionVertex(
            this, i, this.producedDataSets, timeout, createTimestamp, maxPriorAttemptsHistoryLength);

        this.taskVertices[i] = vertex;
    }

    finishedSubtasks = new boolean[parallelism];
}
ExecutionVertex
public ExecutionVertex(
        ExecutionJobVertex jobVertex,
        int subTaskIndex,               // the subtask index; one task per ExecutionVertex
        IntermediateResult[] producedDataSets,
        Time timeout,
        long createTimestamp,
        int maxPriorExecutionHistoryLength) {

    this.jobVertex = jobVertex;
    this.subTaskIndex = subTaskIndex;
    this.taskNameWithSubtask = String.format("%s (%d/%d)",
            jobVertex.getJobVertex().getName(), subTaskIndex + 1, jobVertex.getParallelism());

    // records the IntermediateResultPartitions produced by this subtask
    this.resultPartitions = new LinkedHashMap<IntermediateResultPartitionID, IntermediateResultPartition>(producedDataSets.length, 1);

    for (IntermediateResult result : producedDataSets) {
        // initialize the IntermediateResultPartition
        IntermediateResultPartition irp = new IntermediateResultPartition(result, this, subTaskIndex);
        result.setPartition(subTaskIndex, irp);

        resultPartitions.put(irp.getPartitionId(), irp);
    }

    this.inputEdges = new ExecutionEdge[jobVertex.getJobVertex().getInputs().size()][];

    this.priorExecutions = new EvictingBoundedList<>(maxPriorExecutionHistoryLength);

    // create the Execution (the current execution attempt)
    this.currentExecution = new Execution(
        getExecutionGraph().getFutureExecutor(),
        this,
        0,
        createTimestamp,
        timeout);

    this.timeout = timeout;
}
connectToPredecessors wires the vertices together with edges:
public void connectToPredecessors(Map<IntermediateDataSetID, IntermediateResult> intermediateDataSets) throws JobException {

    List<JobEdge> inputs = jobVertex.getInputs(); // the inputs of the JobVertex

    for (int num = 0; num < inputs.size(); num++) {
        JobEdge edge = inputs.get(num); // the corresponding JobEdge

        // look up the JobEdge's source IntermediateResult
        IntermediateResult ires = intermediateDataSets.get(edge.getSourceId());

        this.inputs.add(ires); // List<IntermediateResult> inputs;

        // register this vertex as a consumer with the IntermediateResult,
        // i.e. with each of its IntermediateResultPartitions
        int consumerIndex = ires.registerConsumer();

        for (int i = 0; i < parallelism; i++) {
            ExecutionVertex ev = taskVertices[i];

            // create the ExecutionEdges from each ExecutionVertex
            // to the concrete IntermediateResultPartitions
            ev.connectSource(num, ires, edge, consumerIndex);
        }
    }
}
connectSource
public void connectSource(int inputNumber, IntermediateResult source, JobEdge edge, int consumerNumber) {

    final DistributionPattern pattern = edge.getDistributionPattern(); // the edge's distribution pattern
    final IntermediateResultPartition[] sourcePartitions = source.getPartitions(); // the source's partitions

    ExecutionEdge[] edges;

    switch (pattern) {
        case POINTWISE:
            edges = connectPointwise(sourcePartitions, inputNumber);
            break;

        case ALL_TO_ALL:
            edges = connectAllToAll(sourcePartitions, inputNumber);
            break;

        default:
            throw new RuntimeException("Unrecognized distribution pattern.");
    }

    this.inputEdges[inputNumber] = edges;

    // add the consumers to the source
    // for now (until the receiver initiated handshake is in place), we need to register the
    // edges as the execution graph
    for (ExecutionEdge ee : edges) {
        ee.getSource().addConsumer(ee, consumerNumber);
    }
}
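The pattern decides how many ExecutionEdges each consumer subtask ends up with: ALL_TO_ALL connects every subtask to every source partition, while POINTWISE connects each subtask to only a subset. A rough count, assuming numSources <= parallelism for the POINTWISE case (illustration only, not Flink code):

public class EdgeCounts {
    public static void main(String[] args) {
        int numSources = 2;   // source partitions
        int parallelism = 6;  // consumer subtasks

        int allToAllEdges = parallelism * numSources; // every subtask reads every partition
        int pointwiseEdges = parallelism;             // each subtask reads exactly one partition

        System.out.println("ALL_TO_ALL: " + allToAllEdges + " ExecutionEdges");
        System.out.println("POINTWISE:  " + pointwiseEdges + " ExecutionEdges");
    }
}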
Let's look at connectPointwise; a worked example of the index arithmetic follows the snippet:
private ExecutionEdge[] connectPointwise(IntermediateResultPartition[] sourcePartitions, int inputNumber) {
    final int numSources = sourcePartitions.length;             // the number of source partitions
    final int parallelism = getTotalNumberOfParallelSubtasks(); // the parallelism of the consuming subtasks

    // simple case same number of sources as targets
    if (numSources == parallelism) {
        // 1:1 — take the source partition with the same index as this subtask
        return new ExecutionEdge[] { new ExecutionEdge(sourcePartitions[subTaskIndex], this, inputNumber) };
    }
    else if (numSources < parallelism) {
        // more consumer subtasks than sources: one source feeds several tasks
        int sourcePartition;

        // check if the pattern is regular or irregular
        // we use int arithmetics for regular, and floating point with rounding for irregular
        if (parallelism % numSources == 0) {
            // evenly divisible, e.g. with 2 sources and 6 tasks, the 3rd task maps to the first source
            // same number of targets per source
            int factor = parallelism / numSources;
            sourcePartition = subTaskIndex / factor;
        }
        else {
            // different number of targets per source
            float factor = ((float) parallelism) / numSources;
            sourcePartition = (int) (subTaskIndex / factor);
        }

        return new ExecutionEdge[] { new ExecutionEdge(sourcePartitions[sourcePartition], this, inputNumber) };
    }
    else {
        //......
    }
}
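To make the index arithmetic concrete, here is a standalone re-implementation of the mapping above (illustration only, not the Flink class): with 2 sources and 6 consumer subtasks the factor is 3, so subtasks 0-2 read partition 0 and subtasks 3-5 read partition 1.

public class PointwiseMapping {
    // which source partition does the given consumer subtask read from?
    static int sourcePartition(int numSources, int parallelism, int subTaskIndex) {
        if (numSources == parallelism) {
            return subTaskIndex; // 1:1
        }
        if (numSources < parallelism) {
            if (parallelism % numSources == 0) {
                int factor = parallelism / numSources; // same number of targets per source
                return subTaskIndex / factor;
            }
            float factor = ((float) parallelism) / numSources;
            return (int) (subTaskIndex / factor);
        }
        throw new UnsupportedOperationException("numSources > parallelism is not shown here");
    }

    public static void main(String[] args) {
        for (int i = 0; i < 6; i++) {
            System.out.println("subtask " + i + " <- partition " + sourcePartition(2, 6, i));
        }
    }
}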
Configuring checkpoints
executionGraph.enableSnapshotCheckpointing(
snapshotSettings.getCheckpointInterval(),
snapshotSettings.getCheckpointTimeout(),
snapshotSettings.getMinPauseBetweenCheckpoints(),
snapshotSettings.getMaxConcurrentCheckpoints(),
snapshotSettings.getExternalizedCheckpointSettings(),
triggerVertices,
ackVertices,
confirmVertices,
checkpointIdCounter,
completedCheckpoints,
externalizedCheckpointsDir,
checkpointStatsTracker);
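These snapshot settings originate from the user program. A sketch of where they are set, using the standard StreamExecutionEnvironment checkpoint API (the values are arbitrary examples):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CheckpointSettingsExample {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.enableCheckpointing(60_000);                                 // checkpoint interval (ms)
        env.getCheckpointConfig().setCheckpointTimeout(600_000);         // checkpoint timeout (ms)
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30_000); // min pause between checkpoints (ms)
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);        // max concurrent checkpoints
    }
}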
enableSnapshotCheckpointing starts the CheckpointCoordinator; see the dedicated blog post on the Checkpoint mechanism.
Scheduler
Now let's look at how the generated ExecutionGraph is scheduled.
future { // asynchronous
  try {
    // store the job in submittedJobGraphs
    submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo))
  } catch {
    // ...
  }

  // notify the client that the submission succeeded
  jobInfo.notifyClients(
    decorateMessage(JobSubmitSuccess(jobGraph.getJobID)))

  if (leaderElectionService.hasLeadership) {
    try {
      executionGraph.scheduleForExecution(scheduler) // schedule
    } catch {
      // ...
    }
  }
}(context.dispatcher)
executionGraph.scheduleForExecution
public void scheduleForExecution(SlotProvider slotProvider) throws JobException {
    switch (scheduleMode) {

        case LAZY_FROM_SOURCES:
            // simply take the vertices without inputs.
            // ConcurrentHashMap<JobVertexID, ExecutionJobVertex> tasks; "tasks" is an unfortunate name for this field
            for (ExecutionJobVertex ejv : this.tasks.values()) {
                if (ejv.getJobVertex().isInputVertex()) {
                    ejv.scheduleAll(slotProvider, allowQueuedScheduling);
                }
            }
            break;

        case EAGER:
            for (ExecutionJobVertex ejv : getVerticesTopologically()) {
                ejv.scheduleAll(slotProvider, allowQueuedScheduling);
            }
            break;

        default:
            throw new JobException("Schedule mode is invalid.");
    }
}
For streaming jobs the schedule mode defaults to EAGER:
public JobGraph createJobGraph() {

    jobGraph = new JobGraph(streamGraph.getJobName());

    // make sure that all vertices start immediately
    jobGraph.setScheduleMode(ScheduleMode.EAGER);
ExecutionJobVertex.scheduleAll
public void scheduleAll(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {

    ExecutionVertex[] vertices = this.taskVertices;

    // kick off the tasks
    for (ExecutionVertex ev : vertices) {
        ev.scheduleForExecution(slotProvider, queued);
    }
}
ExecutionVertex.scheduleForExecution
// the current or latest execution attempt of this vertex's task
public boolean scheduleForExecution(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {
    return this.currentExecution.scheduleForExecution(slotProvider, queued);
}
Execution.scheduleForExecution
public boolean scheduleForExecution(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {
    final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
    final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();

    if (transitionState(CREATED, SCHEDULED)) {

        // create the ScheduledUnit
        ScheduledUnit toSchedule = locationConstraint == null ?
            new ScheduledUnit(this, sharingGroup) :
            new ScheduledUnit(this, sharingGroup, locationConstraint);

        // ask the slotProvider for a slot
        final Future<SimpleSlot> slotAllocationFuture = slotProvider.allocateSlot(toSchedule, queued);

        final Future<Void> deploymentFuture = slotAllocationFuture.handle(new BiFunction<SimpleSlot, Throwable, Void>() {
            @Override
            public Void apply(SimpleSlot simpleSlot, Throwable throwable) {
                if (simpleSlot != null) {
                    // the slot was allocated successfully
                    try {
                        deployToSlot(simpleSlot); // deploy
                    } catch (Throwable t) {
                        try {
                            simpleSlot.releaseSlot();
                        } finally {
                            markFailed(t);
                        }
                    }
                }
                else {
                    markFailed(throwable);
                }
                return null;
            }
        });
    }
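transitionState is what keeps concurrent scheduling attempts from both proceeding: only the caller whose compare-and-swap succeeds continues. A minimal standalone sketch of that pattern (Flink uses an atomic field updater internally; AtomicReference is used here for simplicity):

import java.util.concurrent.atomic.AtomicReference;

class ExecutionStateMachine {
    enum ExecutionState { CREATED, SCHEDULED, DEPLOYING, RUNNING, FAILED }

    private final AtomicReference<ExecutionState> state =
            new AtomicReference<>(ExecutionState.CREATED);

    // returns true only for the single caller that wins the transition
    boolean transitionState(ExecutionState current, ExecutionState target) {
        return state.compareAndSet(current, target);
    }
}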
For the slotProvider, see the post Flink - Scheduler.
The core of deployToSlot is sending a submitTask request to the TaskManager:
public void deployToSlot(final SimpleSlot slot) throws JobException {

    ExecutionState previous = this.state;
    if (previous == SCHEDULED || previous == CREATED) {
        // transition the state to DEPLOYING
        if (!transitionState(previous, DEPLOYING)) {
            throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
        }
    }

    try {
        // good, we are allowed to deploy
        // associate the slot with this ExecutionVertex
        if (!slot.setExecutedVertex(this)) {
            throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
        }
        this.assignedResource = slot;

        // create the TaskDeploymentDescriptor
        final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor(
            attemptId,
            slot,
            taskState,
            attemptNumber);

        // register this execution at the execution graph, to receive call backs
        vertex.getExecutionGraph().registerExecution(this);

        final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();

        // send the submitTask request to the TaskManager's actor
        final Future<Acknowledge> submitResultFuture = taskManagerGateway.submitTask(deployment, timeout);

        submitResultFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {......}
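The callback body is elided above ({......}); its role is to fail the execution if the submission does not reach the TaskManager. A hypothetical sketch of that pattern, using java.util.concurrent.CompletableFuture as a stand-in for Flink's Future API (submitTask and markFailed below are simplified stand-ins):

import java.util.concurrent.CompletableFuture;

public class SubmitTaskHandling {
    static class Acknowledge {}

    // stand-in for taskManagerGateway.submitTask(deployment, timeout)
    static CompletableFuture<Acknowledge> submitTask() {
        return CompletableFuture.supplyAsync(Acknowledge::new);
    }

    // stand-in for Execution.markFailed(t), which triggers failure handling
    static void markFailed(Throwable t) {
        System.err.println("task submission failed: " + t);
    }

    public static void main(String[] args) {
        submitTask().exceptionally(failure -> {
            markFailed(failure); // submission failed: mark the execution as failed
            return null;
        }).join();
    }
}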