void Slave::_runTask(
const Future<bool>& future,
const FrameworkInfo& frameworkInfo,
const TaskInfo& task)
{
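// NOTE: This is the continuation of 'runTask': it runs once the
// attempt to unschedule the task's directories from gc has
// completed. The result of that attempt arrives in 'future' and
// is checked below.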
const FrameworkID frameworkId = frameworkInfo.id();
LOG(INFO) << "Launching task " << task.task_id()
<< " for framework " << frameworkId;
Framework* framework = getFramework(frameworkId);
if (framework == NULL) {
LOG(WARNING) << "Ignoring run task " << task.task_id()
<< " because the framework " << frameworkId
<< " does not exist";
return;
}
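// NOTE: 'executorInfo' below is either the executor explicitly
// specified by the task or, for command-based tasks, an executor
// generated by the slave (an assumption about how
// 'getExecutorInfo' behaves).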
const ExecutorInfo executorInfo = getExecutorInfo(frameworkInfo, task);
const ExecutorID& executorId = executorInfo.executor_id();
if (framework->pending.contains(executorId) &&
framework->pending[executorId].contains(task.task_id())) {
framework->pending[executorId].erase(task.task_id());
if (framework->pending[executorId].empty()) {
framework->pending.erase(executorId);
// NOTE: Ideally we would perform the following check here:
//
// if (framework->executors.empty() &&
// framework->pending.empty()) {
// removeFramework(framework);
// }
//
// However, we need 'framework' to stay valid for the rest of
// this function. As such, we perform the check before each of
// the 'return' statements below.
}
} else {
LOG(WARNING) << "Ignoring run task " << task.task_id()
<< " of framework " << frameworkId
<< " because the task has been killed in the meantime";
return;
}
// We don't send a status update here because a terminating
// framework cannot send acknowledgements.
if (framework->state == Framework::TERMINATING) {
LOG(WARNING) << "Ignoring run task " << task.task_id()
<< " of framework " << frameworkId
<< " because the framework is terminating";
// Refer to the comment after 'framework->pending.erase' above
// for why we need this.
if (framework->executors.empty() && framework->pending.empty()) {
removeFramework(framework);
}
return;
}
if (!future.isReady()) {
LOG(ERROR) << "Failed to unschedule directories scheduled for gc: "
<< (future.isFailed() ? future.failure() : "future discarded");
const StatusUpdate update = protobuf::createStatusUpdate(
frameworkId,
info.id(),
task.task_id(),
TASK_LOST,
TaskStatus::SOURCE_SLAVE,
UUID::random(),
"Could not launch the task because we failed to unschedule directories"
" scheduled for gc",
TaskStatus::REASON_GC_ERROR);
// TODO(vinod): Ensure that the status update manager reliably
// delivers this update. Currently, we don't guarantee this
// because removal of the framework causes the status update
// manager to stop retrying for its un-acked updates.
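// Passing a default-constructed UPID as the sender indicates
// that this update originates from the slave itself rather than
// from an executor (an assumption about 'statusUpdate').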
statusUpdate(update, UPID());
// Refer to the comment after 'framework->pending.erase' above
// for why we need this.
if (framework->executors.empty() && framework->pending.empty()) {
removeFramework(framework);
}
return;
}
// NOTE: If the task or executor uses resources that are
// checkpointed on the slave (e.g. persistent volumes), we should
// already know about them. If the slave doesn't know about them
// (e.g. the CheckpointResourcesMessage was dropped or arrived out
// of order), we send TASK_LOST status updates here: the late
// CheckpointResourcesMessage may still arrive, in which case
// relaunching the task could succeed.
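// First check the resources of the task itself; the resources of
// the executor (if the task specifies one) are checked the same
// way below.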
Resources checkpointedTaskResources =
Resources(task.resources()).filter(needCheckpointing);
foreach (const Resource& resource, checkpointedTaskResources) {
if (!checkpointedResources.contains(resource)) {
LOG(WARNING) << "Unknown checkpointed resource " << resource
<< " for task " << task.task_id()
<< " of framework " << frameworkId;
const StatusUpdate update = protobuf::createStatusUpdate(
frameworkId,
info.id(),
task.task_id(),
TASK_LOST,
TaskStatus::SOURCE_SLAVE,
UUID::random(),
"The checkpointed resources being used by the task are unknown to "
"the slave",
TaskStatus::REASON_RESOURCES_UNKNOWN);
statusUpdate(update, UPID());
// Refer to the comment after 'framework->pending.erase' above
// for why we need this.
if (framework->executors.empty() && framework->pending.empty()) {
removeFramework(framework);
}
return;
}
}
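// Perform the same check for the executor's resources, if the
// task explicitly specifies an executor.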
if (task.has_executor()) {
Resources checkpointedExecutorResources =
Resources(task.executor().resources()).filter(needCheckpointing);
foreach (const Resource& resource, checkpointedExecutorResources) {
if (!checkpointedResources.contains(resource)) {
LOG(WARNING) << "Unknown checkpointed resource " << resource
<< " for executor '" << task.executor().executor_id()
<< "' of framework " << frameworkId;
const StatusUpdate update = protobuf::createStatusUpdate(
frameworkId,
info.id(),
task.task_id(),
TASK_LOST,
TaskStatus::SOURCE_SLAVE,
UUID::random(),
"The checkpointed resources being used by the executor are unknown "
"to the slave",
TaskStatus::REASON_RESOURCES_UNKNOWN,
task.executor().executor_id());
statusUpdate(update, UPID());
// Refer to the comment after 'framework->pending.erase' above
// for why we need this.
if (framework->executors.empty() && framework->pending.empty()) {
removeFramework(framework);
}
return;
}
}
}
// NOTE: The slave cannot be in 'RECOVERING' because the task would
// have been rejected in 'runTask()' in that case.
CHECK(state == DISCONNECTED || state == RUNNING || state == TERMINATING)
<< state;
if (state == TERMINATING) {
LOG(WARNING) << "Ignoring run task " << task.task_id()
<< " of framework " << frameworkId
<< " because the slave is terminating";
// Refer to the comment after 'framework->pending.erase' above
// for why we need this.
if (framework->executors.empty() && framework->pending.empty()) {
removeFramework(framework);
}
// We don't send a TASK_LOST here because the slave is
// terminating.
return;
}
CHECK(framework->state == Framework::RUNNING) << framework->state;
// Either deliver the task to an existing executor (once the
// containerizer has been updated with the new resource limits)
// or start a new executor and queue the task until the executor
// has registered.
Executor* executor = framework->getExecutor(executorId);
if (executor == NULL) {
executor = framework->launchExecutor(executorInfo, task);
}
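// NOTE: 'launchExecutor' is assumed to create the executor in
// REGISTERING state and to asynchronously launch its container;
// the task is queued below until the executor registers.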
CHECK_NOTNULL(executor);
switch (executor->state) {
case Executor::TERMINATING:
case Executor::TERMINATED: {
LOG(WARNING) << "Asked to run task '" << task.task_id()
<< "' for framework " << frameworkId
<< " with executor '" << executorId
<< "' which is terminating/terminated";
const StatusUpdate update = protobuf::createStatusUpdate(
frameworkId,
info.id(),
task.task_id(),
TASK_LOST,
TaskStatus::SOURCE_SLAVE,
UUID::random(),
"Executor terminating/terminated",
TaskStatus::REASON_EXECUTOR_TERMINATED);
statusUpdate(update, UPID());
break;
}
case Executor::REGISTERING:
// Checkpoint the task before we do anything else.
if (executor->checkpoint) {
executor->checkpointTask(task);
}
// Queue task if the executor has not yet registered.
LOG(INFO) << "Queuing task '" << task.task_id()
<< "' for executor " << *executor;
executor->queuedTasks[task.task_id()] = task;
break;
case Executor::RUNNING: {
// Checkpoint the task before we do anything else.
if (executor->checkpoint) {
executor->checkpointTask(task);
}
// Queue task until the containerizer is updated with new
// resource limits (MESOS-998).
LOG(INFO) << "Queuing task '" << task.task_id()
<< "' for executor " << *executor;
executor->queuedTasks[task.task_id()] = task;
// Update the resource limits for the container. Note that the
// resource limits include the currently queued tasks because we
// want the container to have enough resources to hold the
// upcoming tasks.
Resources resources = executor->resources;
// TODO(jieyu): Use foreachvalue instead once LinkedHashmap
// supports it.
foreach (const TaskInfo& queuedTask, executor->queuedTasks.values()) {
resources += queuedTask.resources();
}
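// When the update completes, 'runTasks' is expected to deliver
// the queued task to the executor (or send TASK_LOST if the
// update failed); 'lambda::_1' binds the future returned by
// 'update'.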
containerizer->update(executor->containerId, resources)
.onAny(defer(self(),
&Self::runTasks,
lambda::_1,
frameworkId,
executorId,
executor->containerId,
list<TaskInfo>({task})));
break;
}
default:
LOG(FATAL) << "Executor " << *executor << " is in unexpected state "
<< executor->state;
break;
}
// We don't perform the checks for 'removeFramework' here since
// we're guaranteed by 'launchExecutor' that 'framework->executors'
// will be non-empty.
CHECK(!framework->executors.empty());
}