MesosSchedulerDriver的代码在src/sched/sched.cpp里面实现。
Driver->run()调用start()
首先检测Mesos-Master的leader
创建一个线程。
SchedulerProcess的initialize()函数里面主要注册各类消息的处理函数,并在最后开始检测Master:
// Registers the message handlers of SchedulerProcess and then starts
// master detection.
//
// Each install<M>(handler, accessors...) call binds message type M to a
// SchedulerProcess member function. The accessor arguments name fields
// of M; presumably those fields are what gets passed to the handler --
// confirm against the install() implementation in libprocess.
virtual
void initialize()
{
// Generic Event messages go to receive().
install<Event>(&SchedulerProcess::receive);
// TODO(benh): Get access to flags so that we can decide whether
// or not to make ZooKeeper verbose.
// Registration / re-registration acknowledgements.
install<FrameworkRegisteredMessage>(
&SchedulerProcess::registered,
&FrameworkRegisteredMessage::framework_id,
&FrameworkRegisteredMessage::master_info);
install<FrameworkReregisteredMessage>(
&SchedulerProcess::reregistered,
&FrameworkReregisteredMessage::framework_id,
&FrameworkReregisteredMessage::master_info);
// Resource offers and their rescission.
install<ResourceOffersMessage>(
&SchedulerProcess::resourceOffers,
&ResourceOffersMessage::offers,
&ResourceOffersMessage::pids);
install<RescindResourceOfferMessage>(
&SchedulerProcess::rescindOffer,
&RescindResourceOfferMessage::offer_id);
// Task status updates.
install<StatusUpdateMessage>(
&SchedulerProcess::statusUpdate,
&StatusUpdateMessage::update,
&StatusUpdateMessage::pid);
// Lost slaves and exited executors.
install<LostSlaveMessage>(
&SchedulerProcess::lostSlave,
&LostSlaveMessage::slave_id);
install<ExitedExecutorMessage>(
&SchedulerProcess::lostExecutor,
&ExitedExecutorMessage::executor_id,
&ExitedExecutorMessage::slave_id,
&ExitedExecutorMessage::status);
// Messages sent from an executor to the framework.
install<ExecutorToFrameworkMessage>(
&SchedulerProcess::frameworkMessage,
&ExecutorToFrameworkMessage::slave_id,
&ExecutorToFrameworkMessage::executor_id,
&ExecutorToFrameworkMessage::data);
// Framework-level errors.
install<FrameworkErrorMessage>(
&SchedulerProcess::error,
&FrameworkErrorMessage::message);
// Start detecting masters.
// The detection result is handed to SchedulerProcess::detected via
// defer(self(), ...).
detector->detect()
.onAny(defer(self(), &SchedulerProcess::detected, lambda::_1));
}
|
在前面的文章《Mesos源码分析(6): Mesos Master的初始化》中提到,
Allocator的initialize函数中,传入的OfferCallback是Master::offer。
每过allocation_interval,Allocator都会计算每个framework的offer,然后依次调用Master::offer,将资源offer给相应的framework。
在Master::offer函数中,会生成ResourceOffersMessage,并且发送给Framework。
对应到这里,当Driver收到ResourceOffersMessage消息的时候,会调用SchedulerProcess::resourceOffers:
// Handler registered for ResourceOffersMessage: logs the number of
// offers received and forwards them to the framework's
// Scheduler::resourceOffers callback.
//
// NOTE: this is an abridged excerpt; the "……" lines mark code that was
// left out (including whatever uses `from` and `pids`, and the
// declaration of `stopwatch`).
void resourceOffers(
const UPID& from,
const vector<Offer>& offers,
const vector<string>& pids)
{
……
VLOG(2) << "Received " << offers.size() << " offers";
……
// Hand the offers to the user-supplied scheduler.
scheduler->resourceOffers(driver, offers);
// Report how long the callback took, as measured by `stopwatch`.
VLOG(1) << "Scheduler::resourceOffers took " << stopwatch.elapsed();
}
|
最终调用了Framework的resourceOffers。
Test Framework的resourceOffers函数,根据得到的offers,创建一系列tasks,然后调用driver的launchTasks函数
// Test framework's offer callback: for each offer, packs as many tasks
// as fit into the offered resources (bounded by totalTasks overall) and
// launches them through the driver.
//
// driver: the driver used to launch the tasks.
// offers: the offers received in this round.
virtual
void resourceOffers(SchedulerDriver* driver,
const vector<Offer>& offers)
{
foreach (const Offer& offer, offers) {
cout << "Received offer " << offer.id() << " with " << offer.resources()
<< endl;
// Per-task resource requirement, built from CPUS_PER_TASK and
// MEM_PER_TASK. Function-local static, so it is parsed only once.
static
const Resources TASK_RESOURCES = Resources::parse(
"cpus:" + stringify(CPUS_PER_TASK) +
";mem:" + stringify(MEM_PER_TASK)).get();
// Resources of this offer that have not yet been assigned to a task.
Resources remaining = offer.resources();
// Launch tasks.
vector<TaskInfo> tasks;
// Keep creating tasks while the global task budget is not exhausted
// and the remainder of this offer still covers one more task.
while (tasksLaunched < totalTasks &&
remaining.flatten().contains(TASK_RESOURCES)) {
// The running launch counter doubles as the task id.
int taskId = tasksLaunched++;
cout << "Launching task " << taskId << " using offer "
<< offer.id() << endl;
TaskInfo task;
task.set_name("Task " + lexical_cast<string>(taskId));
task.mutable_task_id()->set_value(lexical_cast<string>(taskId));
task.mutable_slave_id()->MergeFrom(offer.slave_id());
task.mutable_executor()->MergeFrom(executor);
// Pick the concrete resources for this task out of `remaining`,
// looking them up under the framework's `role`.
Option<Resources> resources =
remaining.find(TASK_RESOURCES.flatten(role));
// The loop condition above checked that the remainder contains the
// task resources, so a result is expected here.
CHECK_SOME(resources);
task.mutable_resources()->MergeFrom(resources.get());
// Deduct what this task consumed before considering the next one.
remaining -= resources.get();
tasks.push_back(task);
}
// Called once per offer, even when `tasks` is empty.
driver->launchTasks(offer.id(), tasks);
}
}
|
SchedulerProcess的launchTasks函数实现如下:
// Launches `tasks` against the given offers by wrapping them in a single
// LAUNCH operation and delegating to acceptOffers().
//
// offerIds: the offers the tasks are launched on.
// tasks:    the tasks to launch; each one is copied into the operation.
// filters:  passed through unchanged to acceptOffers().
void launchTasks(const vector<OfferID>& offerIds,
const vector<TaskInfo>& tasks,
const Filters& filters)
{
Offer::Operation operation;
operation.set_type(Offer::Operation::LAUNCH);
Offer::Operation::Launch* launch = operation.mutable_launch();
foreach (const TaskInfo& task, tasks) {
launch->add_task_infos()->CopyFrom(task);
}
// A launch is expressed as an accept carrying exactly one operation.
acceptOffers(offerIds, {operation}, filters);
}
// Accepts the given offers by sending a Call of type ACCEPT, carrying
// `operations` and `filters`, to the current master.
//
// If the driver is not connected to a master, nothing is sent; instead a
// TASK_LOST status update is generated locally for every task contained
// in a LAUNCH operation.
//
// As a side effect, the slave PIDs needed by launched tasks are copied
// from savedOffers into savedSlavePids, and every offer ID in `offerIds`
// is removed from savedOffers.
//
// offerIds:   the offers being accepted.
// operations: the operations to apply to those offers.
// filters:    the filters attached to the accept call.
void acceptOffers(
const vector<OfferID>& offerIds,
const vector<Offer::Operation>& operations,
const Filters& filters)
{
// TODO(jieyu): Move all driver side verification to master since
// we are moving towards supporting pure language scheduler.
if (!connected) {
VLOG(1) << "Ignoring accept offers message as master is disconnected";
// NOTE: Reply to the framework with TASK_LOST messages for each
// task launch. See details from notes in launchTasks.
foreach (const Offer::Operation& operation, operations) {
// Only LAUNCH operations carry tasks that need a TASK_LOST update.
if (operation.type() != Offer::Operation::LAUNCH) {
continue;
}
foreach (const TaskInfo& task, operation.launch().task_infos()) {
StatusUpdate update = protobuf::createStatusUpdate(
framework.id(),
None(),
task.task_id(),
TASK_LOST,
TaskStatus::SOURCE_MASTER,
None(),
"Master disconnected",
TaskStatus::REASON_MASTER_DISCONNECTED);
// Feed the synthesized update through the regular statusUpdate()
// handler, with empty UPIDs as sender and pid.
statusUpdate(UPID(), update, UPID());
}
}
return;
}
Call call;
// The call must carry the framework ID, so it has to be set by now.
CHECK(framework.has_id());
call.mutable_framework_id()->CopyFrom(framework.id());
call.set_type(Call::ACCEPT);
Call::Accept* accept = call.mutable_accept();
// Setting accept.operations.
foreach (const Offer::Operation& _operation, operations) {
Offer::Operation* operation = accept->add_operations();
operation->CopyFrom(_operation);
}
// Setting accept.offer_ids.
foreach (const OfferID& offerId, offerIds) {
accept->add_offer_ids()->CopyFrom(offerId);
if (!savedOffers.contains(offerId)) {
// TODO(jieyu): A duplicated offer ID could also cause this
// warning being printed. Consider refine this message here
// and in launchTasks as well.
LOG(WARNING) << "Attempting to accept an unknown offer " << offerId;
} else {
// Keep only the slave PIDs where we run tasks so we can send
// framework messages directly.
foreach (const Offer::Operation& operation, operations) {
if (operation.type() != Offer::Operation::LAUNCH) {
continue;
}
foreach (const TaskInfo& task, operation.launch().task_infos()) {
const SlaveID& slaveId = task.slave_id();
// The task's slave must be one that this offer was made on.
if (savedOffers[offerId].contains(slaveId)) {
savedSlavePids[slaveId] = savedOffers[offerId][slaveId];
} else {
LOG(WARNING) << "Attempting to launch task " << task.task_id()
<< " with the wrong slave id " << slaveId;
}
}
}
}
// Remove the offer since we saved all the PIDs we might use.
// This runs for unknown offer IDs as well.
savedOffers.erase(offerId);
}
// Setting accept.filters.
accept->mutable_filters()->CopyFrom(filters);
// A master must be known at this point; the call is sent to its PID.
CHECK_SOME(master);
send(master.get().pid(), call);
}
|
最终向Mesos-Master的leader发送类型为ACCEPT的Call消息,其中携带了包含待启动tasks的LAUNCH操作。