
MXNet源码分析 | KVStore进程内通信



KVStoreLocal继承自KVStore,负责进程内通信。它主要维护了以下变量:负责不同设备间通信的comm_,机器上的页锁定内存(不进行页交换,一直在物理内存中),本机的key-val buffer,字符串key到整型key的映射以及整型key到字符串key的映射。在其构造函数中,它会根据传入的设备类型(CPU、GPU等)创建不同的通信对象。



```mermaid
classDiagram
    Comm <|.. CommCPU
    Comm <|.. CommDevice
    CommDevice <|-- CommDeviceTree
    Comm <.. KVStoreLocal
    KVStore <|.. KVStoreLocal
    KVStoreLocal <|-- KVStoreDist
    KVStoreLocal <|-- KVStoreNCCL
    KVStoreDist ..> KVStoreDistServer
    class Comm {
        <<abstract>>
        +Init() void
        +Reduce() const NDArray&
        +Broadcast() void
        +BroadcastRowSparse() void
        #Context pinned_ctx_
    }
    class CommCPU {
        -ReduceSumCPU() void
        -ReduceSumCPUExSerial() void
        -unordered_map<int, BufferEntry> merge_buf_
        -size_t bigarray_bound_
    }
    class CommDevice {
        +InitBuffersAndComm() void
        +ReduceCompressed() const NDArray&
        +InitMergeBuffer() void
        -EnableP2P() void
        -unordered_map<int, BufferEntry> merge_buf_
    }
    class CommDeviceTree {
        +ReduceInner() const NDArray&
        +BroadcastInner() void
        -int depth_
        -int gpuarray_bound_
    }
    class KVStore {
        <<abstract>>
        +Create() static KVStore*
        +Push() void
        +Pull() void
        +Barrier() void
        +RunServer() void
        #Updater updater_
        #StrUpdater str_updater_
    }
    class KVStoreLocal {
        +Init() void
        -InitImpl() void
        -PushImpl() void
        -PullImpl() void
        -PushPullImpl() void
        #GroupKVPairsPush() void
        #GroupKVPairsPull() void
        #GroupKVPairs() void
        #LookupKeys() void
        #Comm* comm_
        #unordered_map<int, NDArray> local_
        #unordered_map<string, int> str_key_dict_
        #unordered_map<int, string> reverse_str_key_dict_
    }
    class KVStoreDist {
        -EncodeDefaultKey() void
        -EncodeCompressedKey() void
        -EncodeRowSparseKey() void
        -unordered_map<int, PSKV> ps_kv_
        -KVWorker<char>* ps_worker_
        -KVStoreDistServer* server_
        -size_t bigarray_bound_
        -unordered_map<int, NDArray> comm_buf_
        -unordered_map<int, NDArray> compr_buf_
        -unordered_map<int, NDArray> residual_
    }
    class KVStoreDistServer {
        -ApplyUpdates() void
        -DefaultStorageResponse() void
        -DataHandleDefault() void
        -unordered_map<int, NDArray> store_
        -unordered_map<int, NDArray> update_buf_
        -unordered_map<int, NDArray> decom_buf_
        -KVServer<char>* ps_server_
    }
```



// Factory that builds the KVStore implementation selected by `type_name`.
//
// The name is lower-cased and then matched by substring:
//   - contains "device": the store is constructed with use_device_comm = true;
//   - contains "dist":   a KVStoreDist is created; unless the name also
//                        contains "_async", the rank-0 worker tells the
//                        servers to run in sync mode;
//   - contains "nccl":   (only when "dist" is absent) a KVStoreNCCL is created;
//   - anything else:     a KVStoreLocal is created.
//
// `type_name` must not be null (it is used to construct a std::string).
// Returns the store allocated with `new`, as a raw pointer.
KVStore* KVStore::Create(const char *type_name) {
  std::string tname = type_name;
  std::transform(tname.begin(), tname.end(), tname.begin(), ::tolower);
  KVStore* kv = nullptr;
  bool use_device_comm = false;
  // True when the lower-cased type name contains `pattern`.
  auto has = [tname](const std::string& pattern) {
    return tname.find(pattern) != std::string::npos;
  };
  if (has("device")) {
    use_device_comm = true;
  }

  if (has("dist")) {
    kv = new kvstore::KVStoreDist(use_device_comm);
    if (!has("_async") && kv->IsWorkerNode() && kv->get_rank() == 0) {
      // configure the server to be the sync mode
      kv->SendCommandToServers(static_cast<int>(kvstore::CommandType::kSyncMode), "");
    }
  } else {
    if (has("nccl")) {
      kv = new kvstore::KVStoreNCCL();
    } else {
      kv =  new kvstore::KVStoreLocal(use_device_comm);
    }
  }
  kv->type_ = tname;
  return kv;
}



// Constructs the in-process store and picks its communication backend.
//
// With use_device_comm == true the backend is CommDeviceTree when the
// environment variable MXNET_KVSTORE_USETREE is non-zero (default 0) and
// CommDevice otherwise; with use_device_comm == false it is CommCPU.
//
// Note: the constructor is `explicit` in the class declaration; that keyword
// cannot be repeated on a qualified, out-of-class definition like this one.
KVStoreLocal::KVStoreLocal(bool use_device_comm) : KVStore() {
  if (use_device_comm) {
    bool tree = dmlc::GetEnv("MXNET_KVSTORE_USETREE", 0);
    if (tree) {
      comm_ = new CommDeviceTree();
    } else {
      comm_ = new CommDevice();
    }
  } else {
    comm_ = new CommCPU();
  }
}



virtual void 
KVStoreLocal::PushImpl(const std::vector<int>& keys, 
                       const std::vector<NDArray>& values, 
                       int priority) {
  std::vector<int> uniq_keys;
  std::vector<std::vector<NDArray>> grouped_val;
  GroupKVPairsPush(keys, values, &uniq_keys, &grouped_val, false);
  for (size_t i = 0; i <uniq_keys.size(); ++i) {
    int key = uniq_keys[i];
    const NDArray& merged = comm_->Reduce(key, grouped_val[i], priority);
    NDArray& local = local_[key];
    if (updater_ != nullptr) {
      if (kye_type_ == kStringKye && str_updater_ != nullptr) {
        str_updater_(str_key, merged, &local);
      } else {
        updater_(key, merged, &local);
    } else {
      if (merged.storage_type() != local.storage_type()) {
        local = merged.Copy(local.ctx());
      } else {
        local = merged;



CommCPU::Reduce将输入vector<NDArray>& src的每个元素求和并返回。当src只有一个元素时直接返回src[0];否则就按照下面的代码,把数据规约到变量reduce[0]中。可以看到,下面的代码调用PushAsync方法把一个lambda表达式压入到依赖引擎中,这个lambda表达式首先捕获了reduce变量,然后在函数体中调用ReduceSumCPU方法在CPU上执行Reduce操作,最终操作的结果会存放到reduce[0]中。

std::vector<Engine::VarHandle> const_vars(src.size() - 1);
std::vector<NDArray> reduce(src.size());
CopyFromTo(src[0], &buf_merged, priority);
reduce[0] = buf_merged;

if (buf.copy_buf.empty()) {
   for (size_t j = 0; j < src.size() - 1; ++j) {
     // allocate copy buffer
     buf.copy_buf[j] = NDArray(src[0].shape(), pinned_ctx_, false, src[0].dtype());
CHECK(stype == buf.copy_buf[0].storage_type()) 
  << "Storage type mismatch detected. " << stype << "(src) vs. "
  << buf.copy_buf[0].storage_type() << "(buf.copy_buf)";
 for (size_t i = 1; i < src.size(); ++i) {
   CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority);
   reduce[i] = buf.copy_buf[i-1];
   const_vars[i-1] = reduce[i].var();

  [reduce, this](RunContext rctx, Engine::CallbackOnComplete on_complete) {
  }, Context::CPU(), const_vars, {reduce[0].var()},
  FnProperty::kCPUPrioritized, priority, "KVStoreReduce");


inline void ReduceSumCPU(const std::vector<NDArray> &in_data) {
    MSHADOW_TYPE_SWITCH(in_data[0].dtype(), DType, {
      std::vector<DType*> dptr(in_data.size());
      for (size_t i = 0; i < in_data.size(); ++i) {
        TBlob data = in_data[i].data();
        dptr[i] = data.FlatTo2D<cpu, DType>().dptr_;
      size_t total = in_data[0].shape().Size();
      ReduceSumCPUImpl(dptr, total);


// Sums `total` elements of dptr[1..] into dptr[0].
//
// When the array is smaller than bigarray_bound_ or at most one reduction
// thread is configured, the whole range is reduced in a single call.
// Otherwise the range is cut into chunks of `step` elements
// (min(bigarray_bound_, 4096)) that are reduced in parallel with OpenMP.
template<typename DType>
inline void ReduceSumCPUImpl(std::vector<DType*> dptr, size_t total) {
  const size_t step = std::min(bigarray_bound_, static_cast<size_t>(4 << 10));
  long ntask = (total + step - 1) / step; // NOLINT(*)
  if (total < bigarray_bound_ || nthread_reduction_ <= 1) {
    ReduceSumCPU(dptr, 0, total);
  } else {
    #pragma omp parallel for schedule(static) num_threads(nthread_reduction_)
    for (long j = 0; j < ntask; ++j) { // NOLINT(*)
      size_t k = static_cast<size_t>(j);
      // Clamp both ends so the last chunk stops exactly at `total`.
      size_t begin = std::min(k * step, total);
      size_t end = std::min((k + 1) * step, total);
      if (j == ntask - 1) CHECK_EQ(end, total);
      ReduceSumCPU(dptr, begin, static_cast<index_t>(end - begin));
    }
  }
}


// Adds the slices [offset, offset + size) of dptr[1..] into the same slice of
// dptr[0].
//
// Inputs are consumed up to four per iteration; the switch selects how many
// remain (1, 2, 3, or at least 4). Every case ends with `break`: falling
// through to the next case would read dptr entries past the end.
template<typename DType>
inline static void ReduceSumCPU(const std::vector<DType*> &dptr, size_t offset, index_t size) {
  using namespace mshadow;  // NOLINT(*)
  Tensor<cpu, 1, DType> in_0(dptr[0] + offset, Shape1(size));
  for (size_t i = 1; i < dptr.size(); i+=4) {
    switch (dptr.size() - i) {
      case 1: {
        Tensor<cpu, 1, DType> in_1(dptr[i] + offset, Shape1(size));
        in_0 += in_1;
        break;
      }
      case 2: {
        Tensor<cpu, 1, DType> in_1(dptr[i] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_2(dptr[i+1] + offset, Shape1(size));
        in_0 += in_1 + in_2;
        break;
      }
      case 3: {
        Tensor<cpu, 1, DType> in_1(dptr[i] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_2(dptr[i+1] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_3(dptr[i+2] + offset, Shape1(size));
        in_0 += in_1 + in_2 + in_3;
        break;
      }
      default: {
        Tensor<cpu, 1, DType> in_1(dptr[i] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_2(dptr[i+1] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_3(dptr[i+2] + offset, Shape1(size));
        Tensor<cpu, 1, DType> in_4(dptr[i+3] + offset, Shape1(size));
        in_0 += in_1 + in_2 + in_3 + in_4;
        break;
      }
    }
  }
}



// Pulls the stored value of every requested key into the caller's arrays.
//
// Destination arrays are grouped by key; for each unique key the locally
// stored NDArray is broadcast to all of that key's destinations through
// comm_->Broadcast. `ignore_sparse` is forwarded to GroupKVPairsPull.
//
// Note: the method is `virtual` in the class declaration; that keyword cannot
// be repeated on a qualified, out-of-class definition like this one.
void KVStoreLocal::PullImpl(const std::vector<int>& keys,
                            const std::vector<NDArray*>& values,
                            int priority,
                            bool ignore_sparse) {
  std::vector<int> uniq_keys;
  std::vector<std::vector<NDArray*>> grouped_vals;
  GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals, ignore_sparse);
  for (size_t i = 0; i < uniq_keys.size(); ++i) {
    int key = uniq_keys[i];
    const NDArray& local = local_[key];
    comm_->Broadcast(key, local, grouped_vals[i], priority);
  }
}

Broadcast这一部分的实现逻辑还是比较清晰的,如果原始数据存放在内存中,那么就直接进行拷贝;否则,会先把数据从GPU显存拷贝到页锁定内存(pinned memory),然后再进行数据的拷贝。

// Copies `src` into every array in `dst`.
// NOTE(review): presumably the CommCPU override -- confirm against the class.
//
// A source whose device mask is CPU is copied to each destination directly.
// Any other source is first staged in the per-key merge buffer and the
// destinations are then filled from that buffer.
void Broadcast(int key, const NDArray& src,
               const std::vector<NDArray*> dst, int priority) override {
  int mask = src.ctx().dev_mask();
  if (mask == Context::kCPU) {
    for (auto d : dst) CopyFromTo(src, d, priority);
  } else {
    // First copy data to pinned_ctx, then broadcast.
    // Note that kv.init initializes the data on pinned_ctx.
    // This branch indicates push() with ndarrays on gpus were called,
    // and the source is copied to gpu ctx.
    // Also indicates that buffers are already initialized during push().
    auto& buf = merge_buf_[key].merged_buf(src.storage_type());
    CopyFromTo(src, &buf, priority);
    for (auto d : dst) CopyFromTo(buf, d, priority);
  }
}


// Copies `src` into every array in `dst`.
// NOTE(review): presumably the CommDevice override -- confirm against the class.
//
// Before the communicator is initialised (`inited_` is false) the data is
// copied to one destination chosen by `key % dst.size()` and fanned out from
// there to the others. Afterwards it is staged in the per-key merge buffer
// and copied from that buffer to every destination.
// `dst` must be non-empty: the modulo divides by dst.size().
void Broadcast(int key, const NDArray& src,
               const std::vector<NDArray*> dst, int priority) override {
  if (!inited_) {
    // copy to a random device first
    int dev_id = key % dst.size();
    CopyFromTo(src, dst[dev_id], priority);
    for (size_t i = 0; i < dst.size(); ++i) {
      if (i != static_cast<size_t>(dev_id)) {
        CopyFromTo(*dst[dev_id], dst[i], priority);
      }
    }
  } else {
    auto& buf_merged = merge_buf_[key].merged_buf(src.storage_type());
    CopyFromTo(src, &buf_merged, priority);
    for (auto d : dst) {
      CopyFromTo(buf_merged, d, priority);
    }
  }
}
