前言

近期在做一些Hadoop运维的相关工作,发现了一个有趣的问题,我们公司的Hadoop集群磁盘占比数值參差不齐,高的接近80%。低的接近40%。并没有充分利用好上面的资源,可是balance的操作跑的也是正常的啊。所以打算看一下Hadoop的balance的源码,更深层次的去了解Hadoop Balance的机制。

Balancer和Distpatch

上面2个类的设计就是与Hadoop Balance操作最紧密联系的类,Balancer类负载找出<source, target>这种起始,目标结果对。然后存入到Distpatch中,然后通过Distpatch进行分发,只是在分发之前,会进行block的验证,推断此block能否被移动,这里会涉及到一些条件的推断,详细的能够通过后面的代码中的凝视去发现。

在Balancer的最后阶段,会将source和Target增加到dispatcher中。详见以下的代码:

/**
   * 依据源节点和目标节点,构造任务对
   */
  private void matchSourceWithTargetToMove(Source source, StorageGroup target) {
    long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove());
    final Task task = new Task(target, size);
    source.addTask(task);
    target.incScheduledSize(task.getSize());
    //增加分发器中
    dispatcher.add(source, target);
    LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "
        + source.getDisplayName() + " to " + target.getDisplayName());
  }
然后就是dispather类中的代码。然后进行block的分发操作:

/**
   * For each datanode, choose matching nodes from the candidates. Either the
   * datanodes or the candidates are source nodes with (utilization > Avg), and
   * the others are target nodes with (utilization < Avg).
   */
  private <G extends StorageGroup, C extends StorageGroup>
      void chooseStorageGroups(Collection<G> groups, Collection<C> candidates,
          Matcher matcher) {
    for(final Iterator<G> i = groups.iterator(); i.hasNext();) {
      final G g = i.next();
      for(; choose4One(g, candidates, matcher); );
      if (!g.hasSpaceForScheduling()) {
        //假设候选节点没有空间调度,则直接移除掉
        i.remove();
      }
    }
  }
继续调用后面的方法

 /**
   * Decide all <source, target> pairs and
   * the number of bytes to move from a source to a target
   * Maximum bytes to be moved per storage group is
   * min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).
   * 从源节点列表和目标节点列表中各自选择节点组成一个个对,选择顺序优先为同节点组,同机架,然后是针对全部
   * @return total number of bytes to move in this iteration
   */
  private long chooseStorageGroups() {
    // First, match nodes on the same node group if cluster is node group aware
    if (dispatcher.getCluster().isNodeGroupAware()) {
      //首先匹配的条件是同节点组
      chooseStorageGroups(Matcher.SAME_NODE_GROUP);
    }
    
    // Then, match nodes on the same rack
    //然后是同机架
    chooseStorageGroups(Matcher.SAME_RACK);
    // At last, match all remaining nodes
    //最后是匹配全部的节点
    chooseStorageGroups(Matcher.ANY_OTHER);
    
    return dispatcher.bytesToMove();
  }
然后再通过调用Dispatch的层层方法。以下是简图:

最后核心的检验block块是否合适的代码为以下这个:

/**
   * Decide if the block is a good candidate to be moved from source to target.
   * A block is a good candidate if 
   * 1. the block is not in the process of being moved/has not been moved;
   * 移动的块不是正在被移动的块
   * 2. the block does not have a replica on the target;
   * 在目标节点上没有移动的block块
   * 3. doing the move does not reduce the number of racks that the block has
   * 移动之后,不同机架上的block块的数量应该是不变的.	
   */
  private boolean isGoodBlockCandidate(Source source, StorageGroup target,
      DBlock block) {
    if (source.storageType != target.storageType) {
      return false;
    }
    // check if the block is moved or not
    //假设所要移动的块是存在于正在被移动的块列表,则返回false
    if (movedBlocks.contains(block.getBlock())) {
      return false;
    }
    //假设移动的块已经存在于目标节点上,则返回false,将不会予以移动
    if (block.isLocatedOn(target)) {
      return false;
    }

    //假设开启了机架感知的配置,则目标节点不应该有同样的block
    if (cluster.isNodeGroupAware()
        && isOnSameNodeGroupWithReplicas(target, block, source)) {
      return false;
    }
    
    //须要维持机架上的block块数量不变
    if (reduceNumOfRacks(source, target, block)) {
      return false;
    }
    return true;
  }

以下是Balancer.java和Dispatch.java类的完整代码解析:

Balancer.java:

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Formatter;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Source;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Task;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Util;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.base.Preconditions;

/** <p>The balancer is a tool that balances disk space usage on an HDFS cluster
 * when some datanodes become full or when new empty nodes join the cluster.
 * The tool is deployed as an application program that can be run by the 
 * cluster administrator on a live HDFS cluster while applications
 * adding and deleting files.
 * 
 * <p>SYNOPSIS
 * <pre>
 * To start:
 *      bin/start-balancer.sh [-threshold <threshold>]
 *      Example: bin/ start-balancer.sh 
 *                     start the balancer with a default threshold of 10%
 *               bin/ start-balancer.sh -threshold 5
 *                     start the balancer with a threshold of 5%
 * To stop:
 *      bin/ stop-balancer.sh
 * </pre>
 * 
 * <p>DESCRIPTION
 * <p>The threshold parameter is a fraction in the range of (1%, 100%) with a 
 * default value of 10%. The threshold sets a target for whether the cluster 
 * is balanced. A cluster is balanced if for each datanode, the utilization 
 * of the node (ratio of used space at the node to total capacity of the node) 
 * differs from the utilization of the (ratio of used space in the cluster 
 * to total capacity of the cluster) by no more than the threshold value. 
 * The smaller the threshold, the more balanced a cluster will become. 
 * It takes more time to run the balancer for small threshold values. 
 * Also for a very small threshold the cluster may not be able to reach the 
 * balanced state when applications write and delete files concurrently.
 * 
 * <p>The tool moves blocks from highly utilized datanodes to poorly 
 * utilized datanodes iteratively. In each iteration a datanode moves or 
 * receives no more than the lesser of 10G bytes or the threshold fraction 
 * of its capacity. Each iteration runs no more than 20 minutes.
 * 每次移动不超过10G大小,每次移动不超过20分钟。
 * At the end of each iteration, the balancer obtains updated datanodes
 * information from the namenode.
 * 
 * <p>A system property that limits the balancer's use of bandwidth is 
 * defined in the default configuration file:
 * <pre>
 * <property>
 *   <name>dfs.balance.bandwidthPerSec</name>
 *   <value>1048576</value>
 * <description>  Specifies the maximum bandwidth that each datanode 
 * can utilize for the balancing purpose in term of the number of bytes 
 * per second. </description>
 * </property>
 * </pre>
 * 
 * <p>This property determines the maximum speed at which a block will be 
 * moved from one datanode to another. The default value is 1MB/s. The higher 
 * the bandwidth, the faster a cluster can reach the balanced state, 
 * but with greater competition with application processes. If an 
 * administrator changes the value of this property in the configuration 
 * file, the change is observed when HDFS is next restarted.
 * 
 * <p>MONITERING BALANCER PROGRESS
 * <p>After the balancer is started, an output file name where the balancer 
 * progress will be recorded is printed on the screen.  The administrator 
 * can monitor the running of the balancer by reading the output file. 
 * The output shows the balancer's status iteration by iteration. In each 
 * iteration it prints the starting time, the iteration number, the total 
 * number of bytes that have been moved in the previous iterations, 
 * the total number of bytes that are left to move in order for the cluster 
 * to be balanced, and the number of bytes that are being moved in this 
 * iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left 
 * To Move" is decreasing.
 * 
 * <p>Running multiple instances of the balancer in an HDFS cluster is 
 * prohibited by the tool.
 * 
 * <p>The balancer automatically exits when any of the following five 
 * conditions is satisfied:
 * <ol>
 * <li>The cluster is balanced;
 * <li>No block can be moved;
 * <li>No block has been moved for five consecutive(连续) iterations;
 * <li>An IOException occurs while communicating with the namenode;
 * <li>Another balancer is running.
 * </ol>
 * 以下5种情况会导致Balance操作的失败
 * 1、整个集群已经达到平衡状态
 * 2、经过计算发现没有能够被移动的block块
 * 3、在连续5次的迭代中。没有block块被移动
 * 4、当datanode节点与namenode节点通信的时候,发生IO异常
 * 5、已经存在一个Balance操作
 * 
 * <p>Upon exit, a balancer returns an exit code and prints one of the 
 * following messages to the output file in corresponding to the above exit 
 * reasons:
 * <ol>
 * <li>The cluster is balanced. Exiting
 * <li>No block can be moved. Exiting...
 * <li>No block has been moved for 5 iterations. Exiting...
 * <li>Received an IO exception: failure reason. Exiting...
 * <li>Another balancer is running. Exiting...
 * </ol>
 * 在以下的5种情况下。balancer操作会自己主动退出
 * 1、整个集群已经达到平衡的状态
 * 2、经过计算发现没有能够被移动block块
 * 3、在5论的迭代没有block被移动
 * 4、接收端发生了I异常
 * 5、已经存在一个balanr进程在工作
 * 
 * <p>The administrator can interrupt the execution of the balancer at any 
 * time by running the command "stop-balancer.sh" on the machine where the 
 * balancer is running.
 */

@InterfaceAudience.Private
public class Balancer {
  static final Log LOG = LogFactory.getLog(Balancer.class);

  private static final Path BALANCER_ID_PATH = new Path("/system/balancer.id");

  private static final long GB = 1L << 30; //1GB
  private static final long MAX_SIZE_TO_MOVE = 10*GB;

  private static final String USAGE = "Usage: java "
      + Balancer.class.getSimpleName()
      + "\n\t[-policy <policy>]\tthe balancing policy: "
      + BalancingPolicy.Node.INSTANCE.getName() + " or "
      + BalancingPolicy.Pool.INSTANCE.getName()
      + "\n\t[-threshold <threshold>]\tPercentage of disk capacity"
      + "\n\t[-exclude [-f <hosts-file> | comma-sperated list of hosts]]"
      + "\tExcludes the specified datanodes."
      + "\n\t[-include [-f <hosts-file> | comma-sperated list of hosts]]"
      + "\tIncludes only the specified datanodes.";
  
  private final Dispatcher dispatcher;
  private final BalancingPolicy policy;
  private final double threshold;
  
  // all data node lists
  //四种datanode节点类型
  private final Collection<Source> overUtilized = new LinkedList<Source>();
  private final Collection<Source> aboveAvgUtilized = new LinkedList<Source>();
  private final Collection<StorageGroup> belowAvgUtilized
      = new LinkedList<StorageGroup>();
  private final Collection<StorageGroup> underUtilized
      = new LinkedList<StorageGroup>();

  /* Check that this Balancer is compatible(兼容) with the Block Placement Policy
   * used by the Namenode.
   * 检測此balancer均衡工具是否于与眼下的namenode节点所用的block存放策略相兼容
   */
  private static void checkReplicationPolicyCompatibility(Configuration conf
      ) throws UnsupportedActionException {
    if (!(BlockPlacementPolicy.getInstance(conf, null, null, null) instanceof 
        BlockPlacementPolicyDefault)) {
      throw new UnsupportedActionException(
          //假设不兼容则抛异常
          "Balancer without BlockPlacementPolicyDefault");
    }
  }

  /**
   * Construct a balancer.
   * Initialize balancer. It sets the value of the threshold, and 
   * builds the communication proxies to
   * namenode as a client and a secondary namenode and retry proxies
   * when connection fails.
   * 
   * 构造一个balancer均衡器,设置threshold值,读取配置中的分发线程数的值
   */
  Balancer(NameNodeConnector theblockpool, Parameters p, Configuration conf) {
    final long movedWinWidth = conf.getLong(
        DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_KEY,
        DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_DEFAULT);
        //移动线程数
    final int moverThreads = conf.getInt(
        DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY,
        DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_DEFAULT);
        //分发线程数
    final int dispatcherThreads = conf.getInt(
        DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_KEY,
        DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT);
    final int maxConcurrentMovesPerNode = conf.getInt(
        DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,
        DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);

    this.dispatcher = new Dispatcher(theblockpool, p.nodesToBeIncluded,
        p.nodesToBeExcluded, movedWinWidth, moverThreads, dispatcherThreads,
        maxConcurrentMovesPerNode, conf);
    this.threshold = p.threshold;
    this.policy = p.policy;
  }
  
  /**
   * 获取节点总容量大小
   */ 
  private static long getCapacity(DatanodeStorageReport report, StorageType t) {
    long capacity = 0L;
    for(StorageReport r : report.getStorageReports()) {
      if (r.getStorage().getStorageType() == t) {
        capacity += r.getCapacity();
      }
    }
    return capacity;
  }
  
  /**
   * 获取节点剩余可用容量大小 
   */
  private static long getRemaining(DatanodeStorageReport report, StorageType t) {
    long remaining = 0L;
    for(StorageReport r : report.getStorageReports()) {
      if (r.getStorage().getStorageType() == t) {
        remaining += r.getRemaining();
      }
    }
    return remaining;
  }

  /**
   * Given a datanode storage set, build a network topology and decide
   * over-utilized storages, above average utilized storages, 
   * below average utilized storages, and underutilized storages. 
   * The input datanode storage set is shuffled in order to randomize
   * to the storage matching later on.
   * 给定一个datanode集合,创建一个网络拓扑逻辑并划分出过度使用,使用率超出平均值,
   * 低于平均值的,以及未能充分利用资源的4种类型
   *
   * @return the number of bytes needed to move in order to balance the cluster.
   */
  private long init(List<DatanodeStorageReport> reports) {
    // 计算平均使用率
    for (DatanodeStorageReport r : reports) {
      //累加每一个节点上的使用空间
      policy.accumulateSpaces(r);
    }
    //计算出平均值
    policy.initAvgUtilization();

    long overLoadedBytes = 0L, underLoadedBytes = 0L;
    //进行使用率等级的划分,总共4种,over-utilized, above-average, below-average and under-utilized
    for(DatanodeStorageReport r : reports) {
      final DDatanode dn = dispatcher.newDatanode(r);
      for(StorageType t : StorageType.asList()) {
        final Double utilization = policy.getUtilization(r, t);
        if (utilization == null) { // datanode does not have such storage type 
          continue;
        }
        
        final long capacity = getCapacity(r, t);
        final double utilizationDiff = utilization - policy.getAvgUtilization(t);
        final double thresholdDiff = Math.abs(utilizationDiff) - threshold;
        //计算理论上最大移动空间
        final long maxSize2Move = computeMaxSize2Move(capacity,
            getRemaining(r, t), utilizationDiff, threshold);

        final StorageGroup g;
        if (utilizationDiff > 0) {
          //使用率超出平均值,增加发送节点列表中
          final Source s = dn.addSource(t, maxSize2Move, dispatcher);
          if (thresholdDiff <= 0) { // within threshold
            //假设在threshold范围之内,则增加aboveAvgUtilized
            aboveAvgUtilized.add(s);
          } else {
            //否则增加overUtilized,并计算超出空间
            overLoadedBytes += precentage2bytes(thresholdDiff, capacity);
            overUtilized.add(s);
          }
          g = s;
        } else {
          //与上面的相反
          g = dn.addStorageGroup(t, maxSize2Move);
          if (thresholdDiff <= 0) { // within threshold
            belowAvgUtilized.add(g);
          } else {
            underLoadedBytes += precentage2bytes(thresholdDiff, capacity);
            underUtilized.add(g);
          }
        }
        dispatcher.getStorageGroupMap().put(g);
      }
    }

    logUtilizationCollections();
    
    Preconditions.checkState(dispatcher.getStorageGroupMap().size()
        == overUtilized.size() + underUtilized.size() + aboveAvgUtilized.size()
           + belowAvgUtilized.size(),
        "Mismatched number of storage groups");
    
    // return number of bytes to be moved in order to make the cluster balanced
    return Math.max(overLoadedBytes, underLoadedBytes);
  }

  private static long computeMaxSize2Move(final long capacity, final long remaining,
      final double utilizationDiff, final double threshold) {
    final double diff = Math.min(threshold, Math.abs(utilizationDiff));
    long maxSizeToMove = precentage2bytes(diff, capacity);
    if (utilizationDiff < 0) {
      maxSizeToMove = Math.min(remaining, maxSizeToMove);
    }
    return Math.min(MAX_SIZE_TO_MOVE, maxSizeToMove);
  }

  private static long precentage2bytes(double precentage, long capacity) {
    Preconditions.checkArgument(precentage >= 0,
        "precentage = " + precentage + " < 0");
    return (long)(precentage * capacity / 100.0);
  }

  /* log the over utilized & under utilized nodes */
  private void logUtilizationCollections() {
    logUtilizationCollection("over-utilized", overUtilized);
    if (LOG.isTraceEnabled()) {
      logUtilizationCollection("above-average", aboveAvgUtilized);
      logUtilizationCollection("below-average", belowAvgUtilized);
    }
    logUtilizationCollection("underutilized", underUtilized);
  }

  private static <T extends StorageGroup>
      void logUtilizationCollection(String name, Collection<T> items) {
    LOG.info(items.size() + " " + name + ": " + items);
  }

  /**
   * Decide all <source, target> pairs and
   * the number of bytes to move from a source to a target
   * Maximum bytes to be moved per storage group is
   * min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).
   * 从源节点列表和目标节点列表中各自选择节点组成一个个对,选择顺序优先为同节点组,同机架,然后是针对全部
   * @return total number of bytes to move in this iteration
   */
  private long chooseStorageGroups() {
    // First, match nodes on the same node group if cluster is node group aware
    if (dispatcher.getCluster().isNodeGroupAware()) {
      //首先匹配的条件是同节点组
      chooseStorageGroups(Matcher.SAME_NODE_GROUP);
    }
    
    // Then, match nodes on the same rack
    //然后是同机架
    chooseStorageGroups(Matcher.SAME_RACK);
    // At last, match all remaining nodes
    //最后是匹配全部的节点
    chooseStorageGroups(Matcher.ANY_OTHER);
    
    return dispatcher.bytesToMove();
  }

  /** Decide all <source, target> pairs according to the matcher. */
  private void chooseStorageGroups(final Matcher matcher) {
    /* first step: match each overUtilized datanode (source) to
     * one or more underUtilized datanodes (targets).
     *
     * 把over组的数据移动under组中
     */
    chooseStorageGroups(overUtilized, underUtilized, matcher);
    
    /* match each remaining overutilized datanode (source) to 
     * below average utilized datanodes (targets).
     * Note only overutilized datanodes that haven't had that max bytes to move
     * satisfied in step 1 are selected
     
     * 把over组的数据移动到below
     */
    chooseStorageGroups(overUtilized, belowAvgUtilized, matcher);

    /* match each remaining underutilized datanode (target) to 
     * above average utilized datanodes (source).
     * Note only underutilized datanodes that have not had that max bytes to
     * move satisfied in step 1 are selected.
     * 
     * 然后,再把under组的数据移动一部分到above组中
     */
    chooseStorageGroups(underUtilized, aboveAvgUtilized, matcher);
  }

  /**
   * For each datanode, choose matching nodes from the candidates. Either the
   * datanodes or the candidates are source nodes with (utilization > Avg), and
   * the others are target nodes with (utilization < Avg).
   */
  private <G extends StorageGroup, C extends StorageGroup>
      void chooseStorageGroups(Collection<G> groups, Collection<C> candidates,
          Matcher matcher) {
    for(final Iterator<G> i = groups.iterator(); i.hasNext();) {
      final G g = i.next();
      for(; choose4One(g, candidates, matcher); );
      if (!g.hasSpaceForScheduling()) {
        //假设候选节点没有空间调度,则直接移除掉
        i.remove();
      }
    }
  }

  /**
   * For the given datanode, choose a candidate and then schedule it.
   * @return true if a candidate is chosen; false if no candidates is chosen.
   */
  private <C extends StorageGroup> boolean choose4One(StorageGroup g,
      Collection<C> candidates, Matcher matcher) {
    final Iterator<C> i = candidates.iterator();
    final C chosen = chooseCandidate(g, i, matcher);
  
    if (chosen == null) {
      return false;
    }
    if (g instanceof Source) {
      matchSourceWithTargetToMove((Source)g, chosen);
    } else {
      matchSourceWithTargetToMove((Source)chosen, g);
    }
    if (!chosen.hasSpaceForScheduling()) {
      i.remove();
    }
    return true;
  }
  
  /**
   * 依据源节点和目标节点,构造任务对
   */
  private void matchSourceWithTargetToMove(Source source, StorageGroup target) {
    long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove());
    final Task task = new Task(target, size);
    source.addTask(task);
    target.incScheduledSize(task.getSize());
    //增加分发器中
    dispatcher.add(source, target);
    LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "
        + source.getDisplayName() + " to " + target.getDisplayName());
  }
  
  /** Choose a candidate for the given datanode. */
  private <G extends StorageGroup, C extends StorageGroup>
      C chooseCandidate(G g, Iterator<C> candidates, Matcher matcher) {
    if (g.hasSpaceForScheduling()) {
      for(; candidates.hasNext(); ) {
        final C c = candidates.next();
        if (!c.hasSpaceForScheduling()) {
          candidates.remove();
        } else if (matcher.match(dispatcher.getCluster(),
            g.getDatanodeInfo(), c.getDatanodeInfo())) {
          //假设满足匹配的条件,则返回值
          return c;
        }
      }
    }
    return null;
  }

  /* reset all fields in a balancer preparing for the next iteration */
  private void resetData(Configuration conf) {
    this.overUtilized.clear();
    this.aboveAvgUtilized.clear();
    this.belowAvgUtilized.clear();
    this.underUtilized.clear();
    this.policy.reset();
    dispatcher.reset(conf);;
  }
  
  /** Run an iteration for all datanodes. */
  private ExitStatus run(int iteration, Formatter formatter,
      Configuration conf) {
    try {
      final List<DatanodeStorageReport> reports = dispatcher.init();
      final long bytesLeftToMove = init(reports);
      if (bytesLeftToMove == 0) {
        System.out.println("The cluster is balanced. Exiting...");
        return ExitStatus.SUCCESS;
      } else {
        LOG.info( "Need to move "+ StringUtils.byteDesc(bytesLeftToMove)
            + " to make the cluster balanced." );
      }
      
      /* Decide all the nodes that will participate in the block move and
       * the number of bytes that need to be moved from one node to another
       * in this iteration. Maximum bytes to be moved per node is
       * Min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).
       */
      final long bytesToMove = chooseStorageGroups();
      if (bytesToMove == 0) {
        System.out.println("No block can be moved. Exiting...");
        return ExitStatus.NO_MOVE_BLOCK;
      } else {
        LOG.info( "Will move " + StringUtils.byteDesc(bytesToMove) +
            " in this iteration");
      }

      formatter.format("%-24s %10d  %19s  %18s  %17s%n",
          DateFormat.getDateTimeInstance().format(new Date()),
          iteration,
          StringUtils.byteDesc(dispatcher.getBytesMoved()),
          StringUtils.byteDesc(bytesLeftToMove),
          StringUtils.byteDesc(bytesToMove)
          );
      
      /* For each pair of <source, target>, start a thread that repeatedly 
       * decide a block to be moved and its proxy source, 
       * then initiates the move until all bytes are moved or no more block
       * available to move.
       * Exit no byte has been moved for 5 consecutive iterations.
       *
       * 假设发如今5次连续的迭代中还是没有字节被移动,则退出
       */
      if (!dispatcher.dispatchAndCheckContinue()) {
        return ExitStatus.NO_MOVE_PROGRESS;
      }

      return ExitStatus.IN_PROGRESS;
    } catch (IllegalArgumentException e) {
      System.out.println(e + ".  Exiting ...");
      return ExitStatus.ILLEGAL_ARGUMENTS;
    } catch (IOException e) {
      System.out.println(e + ".  Exiting ...");
      return ExitStatus.IO_EXCEPTION;
    } catch (InterruptedException e) {
      System.out.println(e + ".  Exiting ...");
      return ExitStatus.INTERRUPTED;
    } finally {
      dispatcher.shutdownNow();
    }
  }

  /**
   * Balance all namenodes.
   * For each iteration,
   * for each namenode,
   * execute a {@link Balancer} to work through all datanodes once.  
   * 
   * 开放给外部调用的run方法
   */
  static int run(Collection<URI> namenodes, final Parameters p,
      Configuration conf) throws IOException, InterruptedException {
    final long sleeptime = 2000*conf.getLong(
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT);
    LOG.info("namenodes  = " + namenodes);
    LOG.info("parameters = " + p);
    
    final Formatter formatter = new Formatter(System.out);
    System.out.println("Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved");
    
    final List<NameNodeConnector> connectors
        = new ArrayList<NameNodeConnector>(namenodes.size());
    try {
      for (URI uri : namenodes) {
        final NameNodeConnector nnc = new NameNodeConnector(
            Balancer.class.getSimpleName(), uri, BALANCER_ID_PATH, conf);
        nnc.getKeyManager().startBlockKeyUpdater();
        connectors.add(nnc);
      }
    
      boolean done = false;
      for(int iteration = 0; !done; iteration++) {
        done = true;
        Collections.shuffle(connectors);
        for(NameNodeConnector nnc : connectors) {
          //初始化均衡器具
          final Balancer b = new Balancer(nnc, p, conf);
          //均衡器运行balance操作
          final ExitStatus r = b.run(iteration, formatter, conf);
          // clean all lists
          b.resetData(conf);
          if (r == ExitStatus.IN_PROGRESS) {
            done = false;
          } else if (r != ExitStatus.SUCCESS) {
            //must be an error statue, return.
            return r.getExitCode();
          }
        }

        if (!done) {
          Thread.sleep(sleeptime);
        }
      }
    } finally {
      for(NameNodeConnector nnc : connectors) {
        nnc.close();
      }
    }
    return ExitStatus.SUCCESS.getExitCode();
  }

  /* Given elaspedTime in ms, return a printable string */
  private static String time2Str(long elapsedTime) {
    String unit;
    double time = elapsedTime;
    if (elapsedTime < 1000) {
      unit = "milliseconds";
    } else if (elapsedTime < 60*1000) {
      unit = "seconds";
      time = time/1000;
    } else if (elapsedTime < 3600*1000) {
      unit = "minutes";
      time = time/(60*1000);
    } else {
      unit = "hours";
      time = time/(3600*1000);
    }

    return time+" "+unit;
  }

  static class Parameters {
    static final Parameters DEFAULT = new Parameters(
        BalancingPolicy.Node.INSTANCE, 10.0,
        Collections.<String> emptySet(), Collections.<String> emptySet());

    final BalancingPolicy policy;
    final double threshold;
    // exclude the nodes in this set from balancing operations
    Set<String> nodesToBeExcluded;
    //include only these nodes in balancing operations
    Set<String> nodesToBeIncluded;

    Parameters(BalancingPolicy policy, double threshold,
        Set<String> nodesToBeExcluded, Set<String> nodesToBeIncluded) {
      this.policy = policy;
      this.threshold = threshold;
      this.nodesToBeExcluded = nodesToBeExcluded;
      this.nodesToBeIncluded = nodesToBeIncluded;
    }

    @Override
    public String toString() {
      return Balancer.class.getSimpleName() + "." + getClass().getSimpleName()
          + "[" + policy + ", threshold=" + threshold +
          ", number of nodes to be excluded = "+ nodesToBeExcluded.size() +
          ", number of nodes to be included = "+ nodesToBeIncluded.size() +"]";
    }
  }

  static class Cli extends Configured implements Tool {
    /**
     * Parse arguments and then run Balancer.
     * 
     * @param args command specific arguments.
     * @return exit code. 0 indicates success, non-zero indicates failure.
     */
    @Override
    public int run(String[] args) {
      final long startTime = Time.now();
      final Configuration conf = getConf();

      try {
        checkReplicationPolicyCompatibility(conf);

        final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
        return Balancer.run(namenodes, parse(args), conf);
      } catch (IOException e) {
        System.out.println(e + ".  Exiting ...");
        return ExitStatus.IO_EXCEPTION.getExitCode();
      } catch (InterruptedException e) {
        System.out.println(e + ".  Exiting ...");
        return ExitStatus.INTERRUPTED.getExitCode();
      } finally {
        System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));
        System.out.println("Balancing took " + time2Str(Time.now()-startTime));
      }
    }

    /** parse command line arguments */
    static Parameters parse(String[] args) {
      BalancingPolicy policy = Parameters.DEFAULT.policy;
      double threshold = Parameters.DEFAULT.threshold;
      Set<String> nodesTobeExcluded = Parameters.DEFAULT.nodesToBeExcluded;
      Set<String> nodesTobeIncluded = Parameters.DEFAULT.nodesToBeIncluded;

      if (args != null) {
        try {
          for(int i = 0; i < args.length; i++) {
            if ("-threshold".equalsIgnoreCase(args[i])) {
              checkArgument(++i < args.length,
                "Threshold value is missing: args = " + Arrays.toString(args));
              try {
                threshold = Double.parseDouble(args[i]);
                if (threshold < 1 || threshold > 100) {
                  throw new IllegalArgumentException(
                      "Number out of range: threshold = " + threshold);
                }
                LOG.info( "Using a threshold of " + threshold );
              } catch(IllegalArgumentException e) {
                System.err.println(
                    "Expecting a number in the range of [1.0, 100.0]: "
                    + args[i]);
                throw e;
              }
            } else if ("-policy".equalsIgnoreCase(args[i])) {
              checkArgument(++i < args.length,
                "Policy value is missing: args = " + Arrays.toString(args));
              try {
                policy = BalancingPolicy.parse(args[i]);
              } catch(IllegalArgumentException e) {
                System.err.println("Illegal policy name: " + args[i]);
                throw e;
              }
            } else if ("-exclude".equalsIgnoreCase(args[i])) {
              checkArgument(++i < args.length,
                  "List of nodes to exclude | -f <filename> is missing: args = "
                  + Arrays.toString(args));
              if ("-f".equalsIgnoreCase(args[i])) {
                checkArgument(++i < args.length,
                    "File containing nodes to exclude is not specified: args = "
                    + Arrays.toString(args));
                nodesTobeExcluded = Util.getHostListFromFile(args[i], "exclude");
              } else {
                nodesTobeExcluded = Util.parseHostList(args[i]);
              }
            } else if ("-include".equalsIgnoreCase(args[i])) {
              checkArgument(++i < args.length,
                "List of nodes to include | -f <filename> is missing: args = "
                + Arrays.toString(args));
              if ("-f".equalsIgnoreCase(args[i])) {
                checkArgument(++i < args.length,
                    "File containing nodes to include is not specified: args = "
                    + Arrays.toString(args));
                nodesTobeIncluded = Util.getHostListFromFile(args[i], "include");
               } else {
                nodesTobeIncluded = Util.parseHostList(args[i]);
              }
            } else {
              throw new IllegalArgumentException("args = "
                  + Arrays.toString(args));
            }
          }
          checkArgument(nodesTobeExcluded.isEmpty() || nodesTobeIncluded.isEmpty(),
              "-exclude and -include options cannot be specified together.");
        } catch(RuntimeException e) {
          printUsage(System.err);
          throw e;
        }
      }
      
      return new Parameters(policy, threshold, nodesTobeExcluded, nodesTobeIncluded);
    }

    private static void printUsage(PrintStream out) {
      out.println(USAGE + "\n");
    }
  }

  /**
   * Run a balancer
   * @param args Command line arguments
   */
  public static void main(String[] args) {
    if (DFSUtil.parseHelpArgument(args, USAGE, System.out, true)) {
      System.exit(0);
    }

    try {
      System.exit(ToolRunner.run(new HdfsConfiguration(), new Cli(), args));
    } catch (Throwable e) {
      LOG.error("Exiting balancer due an exception", e);
      System.exit(-1);
    }
  }
}
以下是Dispatch.java类的解析:

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;

import com.google.common.base.Preconditions;

/** Dispatching block replica moves between datanodes. */
@InterfaceAudience.Private
public class Dispatcher {
  static final Log LOG = LogFactory.getLog(Dispatcher.class);

  private static final long GB = 1L << 30; // 1GB
  private static final long MAX_BLOCKS_SIZE_TO_FETCH = 2 * GB;

  private static final int MAX_NO_PENDING_MOVE_ITERATIONS = 5;
  private static final long DELAY_AFTER_ERROR = 10 * 1000L; // 10 seconds

  private final NameNodeConnector nnc;
  private final SaslDataTransferClient saslClient;

  /** Set of datanodes to be excluded. */
  private final Set<String> excludedNodes;
  /** Restrict to the following nodes. */
  private final Set<String> includedNodes;

  private final Collection<Source> sources = new HashSet<Source>();
  private final Collection<StorageGroup> targets = new HashSet<StorageGroup>();

  private final GlobalBlockMap globalBlocks = new GlobalBlockMap();
  private final MovedBlocks<StorageGroup> movedBlocks;

  /** Map (datanodeUuid,storageType -> StorageGroup) */
  private final StorageGroupMap storageGroupMap = new StorageGroupMap();

  private NetworkTopology cluster;

  private final ExecutorService moveExecutor;
  private final ExecutorService dispatchExecutor;
  /** The maximum number of concurrent blocks moves at a datanode */
  private final int maxConcurrentMovesPerNode;

  private final AtomicLong bytesMoved = new AtomicLong();

  private static class GlobalBlockMap {
    private final Map<Block, DBlock> map = new HashMap<Block, DBlock>();

    /**
     * Get the block from the map;
     * if the block is not found, create a new block and put it in the map.
     */
    private DBlock get(Block b) {
      DBlock block = map.get(b);
      if (block == null) {
        block = new DBlock(b);
        map.put(b, block);
      }
      return block;
    }
    
    /** Remove all blocks except for the moved blocks. */
    private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) {
      for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) {
        if (!movedBlocks.contains(i.next())) {
          i.remove();
        }
      }
    }
  }

  static class StorageGroupMap {
    private static String toKey(String datanodeUuid, StorageType storageType) {
      return datanodeUuid + ":" + storageType;
    }

    private final Map<String, StorageGroup> map = new HashMap<String, StorageGroup>();

    StorageGroup get(String datanodeUuid, StorageType storageType) {
      return map.get(toKey(datanodeUuid, storageType));
    }

    void put(StorageGroup g) {
      final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType);
      final StorageGroup existing = map.put(key, g);
      Preconditions.checkState(existing == null);
    }

    int size() {
      return map.size();
    }

    void clear() {
      map.clear();
    }
  }

  /** This class keeps track of a scheduled block move */
  private class PendingMove {
    private DBlock block;
    private Source source;
    private DDatanode proxySource;
    private StorageGroup target;

    private PendingMove() {
    }

    @Override
    public String toString() {
      final Block b = block.getBlock();
      return b + " with size=" + b.getNumBytes() + " from "
          + source.getDisplayName() + " to " + target.getDisplayName()
          + " through " + proxySource.datanode;
    }

    /**
     * Choose a block & a proxy source for this pendingMove whose source &
     * target have already been chosen.
     * 
     * @return true if a block and its proxy are chosen; false otherwise
     */
    private boolean chooseBlockAndProxy() {
      // iterate all source's blocks until find a good one
      for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {
        if (markMovedIfGoodBlock(i.next())) {
          i.remove();
          return true;
        }
      }
      return false;
    }

    /**
     * @return true if the given block is good for the tentative move.
     */
    private boolean markMovedIfGoodBlock(DBlock block) {
      synchronized (block) {
        synchronized (movedBlocks) {
          if (isGoodBlockCandidate(source, target, block)) {
            this.block = block;
            if (chooseProxySource()) {
              movedBlocks.put(block);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Decided to move " + this);
              }
              return true;
            }
          }
        }
      }
      return false;
    }

    /**
     * Choose a proxy source.
     * 
     * @return true if a proxy is found; otherwise false
     */
    private boolean chooseProxySource() {
      final DatanodeInfo targetDN = target.getDatanodeInfo();
      // if node group is supported, first try add nodes in the same node group
      if (cluster.isNodeGroupAware()) {
        for (StorageGroup loc : block.getLocations()) {
          if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN)
              && addTo(loc)) {
            return true;
          }
        }
      }
      // check if there is replica which is on the same rack with the target
      for (StorageGroup loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN) && addTo(loc)) {
          return true;
        }
      }
      // find out a non-busy replica
      for (StorageGroup loc : block.getLocations()) {
        if (addTo(loc)) {
          return true;
        }
      }
      return false;
    }

    /** add to a proxy source for specific block movement */
    private boolean addTo(StorageGroup g) {
      final DDatanode dn = g.getDDatanode();
      if (dn.addPendingBlock(this)) {
        proxySource = dn;
        return true;
      }
      return false;
    }

    /** Dispatch the move to the proxy source & wait for the response. */
    private void dispatch() {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Start moving " + this);
      }

      Socket sock = new Socket();
      DataOutputStream out = null;
      DataInputStream in = null;
      try {
        sock.connect(
            NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),
            HdfsServerConstants.READ_TIMEOUT);

        sock.setKeepAlive(true);

        OutputStream unbufOut = sock.getOutputStream();
        InputStream unbufIn = sock.getInputStream();
        ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(),
            block.getBlock());
        final KeyManager km = nnc.getKeyManager(); 
        Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb);
        IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut,
            unbufIn, km, accessToken, target.getDatanodeInfo());
        unbufOut = saslStreams.out;
        unbufIn = saslStreams.in;
        out = new DataOutputStream(new BufferedOutputStream(unbufOut,
            HdfsConstants.IO_FILE_BUFFER_SIZE));
        in = new DataInputStream(new BufferedInputStream(unbufIn,
            HdfsConstants.IO_FILE_BUFFER_SIZE));

        sendRequest(out, eb, accessToken);
        receiveResponse(in);
        bytesMoved.addAndGet(block.getNumBytes());
        LOG.info("Successfully moved " + this);
      } catch (IOException e) {
        LOG.warn("Failed to move " + this + ": " + e.getMessage());
        // Proxy or target may have some issues, delay before using these nodes
        // further in order to avoid a potential storm of "threads quota
        // exceeded" warnings when the dispatcher gets out of sync with work
        // going on in datanodes.
        proxySource.activateDelay(DELAY_AFTER_ERROR);
        target.getDDatanode().activateDelay(DELAY_AFTER_ERROR);
      } finally {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(sock);

        proxySource.removePendingBlock(this);
        target.getDDatanode().removePendingBlock(this);

        synchronized (this) {
          reset();
        }
        synchronized (Dispatcher.this) {
          Dispatcher.this.notifyAll();
        }
      }
    }

    /** Send a block replace request to the output stream */
    private void sendRequest(DataOutputStream out, ExtendedBlock eb,
        Token<BlockTokenIdentifier> accessToken) throws IOException {
      new Sender(out).replaceBlock(eb, target.storageType, accessToken,
          source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);
    }

    /** Receive a block copy response from the input stream */
    private void receiveResponse(DataInputStream in) throws IOException {
      BlockOpResponseProto response =
          BlockOpResponseProto.parseFrom(vintPrefixed(in));
      while (response.getStatus() == Status.IN_PROGRESS) {
        // read intermediate responses
        response = BlockOpResponseProto.parseFrom(vintPrefixed(in));
      }
      if (response.getStatus() != Status.SUCCESS) {
        if (response.getStatus() == Status.ERROR_ACCESS_TOKEN) {
          throw new IOException("block move failed due to access token error");
        }
        throw new IOException("block move is failed: " + response.getMessage());
      }
    }

    /** reset the object */
    private void reset() {
      block = null;
      source = null;
      proxySource = null;
      target = null;
    }
  }

  /** A class for keeping track of block locations in the dispatcher. */
  private static class DBlock extends MovedBlocks.Locations<StorageGroup> {
    DBlock(Block block) {
      super(block);
    }
  }

  /** The class represents a desired move. */
  static class Task {
    private final StorageGroup target;
    private long size; // bytes scheduled to move

    Task(StorageGroup target, long size) {
      this.target = target;
      this.size = size;
    }

    long getSize() {
      return size;
    }
  }

  /** A class that keeps track of a datanode. */
  static class DDatanode {

    /** A group of storages in a datanode with the same storage type. */
    class StorageGroup {
      final StorageType storageType;
      final long maxSize2Move;
      private long scheduledSize = 0L;

      private StorageGroup(StorageType storageType, long maxSize2Move) {
        this.storageType = storageType;
        this.maxSize2Move = maxSize2Move;
      }

      private DDatanode getDDatanode() {
        return DDatanode.this;
      }

      DatanodeInfo getDatanodeInfo() {
        return DDatanode.this.datanode;
      }

      /** Decide if still need to move more bytes */
      synchronized boolean hasSpaceForScheduling() {
        return availableSizeToMove() > 0L;
      }

      /** @return the total number of bytes that need to be moved */
      synchronized long availableSizeToMove() {
        return maxSize2Move - scheduledSize;
      }

      /** increment scheduled size */
      synchronized void incScheduledSize(long size) {
        scheduledSize += size;
      }

      /** @return scheduled size */
      synchronized long getScheduledSize() {
        return scheduledSize;
      }

      /** Reset scheduled size to zero. */
      synchronized void resetScheduledSize() {
        scheduledSize = 0L;
      }

      /** @return the name for display */
      String getDisplayName() {
        return datanode + ":" + storageType;
      }

      @Override
      public String toString() {
        return getDisplayName();
      }
    }

    final DatanodeInfo datanode;
    final EnumMap<StorageType, StorageGroup> storageMap
        = new EnumMap<StorageType, StorageGroup>(StorageType.class);
    protected long delayUntil = 0L;
    /** blocks being moved but not confirmed yet */
    private final List<PendingMove> pendings;
    private final int maxConcurrentMoves;

    @Override
    public String toString() {
      return getClass().getSimpleName() + ":" + datanode + ":" + storageMap.values();
    }

    private DDatanode(DatanodeStorageReport r, int maxConcurrentMoves) {
      this.datanode = r.getDatanodeInfo();
      this.maxConcurrentMoves = maxConcurrentMoves;
      this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);
    }

    private void put(StorageType storageType, StorageGroup g) {
      final StorageGroup existing = storageMap.put(storageType, g);
      Preconditions.checkState(existing == null);
    }

    StorageGroup addStorageGroup(StorageType storageType, long maxSize2Move) {
      final StorageGroup g = new StorageGroup(storageType, maxSize2Move);
      put(storageType, g);
      return g;
    }

    Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {
      final Source s = d.new Source(storageType, maxSize2Move, this);
      put(storageType, s);
      return s;
    }

    synchronized private void activateDelay(long delta) {
      delayUntil = Time.monotonicNow() + delta;
    }

    synchronized private boolean isDelayActive() {
      if (delayUntil == 0 || Time.monotonicNow() > delayUntil) {
        delayUntil = 0;
        return false;
      }
      return true;
    }

    /** Check if the node can schedule more blocks to move */
    synchronized boolean isPendingQNotFull() {
      return pendings.size() < maxConcurrentMoves;
    }

    /** Check if all the dispatched moves are done */
    synchronized boolean isPendingQEmpty() {
      return pendings.isEmpty();
    }

    /** Add a scheduled block move to the node */
    synchronized boolean addPendingBlock(PendingMove pendingBlock) {
      if (!isDelayActive() && isPendingQNotFull()) {
        return pendings.add(pendingBlock);
      }
      return false;
    }

    /** Remove a scheduled block move from the node */
    synchronized boolean removePendingBlock(PendingMove pendingBlock) {
      return pendings.remove(pendingBlock);
    }
  }

  /** A node that can be the sources of a block move */
  class Source extends DDatanode.StorageGroup {

    private final List<Task> tasks = new ArrayList<Task>(2);
    private long blocksToReceive = 0L;
    /**
     * Source blocks point to the objects in {@link Dispatcher#globalBlocks}
     * because we want to keep one copy of a block and be aware that the
     * locations are changing over time.
     */
    private final List<DBlock> srcBlocks = new ArrayList<DBlock>();

    private Source(StorageType storageType, long maxSize2Move, DDatanode dn) {
      dn.super(storageType, maxSize2Move);
    }

    /** Add a task */
    void addTask(Task task) {
      Preconditions.checkState(task.target != this,
          "Source and target are the same storage group " + getDisplayName());
      incScheduledSize(task.size);
      tasks.add(task);
    }

    /** @return an iterator to this source's blocks */
    Iterator<DBlock> getBlockIterator() {
      return srcBlocks.iterator();
    }

    /**
     * Fetch new blocks of this source from namenode and update this source's
     * block list & {@link Dispatcher#globalBlocks}.
     * 
     * @return the total size of the received blocks in the number of bytes.
     */
    private long getBlockList() throws IOException {
      final long size = Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive);
      final BlocksWithLocations newBlocks = nnc.getBlocks(getDatanodeInfo(), size);

      long bytesReceived = 0;
      for (BlockWithLocations blk : newBlocks.getBlocks()) {
        bytesReceived += blk.getBlock().getNumBytes();
        synchronized (globalBlocks) {
          final DBlock block = globalBlocks.get(blk.getBlock());
          synchronized (block) {
            block.clearLocations();

            // update locations
            final String[] datanodeUuids = blk.getDatanodeUuids();
            final StorageType[] storageTypes = blk.getStorageTypes();
            for (int i = 0; i < datanodeUuids.length; i++) {
              final StorageGroup g = storageGroupMap.get(
                  datanodeUuids[i], storageTypes[i]);
              if (g != null) { // not unknown
                block.addLocation(g);
              }
            }
          }
          if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) {
            // filter bad candidates
            srcBlocks.add(block);
          }
        }
      }
      return bytesReceived;
    }

    /** Decide if the given block is a good candidate to move or not */
    private boolean isGoodBlockCandidate(DBlock block) {
      for (Task t : tasks) {
        if (Dispatcher.this.isGoodBlockCandidate(this, t.target, block)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Choose a move for the source. The block's source, target, and proxy
     * are determined too. When choosing proxy and target, source &
     * target throttling has been considered. They are chosen only when they
     * have the capacity to support this block move. The block should be
     * dispatched immediately after this method is returned.
     * 
     * @return a move that's good for the source to dispatch immediately.
     */
    private PendingMove chooseNextMove() {
      for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {
        final Task task = i.next();
        final DDatanode target = task.target.getDDatanode();
        PendingMove pendingBlock = new PendingMove();
        if (target.addPendingBlock(pendingBlock)) {
          // target is not busy, so do a tentative block allocation
          pendingBlock.source = this;
          pendingBlock.target = task.target;
          if (pendingBlock.chooseBlockAndProxy()) {
            long blockSize = pendingBlock.block.getNumBytes();
            incScheduledSize(-blockSize);
            task.size -= blockSize;
            if (task.size == 0) {
              i.remove();
            }
            return pendingBlock;
          } else {
            // cancel the tentative move
            target.removePendingBlock(pendingBlock);
          }
        }
      }
      return null;
    }

    /** Iterate all source's blocks to remove moved ones */
    private void removeMovedBlocks() {
      for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) {
        if (movedBlocks.contains(i.next().getBlock())) {
          i.remove();
        }
      }
    }

    private static final int SOURCE_BLOCKS_MIN_SIZE = 5;

    /** @return if should fetch more blocks from namenode */
    private boolean shouldFetchMoreBlocks() {
      return srcBlocks.size() < SOURCE_BLOCKS_MIN_SIZE && blocksToReceive > 0;
    }

    private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins

    /**
     * This method iteratively does the following: it first selects a block to
     * move, then sends a request to the proxy source to start the block move
     * when the source's block list falls below a threshold, it asks the
     * namenode for more blocks. It terminates when it has dispatch enough block
     * move tasks or it has received enough blocks from the namenode, or the
     * elapsed time of the iteration has exceeded the max time limit.
     */
    private void dispatchBlocks() {
      final long startTime = Time.monotonicNow();
      this.blocksToReceive = 2 * getScheduledSize();
      boolean isTimeUp = false;
      int noPendingMoveIteration = 0;
      while (!isTimeUp && getScheduledSize() > 0
          && (!srcBlocks.isEmpty() || blocksToReceive > 0)) {
        final PendingMove p = chooseNextMove();
        if (p != null) {
          // move the block
          moveExecutor.execute(new Runnable() {
            @Override
            public void run() {
              p.dispatch();
            }
          });
          continue;
        }

        // Since we cannot schedule any block to move,
        // remove any moved blocks from the source block list and
        removeMovedBlocks(); // filter already moved blocks
        // check if we should fetch more blocks from the namenode
        if (shouldFetchMoreBlocks()) {
          // fetch new blocks
          try {
            blocksToReceive -= getBlockList();
            continue;
          } catch (IOException e) {
            LOG.warn("Exception while getting block list", e);
            return;
          }
        } else {
          // source node cannot find a pending block to move, iteration +1
          noPendingMoveIteration++;
          // in case no blocks can be moved for source node's task,
          // jump out of while-loop after 5 iterations.
          if (noPendingMoveIteration >= MAX_NO_PENDING_MOVE_ITERATIONS) {
            resetScheduledSize();
          }
        }

        // check if time is up or not
        if (Time.monotonicNow() - startTime > MAX_ITERATION_TIME) {
          isTimeUp = true;
          continue;
        }

        // Now we can not schedule any block to move and there are
        // no new blocks added to the source block list, so we wait.
        try {
          synchronized (Dispatcher.this) {
            Dispatcher.this.wait(1000); // wait for targets/sources to be idle
          }
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,
      Set<String> excludedNodes, long movedWinWidth, int moverThreads,
      int dispatcherThreads, int maxConcurrentMovesPerNode, Configuration conf) {
    this.nnc = nnc;
    this.excludedNodes = excludedNodes;
    this.includedNodes = includedNodes;
    this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth);

    this.cluster = NetworkTopology.getInstance(conf);

    this.moveExecutor = Executors.newFixedThreadPool(moverThreads);
    this.dispatchExecutor = Executors.newFixedThreadPool(dispatcherThreads);
    this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;

    final boolean fallbackToSimpleAuthAllowed = conf.getBoolean(
        CommonConfigurationKeys.IPC_CLIENT_FALLBACK_TO_SIMPLE_AUTH_ALLOWED_KEY,
        CommonConfigurationKeys.IPC_CLIENT_FALLBACK_TO_SIMPLE_AUTH_ALLOWED_DEFAULT);
    this.saslClient = new SaslDataTransferClient(
        DataTransferSaslUtil.getSaslPropertiesResolver(conf),
        TrustedChannelResolver.getInstance(conf), fallbackToSimpleAuthAllowed);
  }

  StorageGroupMap getStorageGroupMap() {
    return storageGroupMap;
  }

  NetworkTopology getCluster() {
    return cluster;
  }
  
  long getBytesMoved() {
    return bytesMoved.get();
  }

  long bytesToMove() {
    Preconditions.checkState(
        storageGroupMap.size() >= sources.size() + targets.size(),
        "Mismatched number of storage groups (" + storageGroupMap.size()
            + " < " + sources.size() + " sources + " + targets.size()
            + " targets)");

    long b = 0L;
    for (Source src : sources) {
      b += src.getScheduledSize();
    }
    return b;
  }

  void add(Source source, StorageGroup target) {
    sources.add(source);
    targets.add(target);
  }

  private boolean shouldIgnore(DatanodeInfo dn) {
    // ignore decommissioned nodes
    final boolean decommissioned = dn.isDecommissioned();
    // ignore decommissioning nodes
    final boolean decommissioning = dn.isDecommissionInProgress();
    // ignore nodes in exclude list
    final boolean excluded = Util.isExcluded(excludedNodes, dn);
    // ignore nodes not in the include list (if include list is not empty)
    final boolean notIncluded = !Util.isIncluded(includedNodes, dn);

    if (decommissioned || decommissioning || excluded || notIncluded) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Excluding datanode " + dn + ": " + decommissioned + ", "
            + decommissioning + ", " + excluded + ", " + notIncluded);
      }
      return true;
    }
    return false;
  }

  /** Get live datanode storage reports and then build the network topology. */
  List<DatanodeStorageReport> init() throws IOException {
    final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();
    final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>(); 
    // create network topology and classify utilization collections:
    // over-utilized, above-average, below-average and under-utilized.
    for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) {
      final DatanodeInfo datanode = r.getDatanodeInfo();
      if (shouldIgnore(datanode)) {
        continue;
      }
      trimmed.add(r);
      cluster.add(datanode);
    }
    return trimmed;
  }

  public DDatanode newDatanode(DatanodeStorageReport r) {
    return new DDatanode(r, maxConcurrentMovesPerNode);
  }

  public boolean dispatchAndCheckContinue() throws InterruptedException {
    return nnc.shouldContinue(dispatchBlockMoves());
  }

  /**
   * Dispatch block moves for each source. The thread selects blocks to move &
   * sends request to proxy source to initiate block move. The process is flow
   * controlled. Block selection is blocked if there are too many un-confirmed
   * block moves.
   * 
   * @return the total number of bytes successfully moved in this iteration.
   */
  private long dispatchBlockMoves() throws InterruptedException {
    final long bytesLastMoved = bytesMoved.get();
    final Future<?

>[] futures = new Future<?

>[sources.size()]; final Iterator<Source> i = sources.iterator(); for (int j = 0; j < futures.length; j++) { final Source s = i.next(); futures[j] = dispatchExecutor.submit(new Runnable() { @Override public void run() { s.dispatchBlocks(); } }); } // wait for all dispatcher threads to finish for (Future<?> future : futures) { try { future.get(); } catch (ExecutionException e) { LOG.warn("Dispatcher thread failed", e.getCause()); } } // wait for all block moving to be done waitForMoveCompletion(); return bytesMoved.get() - bytesLastMoved; } /** The sleeping period before checking if block move is completed again */ static private long blockMoveWaitTime = 30000L; /** set the sleeping period for block move completion check */ static void setBlockMoveWaitTime(long time) { blockMoveWaitTime = time; } /** Wait for all block move confirmations. */ private void waitForMoveCompletion() { for(;;) { boolean empty = true; for (StorageGroup t : targets) { if (!t.getDDatanode().isPendingQEmpty()) { empty = false; break; } } if (empty) { return; //all pending queues are empty } try { Thread.sleep(blockMoveWaitTime); } catch (InterruptedException ignored) { } } } /** * Decide if the block is a good candidate to be moved from source to target. * A block is a good candidate if * 1. the block is not in the process of being moved/has not been moved; * 移动的块不是正在被移动的块 * 2. the block does not have a replica on the target; * 在目标节点上没有移动的block块 * 3. doing the move does not reduce the number of racks that the block has * 移动之后,不同机架上的block块的数量应该是不变的. */ private boolean isGoodBlockCandidate(Source source, StorageGroup target, DBlock block) { if (source.storageType != target.storageType) { return false; } // check if the block is moved or not //假设所要移动的块是存在于正在被移动的块列表,则返回false if (movedBlocks.contains(block.getBlock())) { return false; } //假设移动的块已经存在于目标节点上,则返回false,将不会予以移动 if (block.isLocatedOn(target)) { return false; } //假设开启了机架感知的配置,则目标节点不应该有同样的block if (cluster.isNodeGroupAware() && isOnSameNodeGroupWithReplicas(target, block, source)) { return false; } //须要维持机架上的block块数量不变 if (reduceNumOfRacks(source, target, block)) { return false; } return true; } /** * Determine whether moving the given block replica from source to target * would reduce the number of racks of the block replicas. */ private boolean reduceNumOfRacks(Source source, StorageGroup target, DBlock block) { final DatanodeInfo sourceDn = source.getDatanodeInfo(); if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) { // source and target are on the same rack return false; } boolean notOnSameRack = true; synchronized (block) { for (StorageGroup loc : block.getLocations()) { if (cluster.isOnSameRack(loc.getDatanodeInfo(), target.getDatanodeInfo())) { notOnSameRack = false; break; } } } if (notOnSameRack) { // target is not on the same rack as any replica return false; } for (StorageGroup g : block.getLocations()) { if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) { // source is on the same rack of another replica return false; } } return true; } /** * Check if there are any replica (other than source) on the same node group * with target. If true, then target is not a good candidate for placing * specific replica as we don't want 2 replicas under the same nodegroup. * * @return true if there are any replica (other than source) on the same node * group with target */ private boolean isOnSameNodeGroupWithReplicas( StorageGroup target, DBlock block, Source source) { final DatanodeInfo targetDn = target.getDatanodeInfo(); for (StorageGroup g : block.getLocations()) { if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) { return true; } } return false; } /** Reset all fields in order to prepare for the next iteration */ void reset(Configuration conf) { cluster = NetworkTopology.getInstance(conf); storageGroupMap.clear(); sources.clear(); targets.clear(); globalBlocks.removeAllButRetain(movedBlocks); movedBlocks.cleanup(); } /** shutdown thread pools */ void shutdownNow() { dispatchExecutor.shutdownNow(); moveExecutor.shutdownNow(); } static class Util { /** @return true if data node is part of the excludedNodes. */ static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) { return isIn(excludedNodes, dn); } /** * @return true if includedNodes is empty or data node is part of the * includedNodes. */ static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) { return (includedNodes.isEmpty() || isIn(includedNodes, dn)); } /** * Match is checked using host name , ip address with and without port * number. * * @return true if the datanode's transfer address matches the set of nodes. */ private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) { return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort()) || isIn(datanodes, dn.getIpAddr(), dn.getXferPort()) || isIn(datanodes, dn.getHostName(), dn.getXferPort()); } /** @return true if nodes contains host or host:port */ private static boolean isIn(Set<String> nodes, String host, int port) { if (host == null) { return false; } return (nodes.contains(host) || nodes.contains(host + ":" + port)); } /** * Parse a comma separated string to obtain set of host names * * @return set of host names */ static Set<String> parseHostList(String string) { String[] addrs = StringUtils.getTrimmedStrings(string); return new HashSet<String>(Arrays.asList(addrs)); } /** * Read set of host names from a file * * @return set of host names */ static Set<String> getHostListFromFile(String fileName, String type) { Set<String> nodes = new HashSet<String>(); try { HostsFileReader.readFileToSet(type, fileName, nodes); return StringUtils.getTrimmedStrings(nodes); } catch (IOException e) { throw new IllegalArgumentException( "Failed to read host list from file: " + fileName); } } } }