Loading

[25] Zookeeper-源码2

1. ZkServer 选举源码

1.1 选举准备

QuorumPeer

/**
 * Boots this quorum peer: verifies that its own id is part of the peer
 * view, loads the database, starts the server connection factory and the
 * admin server, prepares the election environment and finally delegates to
 * {@code super.start()}.
 *
 * @throws RuntimeException if {@code myid} is not a key of {@code getView()}
 */
@Override
public synchronized void start() {
    final boolean listedInView = getView().containsKey(myid);
    if (!listedInView) {
        throw new RuntimeException("My id " + myid + " not in the peer list");
    }

    loadDataBase();
    startServerCnxnFactory();

    try {
        adminServer.start();
    } catch (AdminServerException adminFailure) {
        // A failing admin server is reported but does not abort start-up.
        LOG.warn("Problem starting AdminServer", adminFailure);
        System.out.println(adminFailure);
    }

    // =====> Prepare the election environment before the peer starts running.
    startLeaderElection();
    super.start();
}

/**
 * Prepares the election environment: when this peer is in the LOOKING
 * state it creates its initial vote, and it then creates the election
 * algorithm instance selected by {@code electionType}.
 * Part of the original method body is elided in this excerpt.
 */
synchronized public void startLeaderElection() {
  if (getPeerState() == ServerState.LOOKING) {
    // =====> Create the ballot.
    // 1. Ballot components: epoch (the leader's term number), zxid (id of a
    //    transaction executed during a leader's term), myid (serverId).
    // 2. When voting starts, every server first votes for itself.
    currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
  }

  // ...

  // =====> Create the election algorithm instance.
  this.electionAlg = createElectionAlgorithm(electionType);
}

/**
 * Builds the election implementation that corresponds to the numeric
 * algorithm selector.
 *
 * @param electionAlgorithm selector: 0 = LeaderElection, 1 and 2 =
 *        AuthFastLeaderElection, 3 = FastLeaderElection
 * @return the created election object, or {@code null} when the selector
 *         is unknown or (for selector 3) the connection manager has no
 *         listener
 */
protected Election createElectionAlgorithm(int electionAlgorithm){
    //TODO: use a factory rather than a switch
    switch (electionAlgorithm) {
    case 0:
        return new LeaderElection(this);
    case 1:
        return new AuthFastLeaderElection(this);
    case 2:
        return new AuthFastLeaderElection(this, true);
    case 3: {
        // =====> (1) QuorumCnxManager handles all network communication
        //            during the election.
        QuorumCnxManager freshManager = createCnxnManager();
        QuorumCnxManager staleManager = qcmRef.getAndSet(freshManager);
        if (staleManager != null) {
            LOG.warn("Clobbering already-set QuorumCnxManager (restarting leader election?)");
            staleManager.halt();
        }

        QuorumCnxManager.Listener connListener = freshManager.listener;
        if (connListener == null) {
            LOG.error("Null listener when initializing cnx manager");
            return null;
        }

        // =====> (2) Start the listener thread.
        connListener.start();
        // =====> (3) Get ready to start the election.
        FastLeaderElection fastElection = new FastLeaderElection(this, freshManager);
        fastElection.start();
        return fastElection;
    }
    default:
        assert false;
        return null;
    }
}

a. 网络组件初始化

/**
 * Creates the connection manager used for election traffic, wiring in this
 * peer's id, view, auth components and connection settings. The socket
 * timeout passed along is {@code tickTime * syncLimit}.
 *
 * @return a new QuorumCnxManager bound to this peer
 */
public QuorumCnxManager createCnxnManager() {
    return new QuorumCnxManager(
            this, getId(), getView(),
            authServer, authLearner,
            tickTime * syncLimit,
            getQuorumListenOnAllIPs(),
            quorumCnxnThreadsSize,
            isQuorumSaslAuthEnabled());
}

QuorumCnxManager

/**
 * Initialises the connection manager: stores the peer identity and
 * connection settings, allocates the receive queue and the per-peer maps,
 * applies the optional {@code zookeeper.cnxTimeout} system property,
 * initialises authentication and creates (but does not start) the listener
 * thread.
 *
 * @throws NumberFormatException if {@code zookeeper.cnxTimeout} is set but
 *         is not a parsable integer
 */
public QuorumCnxManager(QuorumPeer self,
                        final long mySid,
                        Map<Long,QuorumPeer.QuorumServer> view,
                        QuorumAuthServer authServer,
                        QuorumAuthLearner authLearner,
                        int socketTimeout,
                        boolean listenOnAllIPs,
                        int quorumCnxnThreadsSize,
                        boolean quorumSaslAuthEnabled) {
    // Identity and settings supplied by the caller.
    this.self = self;
    this.mySid = mySid;
    this.view = view;
    this.socketTimeout = socketTimeout;
    this.listenOnAllIPs = listenOnAllIPs;

    // =====> The various queues and per-peer maps.
    this.recvQueue = new ArrayBlockingQueue<Message>(RECV_CAPACITY);
    this.queueSendMap = new ConcurrentHashMap<Long, ArrayBlockingQueue<ByteBuffer>>();
    this.senderWorkerMap = new ConcurrentHashMap<Long, SendWorker>();
    this.lastMessageSent = new ConcurrentHashMap<Long, ByteBuffer>();

    // Optional override of the connect timeout.
    final String timeoutOverride = System.getProperty("zookeeper.cnxTimeout");
    if (timeoutOverride != null) {
        this.cnxTO = Integer.parseInt(timeoutOverride);
    }

    initializeAuth(mySid, authServer, authLearner, quorumCnxnThreadsSize,
            quorumSaslAuthEnabled);

    // Listener thread that waits for connection requests; created here,
    // started by the caller.
    listener = new Listener();
    listener.setName("QuorumPeerListener");
}

b. 监听线程初始化

点击 QuorumCnxManager.Listener,找到对应的 run 方法:

/**
 * Listener thread body: creates the election server socket, binds it and
 * accepts incoming peer connections until shutdown is requested. The outer
 * loop repeats while the manager is not shut down and either
 * {@code portBindMaxRetry} is 0 or {@code numRetries} is still below it.
 * Parts of the original body (catch handlers and the tail of the method)
 * are elided in this excerpt.
 */
@Override
public void run() {
  int numRetries = 0;
  InetSocketAddress addr;
  Socket client = null;
  Exception exitException = null;
  while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
    try {
        // Pick the server socket flavour: port unification, TLS-only, or plain.
        if (self.shouldUsePortUnification()) {
            LOG.info("Creating TLS-enabled quorum server socket");
            ss = new UnifiedServerSocket(self.getX509Util(), true);
        } else if (self.isSslQuorum()) {
            LOG.info("Creating TLS-only quorum server socket");
            ss = new UnifiedServerSocket(self.getX509Util(), false);
        } else {
            ss = new ServerSocket();
        }

        ss.setReuseAddress(true);

        if (self.getQuorumListenOnAllIPs()) {
            // Bind to the election port on the wildcard address.
            int port = self.getElectionAddress().getPort();
            addr = new InetSocketAddress(port);
        } else {
            // Resolve hostname for this server in case the
            // underlying ip address has changed.
            self.recreateSocketAddresses(self.getId());
            addr = self.getElectionAddress();
        }
        LOG.info("My election bind port: " + addr.toString());
        setName(addr.toString());
        // =====> Bind the server address.
        ss.bind(addr);
        while (!shutdown) {
          try {
            // =====> Block, waiting for a connection request to handle.
            client = ss.accept();
            setSockOpts(client);
            LOG.info("Received connection request "
                    + formatInetAddr((InetSocketAddress)client.getRemoteSocketAddress()));
            // Receive and handle the connection request
            // asynchronously if the quorum sasl authentication is
            // enabled. This is required because sasl server
            // authentication process may take few seconds to finish,
            // this may delay next peer connection requests.
            if (quorumSaslAuthEnabled) {
                receiveConnectionAsync(client);
            } else {
                receiveConnection(client);
            }
            // A handled connection resets the retry counter.
            numRetries = 0;
          } catch (SocketTimeoutException e) {
            // ...
          }
        }
    } catch (IOException e) {
        // ...
        closeSocket(client);
    }
  }

  // ...

}

c. 选举准备

FastLeaderElection

/**
 * Creates a fast leader election bound to the given peer and connection
 * manager; the remaining initialisation is delegated to {@code starter}.
 *
 * @param self    the quorum peer that owns this election
 * @param manager the connection manager used for election messages
 */
public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager){
    this.manager = manager;
    this.stop = false;
    starter(self, manager);
}

/**
 * Resets the proposal to "none" (-1), allocates the send and receive
 * queues and creates the messenger on top of the connection manager.
 *
 * @param self    the quorum peer that owns this election
 * @param manager the connection manager handed to the messenger
 */
private void starter(QuorumPeer self, QuorumCnxManager manager) {
    this.self = self;

    // No proposal yet.
    proposedZxid = -1;
    proposedLeader = -1;

    // =====> Initialise the queues and the messenger.
    recvqueue = new LinkedBlockingQueue<Notification>();
    sendqueue = new LinkedBlockingQueue<ToSend>();
    this.messenger = new Messenger(manager);
}

1.2 选举执行

(0)QuorumPeer

/**
 * Boots this quorum peer and kicks off the election: after the peer-list
 * check it loads the database, starts the server connection factory and
 * the admin server, prepares the election environment and then calls
 * {@code super.start()}.
 *
 * @throws RuntimeException if {@code myid} is not a key of {@code getView()}
 */
public synchronized void start() {
    final boolean listedInView = getView().containsKey(myid);
    if (!listedInView) {
        throw new RuntimeException("My id " + myid + " not in the peer list");
    }

    // Cold-start data recovery.
    loadDataBase();
    // Start the server connection factory.
    startServerCnxnFactory();

    try {
        adminServer.start();
    } catch (AdminServerException adminFailure) {
        // A failing admin server is reported but does not abort start-up.
        LOG.warn("Problem starting AdminServer", adminFailure);
        System.out.println(adminFailure);
    }

    // Prepare the election environment.
    startLeaderElection();
    // =====> Run the election.
    super.start();
}

(1)执行 super.start(); 就相当于执行 QuorumPeer 中的 run() 方法

/**
 * Main loop of the quorum peer. While {@code running}, it dispatches on
 * {@code getPeerState()}: in LOOKING it runs a leader election and stores
 * the resulting vote; in OBSERVING, FOLLOWING and LEADING it creates the
 * matching role object, runs it, and on exit shuts it down and calls
 * {@code updateServerState()}. When the loop ends the JMX beans are
 * unregistered. Part of the original body is elided in this excerpt.
 */
@Override
public void run() {
  updateThreadName();

  // ...

  try {
    /*
     * [Main loop] After ZooKeeper starts, every server is first in the
     * LOOKING state; the election makes one server the Leader and the
     * other servers Followers.
     */
    while (running) {
      switch (getPeerState()) {
      case LOOKING:
        LOG.info("LOOKING");

        if (Boolean.getBoolean("readonlymode.enabled")) {
          LOG.info("Attempting to start ReadOnlyZooKeeperServer");

          // Create read-only server but don't start it immediately
          final ReadOnlyZooKeeperServer roZk =
                        new ReadOnlyZooKeeperServer(logFactory, this, this.zkDb);

          // Instead of starting roZk immediately, wait some grace
          // period before we decide we're partitioned.
          //
          // Thread is used here because otherwise it would require
          // changes in each of election strategy classes which is
          // unnecessary code coupling.
          Thread roZkMgr = new Thread() {
            public void run() {
              try {
                // lower-bound grace period to 2 secs
                sleep(Math.max(2000, tickTime));
                // Only start the read-only server if we are still looking.
                if (ServerState.LOOKING.equals(getPeerState())) {
                  roZk.startup();
                }
              } catch (InterruptedException e) {
                LOG.info(...);
              } catch (Exception e) {
                LOG.error("FAILED to start ReadOnlyZooKeeperServer", e);
              }
            }
          };
          try {
            roZkMgr.start();
            reconfigFlagClear();
            if (shuttingDownLE) {
              shuttingDownLE = false;
              startLeaderElection();
            }
            // =====> Run the election (lookForLeader); when it ends it returns
            //        the winning vote, i.e. the vote for the elected Leader.
            setCurrentVote(makeLEStrategy().lookForLeader());
          } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
            setPeerState(ServerState.LOOKING);
          } finally {
            // If the thread is in the the grace period, interrupt
            // to come out of waiting.
            roZkMgr.interrupt();
            roZk.shutdown();
          }
        } else {
          try {
             reconfigFlagClear();
             if (shuttingDownLE) {
               shuttingDownLE = false;
               startLeaderElection();
             }
             setCurrentVote(makeLEStrategy().lookForLeader());
          } catch (Exception e) {
            LOG.warn("Unexpected exception", e);
            setPeerState(ServerState.LOOKING);
          }
        }
        break;
      case OBSERVING:
        try {
          LOG.info("OBSERVING");
          setObserver(makeObserver(logFactory));
          observer.observeLeader();
        } catch (Exception e) {
          LOG.warn("Unexpected exception",e );
        } finally {
          observer.shutdown();
          setObserver(null);
          updateServerState();
        }
        break;
      case FOLLOWING:
        try {
          LOG.info("FOLLOWING");
          setFollower(makeFollower(logFactory));
          follower.followLeader();
        } catch (Exception e) {
           LOG.warn("Unexpected exception",e);
        } finally {
           follower.shutdown();
           setFollower(null);
           updateServerState();
        }
        break;
      case LEADING:
        LOG.info("LEADING");
        try {
          setLeader(makeLeader(logFactory));
          leader.lead();
          setLeader(null);
        } catch (Exception e) {
          LOG.warn("Unexpected exception",e);
        } finally {
          // leader is still set only if lead() exited with an exception.
          if (leader != null) {
            leader.shutdown("Forcing shutdown");
            setLeader(null);
          }
          updateServerState();
        }
        break;
      }
      start_fle = Time.currentElapsedTime();
    }
  } finally {
    LOG.warn("QuorumPeer main thread exited");
    // Unregister every JMX bean this peer registered and drop the references.
    MBeanRegistry instance = MBeanRegistry.getInstance();
    instance.unregister(jmxQuorumBean);
    instance.unregister(jmxLocalPeerBean);

    for (RemotePeerBean remotePeerBean : jmxRemotePeerBean.values()) {
      instance.unregister(remotePeerBean);
    }

    jmxQuorumBean = null;
    jmxLocalPeerBean = null;
    jmxRemotePeerBean = null;
  }
}

(2)[FastLeaderElection] lookForLeader

/**
 * Starts a new round of leader election. Whenever our QuorumPeer
 * changes its state to LOOKING, this method is invoked, and it
 * sends notifications to all other peers.
 *
 * Parts of the original body are elided in this excerpt.
 *
 * @return the final vote once a leader has been determined, or
 *         {@code null} if the loop ends because the peer left the LOOKING
 *         state or the election was stopped
 * @throws InterruptedException if interrupted while waiting on the
 *         notification queue
 */
public Vote lookForLeader() throws InterruptedException {

  // ...

  try {
    // =====> During a normal start-up every other server sends me a vote.
    // =====> Holds the latest valid vote received from each server.
    HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();

    // =====> Holds votes that come from outside the current election
    //        (from servers that are already FOLLOWING or LEADING).
    HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();

    // =====> Maximum wait for one poll of the election
    //        (the original note gives the default as 0.2s).
    int notTimeout = finalizeWait;

    // =====> logicalclock is incremented for every election round that is
    //        started; it stands in for the epoch until valid epoch data exists.
    synchronized (this) {
      // =====> Advance the logical clock; every election needs a new value.
      logicalclock.incrementAndGet();
      // =====> Update our own proposal (serverid, zxid, epoch).
      updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
    }

    LOG.info("New election. My id =  " + self.getId()
                 + ", proposed zxid=0x" + Long.toHexString(proposedZxid));
    // =====> Broadcast the vote: send our own ballot to the other servers.
    sendNotifications();

    /*
     * Loop in which we exchange notifications until we find a leader
     */

    // =====> Keep running election rounds until one succeeds.
    while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
      /*
       * Remove next notification from queue, times out after 2 times
       * the termination time
       */
      Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);

      /*
       * Sends more notifications if haven't received enough.
       * Otherwise processes new notification.
       */
      if (n == null) {
        if (manager.haveDelivered()) {
          sendNotifications();
        } else {
          manager.connectAll();
        }

        /*
         * Exponential backoff
         */
        int tmpTimeOut = notTimeout*2;
        notTimeout = (tmpTimeOut < maxNotificationInterval?
                            tmpTimeOut : maxNotificationInterval);
        LOG.info("Notification time out: " + notTimeout);
      } else if (validVoter(n.sid) && validVoter(n.leader)) {
        /*
         * Only proceed if the vote comes from a replica in the current or next
         * voting view for a replica in the current or next voting view.
         */
        switch (n.state) {
        case LOOKING:
          // If notification > current, replace and send messages out
          if (n.electionEpoch > logicalclock.get()) {
            // The sender is in a newer round: adopt its round and discard
            // the votes collected so far.
            logicalclock.set(n.electionEpoch);
            recvset.clear();
            if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
              updateProposal(n.leader, n.zxid, n.peerEpoch);
            } else {
              updateProposal(getInitId(),
                  getInitLastLoggedZxid(),
                  getPeerEpoch());
            }
            sendNotifications();
          } else if (n.electionEpoch < logicalclock.get()) {
            // The notification belongs to an older round: ignore it.
            if(LOG.isDebugEnabled()){
              LOG.debug(...);
            }
            break;
          } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
              proposedLeader, proposedZxid, proposedEpoch)) {
            // Same round and the received vote wins the comparison:
            // switch our proposal to it and re-broadcast.
            updateProposal(n.leader, n.zxid, n.peerEpoch);
            sendNotifications();
          }

          if(LOG.isDebugEnabled()){
            LOG.debug("Adding vote: from=" + n.sid +
                ", proposed leader=" + n.leader +
                ", proposed zxid=0x" + Long.toHexString(n.zxid) +
                ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
          }

          // don't care about the version if it's in LOOKING state
          recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

          if (termPredicate(recvset,
              new Vote(proposedLeader, proposedZxid,
                  logicalclock.get(), proposedEpoch))) {

            // Verify if there is any change in the proposed leader
            while((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null){
              if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                  proposedLeader, proposedZxid, proposedEpoch)){
                // A better vote arrived: put it back so the outer loop
                // processes it, and do not finish yet.
                recvqueue.put(n);
                break;
              }
            }

            /*
             * This predicate is true once we don't read any new
             * relevant message from the reception queue
             */
            if (n == null) {
              self.setPeerState((proposedLeader == self.getId()) ?
                  ServerState.LEADING: learningState());
              Vote endVote = new Vote(proposedLeader,
                  proposedZxid, logicalclock.get(), proposedEpoch);
              leaveInstance(endVote);
              return endVote;
            }
          }
          break;
        case OBSERVING:
          LOG.debug("Notification from observer: " + n.sid);
          break;
        case FOLLOWING:
        case LEADING:
          /*
           * Consider all notifications from the same epoch
           * together.
           */
          if (n.electionEpoch == logicalclock.get()) {
            recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
            if (termPredicate(recvset,
                new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state))
                && checkLeader(outofelection, n.leader, n.electionEpoch)) {
              self.setPeerState((n.leader == self.getId()) ?
                  ServerState.LEADING: learningState());
              Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
              leaveInstance(endVote);
              return endVote;
            }
          }

          /*
           * Before joining an established ensemble, verify that
           * a majority are following the same leader.
           */
          outofelection.put(n.sid, new Vote(n.version, n.leader,
                              n.zxid, n.electionEpoch, n.peerEpoch, n.state));
          if (termPredicate(outofelection, new Vote(n.version, n.leader,
              n.zxid, n.electionEpoch, n.peerEpoch, n.state))
              && checkLeader(outofelection, n.leader, n.electionEpoch)) {
            synchronized(this){
              logicalclock.set(n.electionEpoch);
              self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING: learningState());
            }
            Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
            leaveInstance(endVote);
            return endVote;
          }
          break;
        default:
          LOG.warn("Notification state unrecoginized: " + n.state
              + " (n.state), " + n.sid + " (n.sid)");
          break;
        }
      } else {
        // Either the sender or the proposed leader is not a valid voter.
        if (!validVoter(n.leader)) {
          LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
        }
        if (!validVoter(n.sid)) {
          LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
        }
      }
    }
    return null;
  } finally {
    // ...
  }
}

(3)[FastLeaderElection] 调用 sendNotifications 广播选票,把自己的选票发给其他服务器

/**
 * Send notifications to all peers upon a change in our vote.
 *
 * For every server id returned by
 * {@code self.getCurrentAndNextConfigVoters()} a LOOKING notification
 * carrying the current proposal (leader, zxid, logical clock, peer epoch)
 * and the serialized quorum verifier is built and offered to
 * {@code sendqueue}.
 */
private void sendNotifications() {
  // =====> Iterate over the voting participants and send a ballot to each.
  for (long sid : self.getCurrentAndNextConfigVoters()) {
    QuorumVerifier qv = self.getQuorumVerifier();
    // =====> Build the ballot message to send.
    ToSend notmsg = new ToSend(ToSend.mType.notification,
        proposedLeader,
        proposedZxid,
        logicalclock.get(),
        QuorumPeer.ServerState.LOOKING,
        sid,
        proposedEpoch, qv.toString().getBytes());
    if(LOG.isDebugEnabled()){
      // Exactly one '+' between operands: a doubled '+' would be parsed as
      // unary plus on a String, which does not compile.
      LOG.debug("Sending Notification: " + proposedLeader + " (n.leader), 0x"
          + Long.toHexString(proposedZxid) + " (n.zxid), 0x"
          + Long.toHexString(logicalclock.get())
          + " (n.round), " + sid + " (recipient), " + self.getId()
          + " (myid), 0x" + Long.toHexString(proposedEpoch) + " (n.peerEpoch)");
    }
    // =====> Put the ballot message on the send queue.
    sendqueue.offer(notmsg);
  }
}

(4)[FastLeaderElection] 中的 WorkerSender 线程

/**
 * This worker simply dequeues a message to send and queues it on the
 * manager's queue.
 */
class WorkerSender extends ZooKeeperThread {
  volatile boolean stop;
  QuorumCnxManager manager;

  WorkerSender(QuorumCnxManager manager){
    super("WorkerSender");
    this.manager = manager;
    this.stop = false;
  }

  /**
   * Polls {@code sendqueue} (3 second timeout per poll) until {@code stop}
   * is set or the thread is interrupted, handing every dequeued message
   * to {@link #process(ToSend)}.
   */
  public void run() {
    try {
      while (!stop) {
        // =====> Wait on the queue for the next ballot to send.
        ToSend pending = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
        if (pending != null) {
          // =====> Handle the ballot that has to be sent.
          process(pending);
        }
      }
    } catch (InterruptedException interrupted) {
      // An interrupt ends the worker loop.
    }
    LOG.info("WorkerSender is down");
  }

  /**
   * Called by run() once there is a new message to send.
   *
   * @param m     message to send
   */
  void process(ToSend m) {
    ByteBuffer wireMessage = buildMsg(
        m.state.ordinal(), m.leader, m.zxid,
        m.electionEpoch, m.peerEpoch, m.configData);

    // =====> Send the ballot through the connection manager.
    manager.toSend(m.sid, wireMessage);
  }
}

QuorumCnxManager 负责实际发送选票

/**
 * Processes invoke this message to queue a message to send. Currently,
 * only leader election uses it.
 *
 * @param sid destination server id
 * @param b   message to deliver
 */
public void toSend(Long sid, ByteBuffer b) {
  // Loopback: a message addressed to ourselves goes straight into our own
  // receive queue.
  if (this.mySid == sid) {
    b.position(0);
    addToRecvQueue(new Message(b.duplicate(), sid));
    return;
  }

  // Message for another server: register a send queue for it unless one
  // already exists, then enqueue the message on whichever queue is
  // registered. Each ZooKeeper instance has its own queue.
  ArrayBlockingQueue<ByteBuffer> candidate = new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY);
  ArrayBlockingQueue<ByteBuffer> registered = queueSendMap.putIfAbsent(sid, candidate);
  ArrayBlockingQueue<ByteBuffer> target = (registered != null) ? registered : candidate;
  addToSendQueue(target, b);

  // =====> Send the ballot out; starts a new connection if there is none yet.
  connectOne(sid);
}

(5)[QuorumCnxManager] 如果数据是发送给自己的,添加到自己的接收队列

/**
 * Adds a message to the receive queue while holding {@code recvQLock}.
 * When the queue has no remaining capacity its head element is removed
 * first to make room.
 *
 * @param msg message to enqueue
 */
public void addToRecvQueue(Message msg) {
  synchronized(recvQLock) {
    final boolean queueIsFull = recvQueue.remainingCapacity() == 0;
    if (queueIsFull) {
      try {
        recvQueue.remove();
      } catch (NoSuchElementException alreadyDrained) {
        // element could be removed by poll()
        LOG.debug("Trying to remove from an empty recvQueue. Ignoring exception " + alreadyDrained);
      }
    }

    try {
      recvQueue.add(msg);
    } catch (IllegalStateException rejected) {
      // This should never happen
      LOG.error("Unable to insert element in the recvQueue " + rejected);
    }
  }
}

(6)[QuorumCnxManager] 发送到其他节点的,则数据添加到发送队列

/**
 * Adds a message to a peer's send queue. When the queue has no remaining
 * capacity its head element is removed first to make room.
 *
 * @param queue  send queue of the destination server
 * @param buffer message to enqueue
 */
private void addToSendQueue(ArrayBlockingQueue<ByteBuffer> queue, ByteBuffer buffer) {
  final boolean queueIsFull = queue.remainingCapacity() == 0;
  if (queueIsFull) {
    try {
      queue.remove();
    } catch (NoSuchElementException alreadyDrained) {
      // element could be removed by poll()
      LOG.debug("Trying to remove from an empty Queue. Ignoring exception " + alreadyDrained);
    }
  }

  try {
    // =====> Append the outgoing message to the send queue.
    queue.add(buffer);
  } catch (IllegalStateException rejected) {
    // This should never happen
    LOG.error("Unable to insert an element in the queue " + rejected);
  }
}

(7)[QuorumCnxManager] 与要发送的服务器节点建立通信连接

/**
 * Try to establish a connection to server with id sid.
 *
 * The address from the last committed view is tried first. If that does
 * not yield a connection and a last-seen quorum verifier is available,
 * the address from its member list is tried as well. A warning is logged
 * when the id appears in neither view.
 *
 *  @param sid  server id
 */
synchronized void connectOne(long sid){
  if (senderWorkerMap.get(sid) != null) {
    LOG.debug("There is a connection already for server " + sid);
    return;
  }
  synchronized (self.QV_LOCK) {
    boolean knownId = false;
    // Resolve hostname for the remote server before attempting to
    // connect in case the underlying ip address has changed.
    self.recreateSocketAddresses(sid);
    Map<Long, QuorumPeer.QuorumServer> lastCommittedView = self.getView();
    QuorumVerifier lastSeenQV = self.getLastSeenQuorumVerifier();
    // Dereference the last-seen verifier only after the null check; calling
    // getAllMembers() unconditionally would make the later null test useless.
    Map<Long, QuorumPeer.QuorumServer> lastProposedView =
        (lastSeenQV != null) ? lastSeenQV.getAllMembers() : null;
    if (lastCommittedView.containsKey(sid)) {
      knownId = true;
      if (connectOne(sid, lastCommittedView.get(sid).electionAddr))
        return;
    }
    // NOTE(review): the two election addresses are compared by reference
    // (!=), not with equals(); kept as in the original.
    if (lastProposedView != null && lastProposedView.containsKey(sid)
                    && (!knownId || (lastProposedView.get(sid).electionAddr !=
                                lastCommittedView.get(sid).electionAddr))) {
      knownId = true;
      if (connectOne(sid, lastProposedView.get(sid).electionAddr))
        return;
    }
    if (!knownId) {
      LOG.warn("Invalid server id: " + sid);
    }
  }
}

/**
 * Try to establish a connection to server with id sid using its electionAddr.
 *
 * Returns immediately with {@code true} when a sender worker is already
 * registered for {@code sid}. Otherwise a socket (SSL or plain, depending
 * on {@code self.isSslQuorum()}) is connected with the {@code cnxTO}
 * timeout and handed to initiateConnection / initiateConnectionAsync.
 * Part of the original exception handling is elided in this excerpt.
 *
 *  @param sid  server id
 *  @param electionAddr election address of the server to connect to
 *  @return boolean success indication
 */
synchronized private boolean connectOne(long sid, InetSocketAddress electionAddr){
  if (senderWorkerMap.get(sid) != null) {
    LOG.debug("There is a connection already for server " + sid);
    return true;
  }

  Socket sock = null;
  try {
    LOG.debug("Opening channel to server " + sid);
    if (self.isSslQuorum()) {
       // TLS quorum: connect, then complete the handshake before use.
       SSLSocket sslSock = self.getX509Util().createSSLSocket();
       setSockOpts(sslSock);
       sslSock.connect(electionAddr, cnxTO);
       sslSock.startHandshake();
       sock = sslSock;
       LOG.info("SSL handshake complete with {}-{}-{}", sslSock.getRemoteSocketAddress(),
            sslSock.getSession().getProtocol(), sslSock.getSession().getCipherSuite());
     } else {
       sock = new Socket();
       setSockOpts(sock);
       sock.connect(electionAddr, cnxTO);
     }
     LOG.debug("Connected to server " + sid);
    // Sends connection request asynchronously if the quorum
    // sasl authentication is enabled. This is required because
    // sasl server authentication process may take few seconds to
    // finish, this may delay next peer connection requests.
    if (quorumSaslAuthEnabled) {
      initiateConnectionAsync(sock, sid);
    } else {
      // =====> Handle the connection.
      initiateConnection(sock, sid);
    }
    return true;
  } catch (UnresolvedAddressException e) {
    // ...
    closeSocket(sock);
    return false;
  }
}

/**
 * If this server has initiated the connection, then it gives up on the
 * connection if it loses challenge. Otherwise, it keeps the connection.
 *
 * An IOException raised while starting the connection is logged and the
 * socket is closed; it is not propagated to the caller.
 *
 * @param sock socket already connected to the remote server
 * @param sid  id of the remote server
 */
public void initiateConnection(final Socket sock, final Long sid) {
  try {
    // =====> Start the connection.
    startConnection(sock, sid);
  } catch (IOException connectFailure) {
    LOG.error("Exception while connecting, id: {}, addr: {}, closing learner connection",
        new Object[] { sid, sock.getRemoteSocketAddress() }, connectFailure);
    closeSocket(sock);
  }
}

(8)[QuorumCnxManager] 创建并启动发送器线程(SendWorker) 和接收器线程(RecvWorker)

/**
 * Performs the connection handshake on an outgoing socket: writes the
 * protocol version, this server's id and its election address,
 * authenticates as learner, and then either drops the connection (when
 * the remote id is larger than ours) or creates and starts the SendWorker
 * and RecvWorker threads for it.
 *
 * @param sock socket connected to the remote server
 * @param sid  id of the remote server
 * @return {@code true} if the worker threads were started; {@code false}
 *         if the handshake I/O failed or the connection was dropped
 * @throws IOException declared by the signature; handshake I/O errors are
 *         caught here, so presumably this comes from authentication
 */
private boolean startConnection(Socket sock, Long sid)
    throws IOException {
  DataInputStream peerIn;
  try {
    // Use BufferedOutputStream to reduce the number of IP packets. This is
    // important for x-DC scenarios.
    // =====> Data is sent to the server through the output stream.
    DataOutputStream peerOut =
        new DataOutputStream(new BufferedOutputStream(sock.getOutputStream()));

    // Sending id and challenge
    // represents protocol version (in other words - message type)
    peerOut.writeLong(PROTOCOL_VERSION);
    peerOut.writeLong(self.getId());
    byte[] addressBytes = formatInetAddr(self.getElectionAddress()).getBytes();
    peerOut.writeInt(addressBytes.length);
    peerOut.write(addressBytes);
    peerOut.flush();

    // =====> The ballots sent by the other side are read from the input stream.
    peerIn = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
  } catch (IOException e) {
    LOG.warn("Ignoring exception reading or writing challenge: ", e);
    closeSocket(sock);
    return false;
  }

  // authenticate learner
  QuorumPeer.QuorumServer remoteServer = self.getVotingView().get(sid);
  if (remoteServer != null) {
    // TODO - investigate why reconfig makes qps null.
    authLearner.authenticate(sock, remoteServer.hostname);
  }

  // If lost the challenge, then drop the new connection
  // =====> When the other side's id is larger than ours we are not entitled
  //        to initiate the connection, so our client socket is closed.
  if (sid > self.getId()) {
    LOG.info("Have smaller server identifier, so dropping the "
                + "connection: (" + sid + ", " + self.getId() + ")");
    closeSocket(sock);
    return false;
  }

  // Otherwise proceed with the connection
  // =====> Initialise the sender and the receiver.
  SendWorker sender = new SendWorker(sock, sid);
  RecvWorker receiver = new RecvWorker(sock, peerIn, sid, sender);
  sender.setRecv(receiver);

  // Finish any sender previously registered for this server id.
  SendWorker previousSender = senderWorkerMap.get(sid);
  if (previousSender != null) {
    previousSender.finish();
  }

  senderWorkerMap.put(sid, sender);
  queueSendMap.putIfAbsent(sid, new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY));

  // =====> Start the sender thread and the receiver thread.
  sender.start();
  receiver.start();

  return true;
}

(9)[QuorumCnxManager] 查找 SendWorker 下的 run() 方法

/**
 * Sender thread body: first re-sends the last message when nothing is
 * queued for this peer, then keeps polling the peer's send queue and
 * writing each message out until the worker is stopped, the manager shuts
 * down, the socket is gone, or the send queue for the peer is missing.
 */
@Override
public void run() {
  threadCnt.incrementAndGet();
  try {
    /**
     * If there is nothing in the queue to send, then we
     * send the lastMessage to ensure that the last message
     * was received by the peer. The message could be dropped
     * in case self or the peer shutdown their connection
     * (and exit the thread) prior to reading/processing
     * the last message. Duplicate messages are handled correctly
     * by the peer.
     *
     * If the send queue is non-empty, then we have a recent
     * message than that stored in lastMessage. To avoid sending
     * stale message, we should send the message in the send queue.
     */
    ArrayBlockingQueue<ByteBuffer> initialQueue = queueSendMap.get(sid);
    boolean nothingQueued = (initialQueue == null) || isSendQueueEmpty(initialQueue);
    if (nothingQueued) {
      ByteBuffer lastSent = lastMessageSent.get(sid);
      if (lastSent != null) {
        LOG.debug("Attempting to send lastMessage to sid=" + sid);
        send(lastSent);
      }
    }
  } catch (IOException e) {
    LOG.error("Failed to send last message. Shutting down thread.", e);
    this.finish();
  }

  try {
    // =====> As long as the connection has not been torn down.
    while (running && !shutdown && sock != null) {
      try {
        ArrayBlockingQueue<ByteBuffer> outbound = queueSendMap.get(sid);
        if (outbound == null) {
          LOG.error("No queue of incoming messages for server " + sid);
          break;
        }

        // =====> Keep taking messages from the send queue and sending them.
        ByteBuffer next = pollSendQueue(outbound, 1000, TimeUnit.MILLISECONDS);
        if (next != null) {
          // =====> Remember the most recent message for this server id.
          lastMessageSent.put(sid, next);
          // =====> Perform the send.
          send(next);
        }
      } catch (InterruptedException e) {
        LOG.warn("Interrupted while waiting for message on queue", e);
      }
    }
  } catch (Exception e) {
    LOG.warn("Exception when using channel: for id " + sid
                 + " my id = " + QuorumCnxManager.this.mySid
                 + " error = " + e);
  }
  this.finish();
  LOG.warn("Send worker leaving thread " + " id " + sid + " my id = " + self.getId());
}

/**
 * Writes one message to the peer: a 4-byte length followed by the
 * buffer's full content (positions 0 to capacity), then flushes.
 *
 * The bytes written are the ones copied out of the buffer with
 * {@code get}, not {@code b.array()}. The copy therefore serves a purpose,
 * and the method also works for buffers whose backing array is offset or
 * not accessible.
 *
 * @param b message to send; its position is reset to 0 and advanced to
 *          its capacity as a side effect
 * @throws IOException if writing to the output stream fails
 */
synchronized void send(ByteBuffer b) throws IOException {
    byte[] msgBytes = new byte[b.capacity()];
    try {
        b.position(0);
        b.get(msgBytes);
    } catch (BufferUnderflowException be) {
        // Fewer bytes than the capacity are readable: log and skip the message.
        LOG.error("BufferUnderflowException ", be);
        return;
    }
    // =====> Send through the output stream.
    dout.writeInt(msgBytes.length);
    dout.write(msgBytes);
    dout.flush();
}

(10)[QuorumCnxManager] 查找 RecvWorker 下的 run() 方法

/**
 * Receiver thread body: reads length-prefixed messages from the peer and
 * puts each one on the receive queue until the worker is stopped, the
 * manager shuts down, the socket is gone, or reading fails. On exit the
 * paired SendWorker is finished and the socket is closed.
 */
@Override
public void run() {
  threadCnt.incrementAndGet();
  try {
    // =====> As long as the connection has not been torn down.
    while (running && !shutdown && sock != null) {
      // The first int on the wire is the length of the message.
      final int frameLength = din.readInt();
      final boolean lengthIsValid = frameLength > 0 && frameLength <= PACKETMAXSIZE;
      if (!lengthIsValid) {
        throw new IOException("Received packet with invalid packet: " + frameLength);
      }

      // =====> Read the whole message from the input stream into a new array.
      final byte[] payload = new byte[frameLength];
      din.readFully(payload, 0, frameLength);

      // =====> Queue the ballot received from the other side on the
      //        receive queue.
      addToRecvQueue(new Message(ByteBuffer.wrap(payload).duplicate(), sid));
    }
  } catch (Exception e) {
    LOG.warn("Connection broken for id " + sid + ", my id = "
        + QuorumCnxManager.this.mySid + ", error = " , e);
  } finally {
    LOG.warn("Interrupting SendWorker");
    sw.finish();
    closeSocket(sock);
  }
}

(11)[FastLeaderElection] 中的 WorkerReceiver 线程

/**
 * Thread that drains election messages from the connection manager's receive
 * queue. The per-message handling is elided in this excerpt.
 */
class WorkerReceiver extends ZooKeeperThread  {
  // Loop-exit flag read by run(); volatile so a write from another thread is seen.
  volatile boolean stop;
  // Connection manager whose receive queue this thread polls.
  QuorumCnxManager manager;

  WorkerReceiver(QuorumCnxManager manager) {
    super("WorkerReceiver");
    this.stop = false;
    this.manager = manager;
  }

  /** Polls the receive queue until {@code stop} becomes true. */
  public void run() {

    Message response;
    while (!stop) {
      // Sleeps on receive
      try {
        // =====> Take an election vote message (sent by another server) from the RecvQueue
        response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
        // Nothing arrived within the 3000 ms poll window: re-check stop and poll again.
        if(response == null) continue;

        // ...

      } catch (InterruptedException e) {
        LOG.warn("Interrupted Exception while waiting for new message" +
            e.toString());
      }
    }
    LOG.info("WorkerReceiver is down");
  }
}

2. L/F 状态同步源码

2.1 流程图示

当选举结束后,每个节点都需要根据自己的角色更新自己的状态。选举出的 Leader 更新自己状态为 Leader,其他节点更新自己状态为 Follower。

(1)follower 必须要让 leader 知道自己的状态:epoch、zxid、sid;

  • 必须要找出谁是 leader;
  • 发起请求连接 leader;
  • 发送自己的信息给 leader;

当 leader 接收到信息,必须要返回对应的信息给 follower。

(2)当 leader 得知 follower 的状态后,就可以确定需要采用何种方式进行数据同步:DIFF、TRUNC、SNAP;

(3)执行数据同步;

  1. 【DIFF】咱俩一样,不需要做什么;
  2. 【TRUNC】 follower 的 zxid 比 leader 的 zxid 大,所以 follower 要回滚;
  3. 【COMMIT】 leader 的 zxid 比 follower 的 zxid 大,发送 Proposal 给 follower 提交执行;
  4. 【SNAP】若 follower 并没有任何数据,直接使用 SNAP 的方式来执行数据同步(直接把数据全部序列到 follower)。

(4)当 leader 接收到超过半数 follower 的 ack 之后,进入正常工作状态,集群启动完成了。

2.2 代码流程

a. QuorumPeer#run()

部分代码截取:

// Excerpt of the state switch in QuorumPeer#run(): what a peer does once it is
// in the FOLLOWING or LEADING state.
case FOLLOWING:
  try {
     LOG.info("FOLLOWING");
    setFollower(makeFollower(logFactory));
    // =====> Entry point for a Follower to update its state
    follower.followLeader();
  } catch (Exception e) {
     LOG.warn("Unexpected exception",e);
  } finally {
     // followLeader() returned or threw: tear the follower down and
     // recompute this peer's server state.
     follower.shutdown();
     setFollower(null);
     updateServerState();
  }
  break;
case LEADING:
  LOG.info("LEADING");
  try {
    setLeader(makeLeader(logFactory));
    // =====> Entry point for the Leader to update its state
    leader.lead();
    setLeader(null);
  } catch (Exception e) {
    LOG.warn("Unexpected exception",e);
  } finally {
    // Still non-null only when lead() did not return normally (the normal
    // path already cleared it above).
    if (leader != null) {
      leader.shutdown("Forcing shutdown");
      setLeader(null);
    }
    updateServerState();
  }
  break;
}
  • Leader 更新状态入口:leader.lead()
  • Follower 更新状态入口:follower.followLeader()

b. Leader、LearnerHandler

/**
 * This method is main function that is called to lead
 *
 * <p>The part shown here loads the data, starts the thread that accepts
 * learner connections, obtains the epoch to propose and prepares the
 * NEWLEADER proposal packet. The rest of the method is elided.
 *
 * @throws IOException
 * @throws InterruptedException
 */
void lead() throws IOException, InterruptedException {
  // ...

  try {
    self.tick.set(0);
    // =====> Restore the data into memory; it was in fact already loaded at startup
    zk.loadData();

    // Snapshot of this leader's own (current epoch, last processed zxid).
    leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());

    // Start thread that waits for connection requests from new followers.
    // =====> Wait for the other follower nodes to send their sync state to the leader
    cnxAcceptor = new LearnerCnxAcceptor();
    cnxAcceptor.start();

    // Obtain the epoch to propose, contributing this server's own accepted epoch.
    long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());

    // The new epoch starts at counter 0.
    zk.setZxid(ZxidUtils.makeZxid(epoch, 0));

    synchronized(this){
      lastProposed = zk.getZxid();
    }

    newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(), null, null);


    // Log the zxid when its low 32 bits are not zero.
    if ((newLeaderProposal.packet.getZxid() & 0xffffffffL) != 0) {
      LOG.info("NEWLEADER proposal has Zxid of "
          + Long.toHexString(newLeaderProposal.packet.getZxid()));
    }

    // ...

  } finally {
    zk.unregisterJMX(this);
  }
}

/**
 * Thread that accepts learner connections on the leader's server socket
 * {@code ss} and starts one {@code LearnerHandler} thread per accepted socket.
 */
class LearnerCnxAcceptor extends ZooKeeperCriticalThread {
  // Set by halt(); volatile so the accept loop sees the write from another thread.
  private volatile boolean stop = false;

  public LearnerCnxAcceptor() {
    super("LearnerCnxAcceptor-" + ss.getLocalSocketAddress(), zk.getZooKeeperServerListener());
  }

  /** Accept loop: runs until {@link #halt()} is called or an exception escapes the loop. */
  @Override
  public void run() {
    try {
      while (!stop) {
        Socket s = null;
        boolean error = false;
        try {

          // =====> Block until a follower asks to synchronize its state
          s = ss.accept();

          // start with the initLimit, once the ack is processed
          // in LearnerHandler switch to the syncLimit.
          s.setSoTimeout(self.tickTime * self.initLimit);
          s.setTcpNoDelay(nodelay);

          BufferedInputStream is = new BufferedInputStream(s.getInputStream());

          // =====> As soon as a follower request arrives, create a LearnerHandler to serve it
          LearnerHandler fh = new LearnerHandler(s, is, Leader.this);
          // =====> Start the handler thread
          fh.start();

        } catch (...) {
            // ...
        }
      }
    } catch (Exception e) {
      // Pass the throwable itself: the message has no "{}" placeholder, so an
      // extra String argument would be dropped and no stack trace logged.
      LOG.warn("Exception while accepting follower", e);
      handleException(this.getName(), e);
    }
  }

  /** Asks the accept loop to stop; the flag is checked before each accept. */
  public void halt() {
    stop = true;
  }
}


// =====> 其中 ss 的初始化是在创建 Leader 对象时创建的 socket ↓

private final ServerSocket ss;

/**
 * Creates the leader and opens the server socket {@code ss} on which learner
 * connections are accepted.
 *
 * <p>A {@code UnifiedServerSocket} is used when port unification or SSL quorum
 * is enabled, a plain {@code ServerSocket} otherwise. When the peer listens on
 * all IPs the socket is created on the quorum port directly; otherwise it is
 * created unbound and then bound to the quorum address.
 *
 * @param self the quorum peer this leader belongs to
 * @param zk the leader's ZooKeeper server
 * @throws IOException declared by the constructor; BindException handling is elided in this excerpt
 */
Leader(QuorumPeer self,LeaderZooKeeperServer zk) throws IOException {
  this.self = self;
  this.proposalStats = new BufferStats();
  try {
    if (self.shouldUsePortUnification() || self.isSslQuorum()) {
      // Insecure connections are allowed only when port unification is on.
      boolean allowInsecureConnection = self.shouldUsePortUnification();
      if (self.getQuorumListenOnAllIPs()) {
        ss = new UnifiedServerSocket(self.getX509Util(),
                    allowInsecureConnection, self.getQuorumAddress().getPort());
      } else {
        ss = new UnifiedServerSocket(self.getX509Util(), allowInsecureConnection);
      }
    } else {
      if (self.getQuorumListenOnAllIPs()) {
        ss = new ServerSocket(self.getQuorumAddress().getPort());
      } else {
        ss = new ServerSocket();
      }
    }
    ss.setReuseAddress(true);
    // Sockets created without a port above are still unbound: bind them now.
    if (!self.getQuorumListenOnAllIPs()) {
      ss.bind(self.getQuorumAddress());
    }
  } catch (BindException e) {
    ...
  }
  this.zk = zk;
  this.learnerSnapshotThrottler = createLearnerSnapshotThrottler(
      maxConcurrentSnapshots, maxConcurrentSnapshotTimeout);
}

Leader.LearnerCnxAcceptor#run() 截选:

// Excerpt of the accept loop: one LearnerHandler thread per accepted socket.
while (!stop) {
	Socket s = null;
	boolean error = false;

	// Blocks until a learner connects.
	s = ss.accept();

	// start with the initLimit, once the ack is processed
	// in LearnerHandler switch to the syncLimit
	s.setSoTimeout(self.tickTime * self.initLimit);
	s.setTcpNoDelay(nodelay);

	BufferedInputStream is = new BufferedInputStream(s.getInputStream());

    // =====> Create the LearnerHandler
    // Class declaration: public class LearnerHandler extends ZooKeeperThread {...}
    // It is a thread, so fh.start() ends up executing LearnerHandler's run().
	LearnerHandler fh = new LearnerHandler(s, is, Leader.this);
	fh.start();
}

【LearnerHandler】There will be an instance of this class created by the Leader for each learner. All communication with a learner is handled by this class.

/**
 * This thread will receive packets from the peer and process them and
 * also listen to new connections from new peers.
 *
 * <p>Sequence visible in this method: read the learner's FOLLOWERINFO or
 * OBSERVERINFO packet, agree on the new epoch (LEADERINFO / ACKEPOCH), decide
 * how to synchronize via {@code syncFollower} (sending a snapshot when it
 * asks for one), send NEWLEADER, wait for the learner's ACK, send UPTODATE,
 * then loop forever dispatching ACK / PING / REVALIDATE / REQUEST packets.
 */
@Override
public void run() {
  try {
    leader.addLearnerHandler(this);
    // =====> Heartbeat handling: tick by which the next ACK is due
    tickOfNextAckDeadline = leader.self.tick.get()
        + leader.self.initLimit + leader.self.syncLimit;

    ia = BinaryInputArchive.getArchive(bufferedInput);
    bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
    oa = BinaryOutputArchive.getArchive(bufferedOutput);

    // =====> Receive a message from the network and deserialize it into a packet
    QuorumPacket qp = new QuorumPacket();
    ia.readRecord(qp, "packet");

    // =====> After the election, observers and followers must each send the leader
    // an identifying packet: FOLLOWERINFO or OBSERVERINFO
    if(qp.getType() != Leader.FOLLOWERINFO && qp.getType() != Leader.OBSERVERINFO) {
      LOG.error("First packet " + qp.toString()
          + " is not FOLLOWERINFO or OBSERVERINFO!");
      return;
    }

    // Payload layout read below: 8-byte sid, then 4-byte protocol version,
    // then 8-byte config version; each field is read only if enough bytes exist.
    byte learnerInfoData[] = qp.getData();
    if (learnerInfoData != null) {
      ByteBuffer bbsid = ByteBuffer.wrap(learnerInfoData);
      if (learnerInfoData.length >= 8) {
        this.sid = bbsid.getLong();
      }
      if (learnerInfoData.length >= 12) {
        this.version = bbsid.getInt(); // protocolVersion
      }
      if (learnerInfoData.length >= 20) {
        long configVersion = bbsid.getLong();
        if (configVersion > leader.self.getQuorumVerifier().getVersion()) {
          throw new IOException(
              "Follower is ahead of the leader (has a later activated configuration)");
        }
      }
    } else {
      // No payload at all: take a sid from the leader's followerCounter instead.
      this.sid = leader.followerCounter.getAndDecrement();
    }

    if (leader.self.getView().containsKey(this.sid)) {
      LOG.info("Follower sid: " + this.sid + " : info : "
          + leader.self.getView().get(this.sid).toString());
    } else {
      LOG.info("Follower sid: " + this.sid + " not in the current config " 
                  + Long.toHexString(leader.self.getQuorumVerifier().getVersion()));
    }

    if (qp.getType() == Leader.OBSERVERINFO) {
        learnerType = LearnerType.OBSERVER;
    }

    // =====> Read the lastAcceptedEpoch sent by the follower
    // =====> The epoch used during the election is in fact still the previous leader's epoch
    long lastAcceptedEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());

    long peerLastZxid;
    StateSummary ss = null;

    // =====> Read the zxid sent by the follower
    long zxid = qp.getZxid();

    // =====> The leader builds the new epoch from the follower's sid and its old epoch
    long newEpoch = leader.getEpochToPropose(this.getSid(), lastAcceptedEpoch);
    long newLeaderZxid = ZxidUtils.makeZxid(newEpoch, 0);

    if (this.getVersion() < 0x10000) {
      // we are going to have to extrapolate the epoch information
      long epoch = ZxidUtils.getEpochFromZxid(zxid);
      ss = new StateSummary(epoch, zxid);
      // fake the message
      leader.waitForEpochAck(this.getSid(), ss);
    } else {
      byte ver[] = new byte[4];
      ByteBuffer.wrap(ver).putInt(0x10000);
      // =====> The leader sends LEADERINFO to the follower (carrying the zxid built from newEpoch)
      QuorumPacket newEpochPacket = new QuorumPacket(Leader.LEADERINFO, newLeaderZxid, ver, null);
      oa.writeRecord(newEpochPacket, "packet");
      bufferedOutput.flush();
      // =====> Receive the ACKEPOCH the follower answers with
      QuorumPacket ackEpochPacket = new QuorumPacket();
      ia.readRecord(ackEpochPacket, "packet");
      if (ackEpochPacket.getType() != Leader.ACKEPOCH) {
        LOG.error(ackEpochPacket.toString()
            + " is not ACKEPOCH");
        return;
      }
      ByteBuffer bbepoch = ByteBuffer.wrap(ackEpochPacket.getData());
      // =====> Keeps the peer's (follower or observer) state: its epoch and zxid
      ss = new StateSummary(bbepoch.getInt(), ackEpochPacket.getZxid());
      leader.waitForEpochAck(this.getSid(), ss);
    }
    peerLastZxid = ss.getLastZxid();

    // =====> This call decides whether and how the leader and follower need to synchronize
    // Take any necessary action if we need to send TRUNC or DIFF
    // startForwarding() will be called in all cases
    boolean needSnap = syncFollower(peerLastZxid, leader.zk.getZKDatabase(), leader);

    /* if we are not truncating or sending a diff just send a snapshot */
    if (needSnap) {
      // Every learner type except OBSERVER is exempt from the snapshot throttle.
      boolean exemptFromThrottle = getLearnerType() != LearnerType.OBSERVER;
      LearnerSnapshot snapshot =
          leader.getLearnerSnapshotThrottler().beginSnapshot(exemptFromThrottle);
      try {
        long zxidToSend = leader.zk.getZKDatabase().getDataTreeLastProcessedZxid();
        oa.writeRecord(new QuorumPacket(Leader.SNAP, zxidToSend, null, null), "packet");
        bufferedOutput.flush();

        LOG.info("Sending snapshot last zxid of peer is 0x{}, zxid of leader is 0x{}, "
            + "send zxid of db as 0x{}, {} concurrent snapshots, "
            + "snapshot was {} from throttle",
            Long.toHexString(peerLastZxid),
            Long.toHexString(leaderLastZxid),
            Long.toHexString(zxidToSend),
            snapshot.getConcurrentSnapshotNumber(),
            snapshot.isEssential() ? "exempt" : "not exempt");
        // Dump data to peer
        leader.zk.getZKDatabase().serializeSnapshot(oa);
        oa.writeString("BenWasHere", "signature");
        bufferedOutput.flush();
      } finally {
        snapshot.close();
      }
    }

    LOG.debug("Sending NEWLEADER message to " + sid);
    // the version of this quorumVerifier will be set by leader.lead() in case
    // the leader is just being established. waitForEpochAck makes sure that
    // readyToStart is true if we got here, so the version was set
    if (getVersion() < 0x10000) {
      // Older protocol: NEWLEADER is written directly and carries no data.
      QuorumPacket newLeaderQP = new QuorumPacket(Leader.NEWLEADER,
          newLeaderZxid, null, null);
      oa.writeRecord(newLeaderQP, "packet");
    } else {
      // Newer protocol: NEWLEADER carries the last seen quorum verifier and is queued.
      QuorumPacket newLeaderQP = new QuorumPacket(Leader.NEWLEADER,
          newLeaderZxid, leader.self.getLastSeenQuorumVerifier()
              .toString().getBytes(), null);
      queuedPackets.add(newLeaderQP);
    }
    bufferedOutput.flush();

    // Start thread that blast packets in the queue to learner
    startSendingPackets();

    /*
     * Have to wait for the first ACK, wait until
     * the leader is ready, and only then we can
     * start processing messages.
     */
    qp = new QuorumPacket();
    ia.readRecord(qp, "packet");
    if(qp.getType() != Leader.ACK){
      LOG.error("Next packet was supposed to be an ACK,"
        + " but received packet: {}", packetToString(qp));
      return;
    }

    if(LOG.isDebugEnabled()){
      LOG.debug("Received NEWLEADER-ACK message from " + sid);
    }
    leader.waitForNewLeaderAck(getSid(), qp.getZxid());

    syncLimitCheck.start();

    // now that the ack has been processed expect the syncLimit
    sock.setSoTimeout(leader.self.tickTime * leader.self.syncLimit);

    /*
     * Wait until leader starts up
     */
    synchronized(leader.zk){
      while(!leader.zk.isRunning() && !this.isInterrupted()){
        leader.zk.wait(20);
      }
    }
    // Mutation packets will be queued during the serialize,
    // so we need to mark when the peer can actually start
    // using the data
    LOG.debug("Sending UPTODATE message to " + sid);
    queuedPackets.add(new QuorumPacket(Leader.UPTODATE, -1, null, null));

    // Steady state: read one packet per iteration and dispatch on its type.
    while (true) {
      qp = new QuorumPacket();
      ia.readRecord(qp, "packet");

      long traceMask = ZooTrace.SERVER_PACKET_TRACE_MASK;
      if (qp.getType() == Leader.PING) {
        traceMask = ZooTrace.SERVER_PING_TRACE_MASK;
      }
      if (LOG.isTraceEnabled()) {
        ZooTrace.logQuorumPacket(LOG, traceMask, 'i', qp);
      }
      // Any packet from the learner pushes the ACK deadline forward by syncLimit ticks.
      tickOfNextAckDeadline = leader.self.tick.get() + leader.self.syncLimit;


      ByteBuffer bb;
      long sessionId;
      int cxid;
      int type;

      switch (qp.getType()) {
      case Leader.ACK:
        if (this.learnerType == LearnerType.OBSERVER) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Received ACK from Observer  " + this.sid);
          }
        }
        syncLimitCheck.updateAck(qp.getZxid());
        leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
        break;
      case Leader.PING:
        // Process the touches
        // The payload is a sequence of (long session id, int timeout) pairs.
        ByteArrayInputStream bis = new ByteArrayInputStream(qp.getData());
        DataInputStream dis = new DataInputStream(bis);
        while (dis.available() > 0) {
          long sess = dis.readLong();
          int to = dis.readInt();
          leader.zk.touch(sess, to);
        }
        break;
      case Leader.REVALIDATE:
        // Request payload: (long session id, int timeout).
        // Reply payload: (long session id, boolean valid), sent back in the same packet.
        bis = new ByteArrayInputStream(qp.getData());
        dis = new DataInputStream(bis);
        long id = dis.readLong();
        int to = dis.readInt();
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(bos);
        dos.writeLong(id);
        boolean valid = leader.zk.checkIfValidGlobalSession(id, to);
        if (valid) {
          try {
            //set the session owner
            // as the follower that
            // owns the session
            leader.zk.setOwner(id, this);
          } catch (SessionExpiredException e) {
            LOG.error("Somehow session " + Long.toHexString(id) +
                " expired right after being renewed! (impossible)", e);
          }
        }
        if (LOG.isTraceEnabled()) {
          ZooTrace.logTraceMessage(LOG, ZooTrace.SESSION_TRACE_MASK,
                       "Session 0x" + Long.toHexString(id) + " is valid: "+ valid);
        }
        dos.writeBoolean(valid);
        qp.setData(bos.toByteArray());
        queuedPackets.add(qp);
        break;
      case Leader.REQUEST:
        // Payload header: long session id, int cxid, int op type; the remaining
        // bytes (the slice) are handed over as the request body.
        bb = ByteBuffer.wrap(qp.getData());
        sessionId = bb.getLong();
        cxid = bb.getInt();
        type = bb.getInt();
        bb = bb.slice();
        Request si;
        if(type == OpCode.sync){
          si = new LearnerSyncRequest(this, sessionId, cxid, type, bb, qp.getAuthinfo());
        } else {
          si = new Request(null, sessionId, cxid, type, bb, qp.getAuthinfo());
        }
        si.setOwner(this);
        leader.zk.submitLearnerRequest(si);
        break;
      default:
        LOG.warn("unexpected quorum packet, type: {}", packetToString(qp));
        break;
      }
    }
  } catch (IOException e) {
    // ...
  } finally {
    // ...
  }
}

c. Follower extends Learner

/**
 * Main routine of a follower: finds the leader, connects and registers with
 * it, synchronizes, then keeps reading and processing packets while running.
 * Parts of the method are elided in this excerpt.
 *
 * @throws InterruptedException
 */
void followLeader() throws InterruptedException {

  // ...

  try {
    // =====> 1. Find the leader
    QuorumServer leaderServer = findLeader();
    try {
      // =====> 2. Connect to the leader
      connectToLeader(leaderServer.addr, leaderServer.hostname);
      // =====> 3. Register with the leader
      long newEpochZxid = registerWithLeader(Leader.FOLLOWERINFO);
      if (self.isReconfigStateChange())
         throw new Exception("learned about role change");
      // check to see if the leader zxid is lower than ours
      // this should never happen but is just a safety check
      long newEpoch = ZxidUtils.getEpochFromZxid(newEpochZxid);
      if (newEpoch < self.getAcceptedEpoch()) {
        LOG.error("Proposed leader epoch "
                    + ZxidUtils.zxidToString(newEpochZxid)
                    + " is less than our accepted epoch "
                    + ZxidUtils.zxidToString(self.getAcceptedEpoch()));
        throw new IOException("Error: Epoch of leader is lower");
      }
      syncWithLeader(newEpochZxid);
      // One packet object is reused for every iteration of the loop below.
      QuorumPacket qp = new QuorumPacket();

      // =====> 4. Loop, waiting for incoming messages
      while (this.isRunning()) {
        // =====> 4.1 Read the packet
        readPacket(qp);
        // =====> 4.2 Process the packet
        processPacket(qp);
      }

    } catch (Exception e) {
      // ...
    }
  } finally {
    zk.unregisterJMX((Learner)this);
  }
}

/**
 * 1. Looks up the server this peer believes to be the leader.
 *
 * <p>The id comes from the peer's current vote; the matching server in the
 * view has its socket addresses recreated before it is returned.
 *
 * @return the matching server, or {@code null} (after a warning) if the view
 *         contains no server with that id
 */
protected QuorumServer findLeader() {
  // =====> The current vote, recorded during the election, holds the sid of the leader finally recommended
  final Vote current = self.getCurrentVote();
  QuorumServer elected = null;
  // =====> Scan every server in the view for that sid
  for (QuorumServer candidate : self.getView().values()) {
    if (candidate.id != current.getId()) {
      continue;
    }
    // =====> Make sure we hold the leader's correct IP address before trying to connect
    candidate.recreateSocketAddresses();
    elected = candidate;
    break;
  }
  if (elected == null) {
    LOG.warn("Couldn't find the leader with id = " + current.getId());
  }
  return elected;
}

/**
 * 2. Establish a connection with the Leader found by findLeader. Retries
 * until either initLimit time has elapsed or 5 tries have happened.
 * After the retry loop the socket is authenticated and the input/output
 * archives used to talk to the leader are created on top of it.
 * @param addr - the address of the Leader to connect to.
 * @param hostname - passed to the learner authenticator together with the socket.
 * @throws IOException
 * @throws InterruptedException
 * @throws X509Exception
 */
protected void connectToLeader(InetSocketAddress addr, String hostname)
    throws IOException, InterruptedException, X509Exception {
  this.sock = createSocket();

  // Total time budget for connecting, in the same unit as tickTime
  // (compared against elapsed milliseconds below).
  int initLimitTime = self.tickTime * self.initLimit;
  int remainingInitLimitTime = initLimitTime;
  long startNanoTime = nanoTime();

  for (int tries = 0; tries < 5; tries++) {
    try {
      // recalculate the init limit time because retries sleep for 1000 milliseconds
      remainingInitLimitTime = initLimitTime - (int)((nanoTime() - startNanoTime) / 1000000);
      if (remainingInitLimitTime <= 0) {
        LOG.error("initLimit exceeded on retries.");
        throw new IOException("initLimit exceeded on retries.");
      }

      // =====> Connect; the last argument is presumably the connect timeout,
      // capped by the remaining budget — confirm against sockConnect
      sockConnect(sock, addr, Math.min(self.tickTime * self.syncLimit, remainingInitLimitTime));
      if (self.isSslQuorum())  {
        ((SSLSocket) sock).startHandshake();
      }
      sock.setTcpNoDelay(nodelay);
      // Connected: leave the retry loop.
      break;
    } catch (IOException e) {
      // ...
    }
    Thread.sleep(1000);
  }

  self.authLearner.authenticate(sock, hostname);

  leaderIs = BinaryInputArchive.getArchive(new BufferedInputStream(sock.getInputStream()));
  bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
  leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
}

/**
 * 3. Once connected to the leader, perform the handshake protocol to
 * establish a following / observing connection.
 *
 * <p>Sends a packet of type {@code pktType} carrying this learner's accepted
 * epoch and a serialized {@code LearnerInfo}, then reads the leader's reply.
 * A LEADERINFO reply is answered with ACKEPOCH; any other reply must be
 * NEWLEADER.
 *
 * @param pktType type of the first packet to send
 * @return the zxid the Leader sends for synchronization purposes.
 * @throws IOException if the leader's epoch is lower than the accepted epoch,
 *         or the reply is neither LEADERINFO nor NEWLEADER
 */
protected long registerWithLeader(int pktType) throws IOException {
  /*
   * Send follower info, including last zxid and sid
   */
  long lastLoggedZxid = self.getLastLoggedZxid();
  QuorumPacket qp = new QuorumPacket();
  qp.setType(pktType);
  // The zxid field carries only the accepted epoch (counter part is 0).
  qp.setZxid(ZxidUtils.makeZxid(self.getAcceptedEpoch(), 0));

  /*
   * Add sid to payload
   */
  LearnerInfo li = new LearnerInfo(self.getId(), 0x10000, self.getQuorumVerifier().getVersion());
  ByteArrayOutputStream bsid = new ByteArrayOutputStream();
  BinaryOutputArchive boa = BinaryOutputArchive.getArchive(bsid);
  boa.writeRecord(li, "LearnerInfo");
  qp.setData(bsid.toByteArray());

  // =====> Send the FollowerInfo to the leader
  writePacket(qp, true);
  // =====> Read the leader's reply (LeaderInfo) into the same packet object
  readPacket(qp);

  final long newEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());

  // =====> If a LeaderInfo was received
  if (qp.getType() == Leader.LEADERINFO) {
    // we are connected to a 1.0 server so accept the new epoch and read the next packet
    leaderProtocolVersion = ByteBuffer.wrap(qp.getData()).getInt();
    byte epochBytes[] = new byte[4];
    final ByteBuffer wrappedEpochBytes = ByteBuffer.wrap(epochBytes);
    // =====> Accept the leader's epoch
    if (newEpoch > self.getAcceptedEpoch()) {
      // =====> Store our own previous (current) epoch in wrappedEpochBytes
      wrappedEpochBytes.putInt((int)self.getCurrentEpoch());
      // =====> Remember the epoch sent by the leader
      self.setAcceptedEpoch(newEpoch);
    } else if (newEpoch == self.getAcceptedEpoch()) {
      // since we have already acked an epoch equal to the leaders, we cannot ack
      // again, but we still need to send our lastZxid to the leader so that we can
      // sync with it if it does assume leadership of the epoch.
      // the -1 indicates that this reply should not count as an ack for the new epoch
      wrappedEpochBytes.putInt(-1);
    } else {
      throw new IOException("Leaders epoch, " + newEpoch
                  + " is less than accepted epoch, " + self.getAcceptedEpoch());
    }
    // =====> Send ACKEPOCH to the leader (carrying our own epoch and zxid)
    QuorumPacket ackNewEpoch = new QuorumPacket(Leader.ACKEPOCH, lastLoggedZxid, epochBytes, null);
    writePacket(ackNewEpoch, true);
    return ZxidUtils.makeZxid(newEpoch, 0);
  } else {
    // Reply was not LEADERINFO: still adopt a higher epoch, then require NEWLEADER.
    if (newEpoch > self.getAcceptedEpoch()) {
      self.setAcceptedEpoch(newEpoch);
    }
    if (qp.getType() != Leader.NEWLEADER) {
      LOG.error("First packet should have been NEWLEADER");
      throw new IOException("First packet should have been NEWLEADER");
    }
    return qp.getZxid();
  }
}

/**
 * 4. Examine the packet received in qp and dispatch based on its contents.
 * @param qp the packet read from the leader
 * @throws Exception also thrown deliberately when a reconfig reports a major change
 */
protected void processPacket(QuorumPacket qp) throws Exception{
  switch (qp.getType()) {
  case Leader.PING:
    ping(qp);
    break;
  case Leader.PROPOSAL:
    TxnHeader hdr = new TxnHeader();
    Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);
    // Proposals are expected with consecutive zxids; a gap is only logged.
    if (hdr.getZxid() != lastQueued + 1) {
      LOG.warn("Got zxid 0x"
          + Long.toHexString(hdr.getZxid())
          + " expected 0x"
          + Long.toHexString(lastQueued + 1));
    }
    lastQueued = hdr.getZxid();

    // A reconfig proposal carries the new config as the data of a SetDataTxn.
    if (hdr.getType() == OpCode.reconfig){
       SetDataTxn setDataTxn = (SetDataTxn) txn;
       QuorumVerifier qv = self.configFromString(new String(setDataTxn.getData()));
       self.setLastSeenQuorumVerifier(qv, true);
    }

    fzk.logRequest(hdr, txn);
    break;
  case Leader.COMMIT:                    // <====== commit path, handled by fzk.commit()
    fzk.commit(qp.getZxid());
    break;

  case Leader.COMMITANDACTIVATE:
     // get the new configuration from the request
     Request request = fzk.pendingTxns.element();
     SetDataTxn setDataTxn = (SetDataTxn) request.getTxn();
     QuorumVerifier qv = self.configFromString(new String(setDataTxn.getData()));

     // get new designated leader from (current) leader's message
     ByteBuffer buffer = ByteBuffer.wrap(qp.getData());
     long suggestedLeaderId = buffer.getLong();
    boolean majorChange = self.processReconfig(qv, suggestedLeaderId, qp.getZxid(), true);
     // commit (writes the new config to ZK tree (/zookeeper/config)
     fzk.commit(qp.getZxid());
    // The commit above happens first; only then is the major change signalled.
    if (majorChange) {
       throw new Exception("changes proposed in reconfig");
     }
     break;
  case Leader.UPTODATE:
    LOG.error("Received an UPTODATE message after Follower started");
    break;
  case Leader.REVALIDATE:
    revalidate(qp);
    break;
  case Leader.SYNC:
    fzk.sync();
    break;
  default:
    LOG.warn("Unknown packet type: {}", LearnerHandler.packetToString(qp));
    break;
  }
}

/**
 * Invoked, eventually, for every COMMIT message: checks the committed zxid
 * against the head of the pendingTxns queue and forwards that head request
 * to the commitProcessor.
 *
 * <p>An empty queue only produces a warning; a zxid that does not match the
 * head of the queue is logged as an error and the JVM exits with status 12.
 *
 * @param zxid - must correspond to the head of pendingTxns if it exists
 */
public void commit(long zxid) {
  if (pendingTxns.isEmpty()) {
    LOG.warn("Committing " + Long.toHexString(zxid)
        + " without seeing txn");
    return;
  }

  final long headZxid = pendingTxns.element().zxid;
  final boolean matchesHead = headZxid == zxid;
  if (!matchesHead) {
    LOG.error("Committing zxid 0x" + Long.toHexString(zxid)
        + " but next pending txn 0x"
        + Long.toHexString(headZxid));
    System.exit(12);
  }

  // =====> Dequeue the head request and hand it to the commit processor
  commitProcessor.commit(pendingTxns.remove());
}

3. Leader/Follower 启动流程

Leader 启动流程图:

Follower 启动流程图:

4. Client 启动源码

4.1 zkCli.sh

# Resolve the directory this script lives in.
ZOOBIN="${BASH_SOURCE-$0}"
ZOOBIN="$(dirname "${ZOOBIN}")"
ZOOBINDIR="$(cd "${ZOOBIN}"; pwd)"

# Source zkEnv.sh from ../libexec if it exists there, otherwise from this directory.
if [ -e "$ZOOBIN/../libexec/zkEnv.sh" ]; then
  . "$ZOOBINDIR"/../libexec/zkEnv.sh
else
  . "$ZOOBINDIR"/zkEnv.sh
fi

ZOO_LOG_FILE=zookeeper-$USER-cli-$HOSTNAME.log

# One single command: every line but the last must end with a backslash,
# otherwise the -D options would be executed as separate commands.
"$JAVA" \
    "-Dzookeeper.log.dir=${ZOO_LOG_DIR}" \
    "-Dzookeeper.root.logger=${ZOO_LOG4J_PROP}" \
    "-Dzookeeper.log.file=${ZOO_LOG_FILE}" \
    -cp "$CLASSPATH" $CLIENT_JVMFLAGS $JVMFLAGS \
    org.apache.zookeeper.ZooKeeperMain "$@"

在 zkCli.sh 启动 Zookeeper 时,会调用 ZooKeeperMain.java。查找 ZooKeeperMain,找到程序的入口 main() 方法:

/**
 * Program entry point of the command-line client: builds a ZooKeeperMain
 * from the arguments and runs it.
 */
public static void main(String args[]) throws ... {
    new ZooKeeperMain(args).run();
}

4.2 ZooKeeperMain

public ZooKeeperMain(String args[]) throws IOException, InterruptedException {
  cl.parseOptions(args);
  System.out.println("Connecting to " + cl.getOption("server"));
  connectToZK(cl.getOption("server"));
}

/**
 * Closes a still-alive previous client, if any, and opens a new
 * ZooKeeperAdmin client to {@code newHost}, honouring the "readonly",
 * "secure" and "timeout" options.
 *
 * @param newHost connect string of the server(s) to connect to
 */
protected void connectToZK(String newHost) throws InterruptedException, IOException {
  final boolean previousClientAlive = zk != null && zk.getState().isAlive();
  if (previousClientAlive) {
    zk.close();
  }

  host = newHost;

  final boolean readOnly = cl.getOption("readonly") != null;
  final boolean secure = cl.getOption("secure") != null;
  if (secure) {
    System.setProperty(ZKClientConfig.SECURE_CLIENT, "true");
    System.out.println("Secure connection is enabled");
  }

  // =====> Create the ZooKeeperAdmin object
  final int sessionTimeout = Integer.parseInt(cl.getOption("timeout"));
  zk = new ZooKeeperAdmin(host, sessionTimeout, new MyWatcher(), readOnly);
}
/**
 * Creates an admin client; all arguments are passed unchanged to the
 * superclass constructor.
 */
public ZooKeeperAdmin(String connectString, int sessionTimeout, 
    Watcher watcher, boolean canBeReadOnly) throws IOException {
  super(connectString, sessionTimeout, watcher, canBeReadOnly);
}

/**
 * Delegates to the constructor that takes a HostProvider, using the default
 * host provider built from {@code connectString}.
 */
public ZooKeeper(String connectString, int sessionTimeout, Watcher watcher,
    boolean canBeReadOnly) throws IOException {
  this(connectString, sessionTimeout, watcher, canBeReadOnly,
      createDefaultHostProvider(connectString));
}

/**
 * Delegates to the full constructor with a {@code null} client configuration.
 */
public ZooKeeper(String connectString, int sessionTimeout, Watcher watcher,
    boolean canBeReadOnly, HostProvider aHostProvider) throws IOException {
  this(connectString, sessionTimeout, watcher, canBeReadOnly, aHostProvider, null);
}

/**
 * Full constructor: stores the configuration, installs the default watcher,
 * parses the connect string, creates the client connection and starts it.
 *
 * @param clientConfig client configuration; a fresh ZKClientConfig is used
 *        when {@code null}
 */
public ZooKeeper(String connectString, int sessionTimeout, Watcher watcher,
    boolean canBeReadOnly, HostProvider aHostProvider,
    ZKClientConfig clientConfig) throws IOException {
  LOG.info("Initiating client connection, connectString=" + connectString
      + " sessionTimeout=" + sessionTimeout + " watcher=" + watcher);

  this.clientConfig = (clientConfig != null) ? clientConfig : new ZKClientConfig();

  // =====> a. Install the given watcher as the default watcher
  watchManager = defaultWatchManager();
  watchManager.defaultWatcher = watcher;

  // =====> b. Parse the connect string
  final ConnectStringParser parsedConnectString = new ConnectStringParser(connectString);
  hostProvider = aHostProvider;

  // =====> c. Create the endpoint through which the client talks to the server
  final String chroot = parsedConnectString.getChrootPath();
  final ClientCnxnSocket cnxnSocket = getClientCnxnSocket();
  cnxn = createConnection(chroot, hostProvider, sessionTimeout, this,
      watchManager, cnxnSocket, canBeReadOnly);

  // =====> d. Start the connection's threads (their run() methods)
  cnxn.start();
}

a. 创建默认监听器

b. 解析连接地址

ConnectStringParser

/**
 * Splits a connect string into an optional chroot path and a list of
 * unresolved server addresses.
 *
 * <p>Example input: "hadoop102:2181,hadoop103:2181,hadoop104:2181". Anything
 * from the first '/' onwards is the chroot (a lone "/" counts as no chroot).
 * A host without a port, or with a trailing ':', gets DEFAULT_PORT.
 *
 * @param connectString comma-separated host[:port] list, optionally followed by a chroot path
 */
public ConnectStringParser(String connectString) {
  // Separate the chroot suffix, if any, from the host list.
  final int slashIdx = connectString.indexOf('/');
  if (slashIdx < 0) {
    this.chrootPath = null;
  } else {
    final String suffix = connectString.substring(slashIdx);
    if (suffix.length() > 1) {
      PathUtils.validatePath(suffix);
      this.chrootPath = suffix;
    } else {
      // A bare "/" chroot spec is treated the same as no chroot.
      this.chrootPath = null;
    }
    connectString = connectString.substring(0, slashIdx);
  }

  for (String hostSpec : split(connectString,",")) {
    final int colonIdx = hostSpec.lastIndexOf(':');
    String hostName = hostSpec;
    int port = DEFAULT_PORT;
    if (colonIdx >= 0) {
      final boolean portPresent = colonIdx < hostSpec.length() - 1;
      if (portPresent) {
        port = Integer.parseInt(hostSpec.substring(colonIdx + 1));
      }
      // A ':' at the very end carries no port; it is simply dropped.
      hostName = hostSpec.substring(0, colonIdx);
    }
    serverAddresses.add(InetSocketAddress.createUnresolved(hostName, port));
  }
}

c. 创建通信

public static final String ZOOKEEPER_CLIENT_CNXN_SOCKET = "zookeeper.clientCnxnSocket";

public static final String ZOOKEEPER_CLIENT_CNXN_SOCKET = ZooKeeper.ZOOKEEPER_CLIENT_CNXN_SOCKET;

/**
 * Instantiates the client socket implementation named by the
 * ZOOKEEPER_CLIENT_CNXN_SOCKET property, falling back to ClientCnxnSocketNIO
 * when the property is not set. The class is loaded reflectively and built
 * through its constructor taking a ZKClientConfig.
 *
 * @return the newly created client socket
 * @throws IOException wrapping any failure to load or instantiate the class
 */
private ClientCnxnSocket getClientCnxnSocket() throws IOException {
  String implName = getClientConfig()
      .getProperty(ZKClientConfig.ZOOKEEPER_CLIENT_CNXN_SOCKET);
  if (implName == null) {
    implName = ClientCnxnSocketNIO.class.getName();
  }
  try {
    final Class<?> implClass = Class.forName(implName);
    final Constructor<?> configCtor = implClass.getDeclaredConstructor(ZKClientConfig.class);
    return (ClientCnxnSocket) configCtor.newInstance(getClientConfig());
  } catch (Exception e) {
    throw new IOException("Couldn't instantiate " + implName, e);
  }
}
/**
 * Builds the ClientCnxn used by this client.
 *
 * <p>NOTE(review): the {@code zooKeeper} and {@code watcher} parameters are
 * not used in the body; {@code this} and the {@code watchManager} field are
 * passed to the ClientCnxn constructor instead.
 */
protected ClientCnxn createConnection(String chrootPath,
    HostProvider hostProvider, int sessionTimeout, ZooKeeper zooKeeper,
    ClientWatchManager watcher, ClientCnxnSocket clientCnxnSocket,
    boolean canBeReadOnly) throws IOException {
  return new ClientCnxn(chrootPath, hostProvider, sessionTimeout, this,
      watchManager, clientCnxnSocket, canBeReadOnly);
}

ClientCnxn

/**
 * This class manages the socket i/o for the client. ClientCnxn maintains a list
 * of available servers to connect to and "transparently" switches servers it is
 * connected to as needed.
 */

/**
 * Creates a connection for a new session: delegates to the full constructor
 * with session id 0 and a 16-byte all-zero session password.
 */
public ClientCnxn(String chrootPath, HostProvider hostProvider,
		int sessionTimeout, ZooKeeper zooKeeper, ClientWatchManager watcher,
		ClientCnxnSocket clientCnxnSocket, boolean canBeReadOnly) throws IOException {
  this(chrootPath, hostProvider, sessionTimeout, zooKeeper, watcher,
     clientCnxnSocket, 0, new byte[16], canBeReadOnly);
}

/**
 * Creates a connection object, storing the session parameters and constructing (but not
 * starting) the send and event threads.
 *
 * @param chrootPath stored in {@code this.chrootPath}
 * @param hostProvider stored; its {@code size()} is also used to derive the connect timeout
 * @param sessionTimeout stored; also the basis for the connect and read timeouts
 * @param zooKeeper stored; also supplies the client configuration
 * @param watcher stored in {@code this.watcher}
 * @param clientCnxnSocket handed to the {@code SendThread}
 * @param sessionId stored in {@code this.sessionId}
 * @param sessionPasswd stored by reference (no copy is made)
 * @param canBeReadOnly stored in {@code readOnly}
 */
public ClientCnxn(String chrootPath, HostProvider hostProvider, int sessionTimeout, ZooKeeper zooKeeper,
    ClientWatchManager watcher, ClientCnxnSocket clientCnxnSocket,
    long sessionId, byte[] sessionPasswd, boolean canBeReadOnly) {
  this.zooKeeper = zooKeeper;
  this.watcher = watcher;
  this.sessionId = sessionId;
  this.sessionPasswd = sessionPasswd;
  this.sessionTimeout = sessionTimeout;
  this.hostProvider = hostProvider;
  this.chrootPath = chrootPath;

  // The session timeout is split evenly across the known hosts (integer division).
  // NOTE(review): this throws ArithmeticException if hostProvider.size() is 0 — confirm the
  // provider guarantees at least one host.
  connectTimeout = sessionTimeout / hostProvider.size();
  // Read timeout is two thirds of the session timeout (integer arithmetic).
  readTimeout = sessionTimeout * 2 / 3;
  readOnly = canBeReadOnly;

  // =====> Create the two worker threads (per the original note, both extend ZooKeeperThread)
  sendThread = new SendThread(clientCnxnSocket);
  eventThread = new EventThread();
  // clientConfig is assigned before initRequestTimeout() is called.
  // NOTE(review): presumably initRequestTimeout() reads clientConfig — confirm.
  this.clientConfig=zooKeeper.getClientConfig();
  initRequestTimeout();
}

/**
 * Starts the two threads created by the constructor: the send thread first, then the
 * event thread.
 */
public void start() {
    sendThread.start();
    eventThread.start();
}

ClientCnxn.SendThread#run():

/**
 * Main loop of the send thread.
 *
 * <p>While {@code state.isAlive()} holds, each iteration:
 * <ol>
 *   <li>(re)connects if the socket is not connected, unless {@code closing} is set;</li>
 *   <li>when connected, drives SASL initialisation and queues auth events;</li>
 *   <li>computes the remaining timeout {@code to} and throws
 *       {@code SessionTimeoutException} once it reaches zero or below;</li>
 *   <li>sends a ping when one is due;</li>
 *   <li>in read-only mode, periodically looks for a read/write server;</li>
 *   <li>calls {@code clientCnxnSocket.doTransport(...)} with the computed timeout.</li>
 * </ol>
 * After the loop it cleans up, closes the socket and queues the final events.
 */
@Override
public void run() {
  clientCnxnSocket.introduce(this, sessionId, outgoingQueue);
  clientCnxnSocket.updateNow();
  clientCnxnSocket.updateLastSendAndHeard();
  int to;
  long lastPingRwServer = Time.currentElapsedTime();
  final int MAX_SEND_PING_INTERVAL = 10000; //10 seconds
  InetSocketAddress serverAddress = null;

  // =====> Main loop: keep sending and receiving for as long as the state is alive
  while (state.isAlive()) {
    try {
      if (!clientCnxnSocket.isConnected()) {
        // don't re-establish connection if we are closing
        if (closing) {
          break;
        }
        // A pending read/write server address takes priority and is consumed once;
        // otherwise the next address comes from the host provider.
        if (rwServerAddress != null) {
          serverAddress = rwServerAddress;
          rwServerAddress = null;
        } else {
          serverAddress = hostProvider.next(1000);
        }
        // =====> Start connecting to the server
        startConnect(serverAddress);
        clientCnxnSocket.updateLastSendAndHeard();
      }

      if (state.isConnected()) {
        // determine whether we need to send an AuthFailed event.
        if (zooKeeperSaslClient != null) {
          boolean sendAuthEvent = false;
          if (zooKeeperSaslClient.getSaslState() == ZooKeeperSaslClient.SaslState.INITIAL) {
            try {
              zooKeeperSaslClient.initialize(ClientCnxn.this);
            } catch (SaslException e) {
               LOG.error("SASL authentication with Zookeeper Quorum member failed: " + e);
              state = States.AUTH_FAILED;
              sendAuthEvent = true;
            }
          }
          KeeperState authState = zooKeeperSaslClient.getKeeperState();
          if (authState != null) {
            if (authState == KeeperState.AuthFailed) {
              // An authentication error occurred during authentication with the Zookeeper Server.
              state = States.AUTH_FAILED;
              sendAuthEvent = true;
            } else {
              if (authState == KeeperState.SaslAuthenticated) {
                sendAuthEvent = true;
              }
            }
          }

          if (sendAuthEvent) {
            eventThread.queueEvent(new WatchedEvent(Watcher.Event.EventType.None, authState, null));
            if (state == States.AUTH_FAILED) {
              eventThread.queueEventOfDeath();
            }
          }
        }
        // Connected: time left is measured against the read timeout.
        to = readTimeout - clientCnxnSocket.getIdleRecv();
      } else {
        // Not connected: time left is measured against the connect timeout.
        to = connectTimeout - clientCnxnSocket.getIdleRecv();
      }

      if (to <= 0) {
        String warnInfo;
        warnInfo = "Client session timed out, have not heard from server in "
                      + clientCnxnSocket.getIdleRecv() + "ms" + " for sessionid 0x"
                      + Long.toHexString(sessionId);
        LOG.warn(warnInfo);
        throw new SessionTimeoutException(warnInfo);
      }
      if (state.isConnected()) {
        //1000(1 second) is to prevent race condition missing to send the second ping
        //also make sure not to send too many pings when readTimeout is small.
        int timeToNextPing = readTimeout / 2 - clientCnxnSocket.getIdleSend()
                                - ((clientCnxnSocket.getIdleSend() > 1000) ? 1000 : 0);
        //send a ping request either time is due or no packet sent out within MAX_SEND_PING_INTERVAL
        if (timeToNextPing <= 0 || clientCnxnSocket.getIdleSend() > MAX_SEND_PING_INTERVAL) {
          sendPing();
          clientCnxnSocket.updateLastSend();
        } else {
          // Do not wait past the moment the next ping is due.
          if (timeToNextPing < to) {
            to = timeToNextPing;
          }
        }
      }

      // If we are in read-only mode, seek for read/write server
      if (state == States.CONNECTEDREADONLY) {
        long now = Time.currentElapsedTime();
        int idlePingRwServer = (int) (now - lastPingRwServer);
        if (idlePingRwServer >= pingRwTimeout) {
          lastPingRwServer = now;
          idlePingRwServer = 0;
          // Double the probe interval on each attempt, capped at maxPingRwTimeout.
          pingRwTimeout =
            Math.min(2*pingRwTimeout, maxPingRwTimeout);
          pingRwServer();
        }
        to = Math.min(to, pingRwTimeout - idlePingRwServer);
      }

      // =====> Hand off to the socket with the computed timeout
      // (original note: receive the server's responses and process them)
      clientCnxnSocket.doTransport(to, pendingQueue, ClientCnxn.this);
    } catch (Throwable e) {
      // ...
    }
  }
  synchronized (state) {
    // When it comes to this point, it guarantees that later queued
    // packet to outgoingQueue will be notified of death.
    cleanup();
  }
  clientCnxnSocket.close();
  // A Disconnected event is queued only if the state is still alive on exit;
  // a Closed event is always queued afterwards.
  if (state.isAlive()) {
    eventThread.queueEvent(new WatchedEvent(
        Event.EventType.None, Event.KeeperState.Disconnected, null));
  }
  eventThread.queueEvent(new WatchedEvent(
          Event.EventType.None, Event.KeeperState.Closed, null));
  ZooTrace.logTraceMessage(LOG, ZooTrace.getTextTraceLevel(),
      "SendThread exited loop for session: 0x"
           + Long.toHexString(getSessionId()));
}

/**
 * Prepares for and initiates a connection attempt to {@code addr}.
 *
 * <p>Resets the SASL failure flag, sleeps for a random interval of up to one second on
 * every attempt after the first, moves the state to {@code CONNECTING}, tags the thread
 * name and MDC with the target host:port, recreates the SASL client when SASL is
 * enabled, and finally asks the socket to connect.
 *
 * @param addr the server address to connect to
 * @throws IOException propagated from the calls made here (for example the socket connect)
 */
private void startConnect(InetSocketAddress addr) throws IOException {
  // initializing it for new connection
  saslLoginFailed = false;
  if(!isFirstConnect){
    try {
      // Random delay below 1000 ms before a reconnect attempt.
      Thread.sleep(r.nextInt(1000));
    } catch (InterruptedException e) {
      // NOTE(review): the interrupt is only logged; the thread's interrupt flag is not
      // restored — confirm this is intentional.
      LOG.warn("Unexpected exception", e);
    }
  }
  state = States.CONNECTING;

  String hostPort = addr.getHostString() + ":" + addr.getPort();
  MDC.put("myid", hostPort);
  // Replace the parenthesised part of the thread name with the current target host:port.
  setName(getName().replaceAll("\\(.*\\)", "(" + hostPort + ")"));
  if (clientConfig.isSaslClientEnabled()) {
    try {
      // Shut down any SASL client left over from a previous connection before replacing it.
      if (zooKeeperSaslClient != null) {
        zooKeeperSaslClient.shutdown();
      }
      zooKeeperSaslClient = new ZooKeeperSaslClient(SaslServerPrincipal.getServerPrincipal(addr, clientConfig),
        clientConfig);
    } catch (LoginException e) {
      // ...
    }
  }
  logStartConnect(addr);

  // =====> Establish the connection
  clientCnxnSocket.connect(addr);
}

ClientCnxnSocketNIO

/**
 * Opens a new socket channel and connects it to {@code addr}.
 *
 * <p>This excerpt is abridged: the elided sections are marked with {@code // ...}.
 *
 * @param addr the server address to connect to
 * @throws IOException propagated from the calls made here
 */
@Override
void connect(InetSocketAddress addr) throws IOException {
  // Create the channel first; it is then passed to registerAndConnect together with addr.
  SocketChannel sock = createSock();
  // ...

  registerAndConnect(sock, addr);

  // ...
}

d. 执行 run()

/**
 * Entry point of the command-line client.
 *
 * <p>If a command was supplied on the command line it is executed once. Otherwise an
 * interactive loop is started: JLine is loaded reflectively when it is on the classpath,
 * and if any reflective step fails the loop falls back to reading plain lines from
 * {@code System.in}. In every case the method ends by calling
 * {@code System.exit(exitCode)}.
 *
 * @throws CliException propagated from command execution
 * @throws IOException propagated from reading input or from command execution
 * @throws InterruptedException propagated from command execution
 */
void run() throws CliException, IOException, InterruptedException {
  if (cl.getCommand() != null) {
    // Command line args non-null.  Run what was passed.
    processCmd(cl);
  } else {
    System.out.println("Welcome to ZooKeeper!");

    boolean fallBackToStdin = false;
    // only use jline if it's in the classpath
    try {
      Class<?> consoleClass = Class.forName("jline.console.ConsoleReader");
      Class<?> completerClass = Class.forName("org.apache.zookeeper.JLineZNodeCompleter");

      System.out.println("JLine support is enabled");

      Object console = consoleClass.getConstructor().newInstance();
      Object completer = completerClass.getConstructor(ZooKeeper.class).newInstance(zk);

      Class<?> completerType = Class.forName("jline.console.completer.Completer");
      Method addCompleter = consoleClass.getMethod("addCompleter", completerType);
      addCompleter.invoke(console, completer);

      Method readLine = consoleClass.getMethod("readLine", String.class);

      // =====> Read and execute commands one line at a time
      String input;
      while ((input = (String) readLine.invoke(console, getPrompt())) != null) {
        executeLine(input);
      }
    } catch (ClassNotFoundException
        | NoSuchMethodException
        | InvocationTargetException
        | IllegalAccessException
        | InstantiationException e) {
      // Any reflective failure means JLine cannot be used; switch to the plain reader.
      LOG.debug("Unable to start jline", e);
      fallBackToStdin = true;
    }

    if (fallBackToStdin) {
      System.out.println("JLine support is disabled");
      BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
      for (String input = stdin.readLine(); input != null; input = stdin.readLine()) {
        executeLine(input);
      }
    }
  }
  System.exit(exitCode);
}


/**
 * Parses and executes one line of interactive input.
 *
 * <p>Empty lines are ignored. For any other line the command is parsed, recorded in the
 * history under the current command count, processed, and the count is then incremented.
 *
 * @param line the raw input line; must not be {@code null}
 * @throws CliException propagated from command processing
 * @throws InterruptedException propagated from command processing
 * @throws IOException propagated from command processing
 */
public void executeLine(String line) throws CliException, InterruptedException, IOException {
  if (line.equals("")) {
    return;
  }
  cl.parseCommand(line);
  addToHistory(commandCount, line);
  // =====> Handle the client command
  processCmd(cl);
  commandCount++;
}


/**
 * Runs a parsed command and records the resulting exit code.
 *
 * <p>On success {@code exitCode} is set to {@code 0}. When a {@code CliException} is
 * raised, its exit code is stored in {@code exitCode}, its message is printed to
 * {@code System.err}, and {@code false} is returned.
 *
 * @param co the parsed command options
 * @return the value returned by {@code processZKCmd}, or {@code false} if it threw a
 *     {@code CliException}
 * @throws CliException declared for compatibility; those raised by {@code processZKCmd}
 *     are handled here
 * @throws IOException propagated from {@code processZKCmd}
 * @throws InterruptedException propagated from {@code processZKCmd}
 */
protected boolean processCmd(MyCommandOptions co) throws CliException, IOException, InterruptedException {
  try {
    // =====> Parse and dispatch the command
    boolean watchRequested = processZKCmd(co);
    exitCode = 0;
    return watchRequested;
  } catch (CliException cliFailure) {
    exitCode = cliFailure.getExitCode();
    System.err.println(cliFailure.getMessage());
    return false;
  }
}


/**
 * Validates and executes a single parsed command.
 *
 * <p>The built-in commands {@code quit}, {@code redo}, {@code history},
 * {@code printwatches} and {@code connect} are handled inline. None of those branches
 * returns, so control always continues to the connection check and then to the
 * {@code commandMapCli} lookup below.
 *
 * @param co the parsed command options
 * @return the result of the matching {@code CliCommand}'s {@code exec()}; {@code false}
 *     when there is no live connection or no matching {@code CliCommand}
 * @throws CliException if no command was entered, the command is unknown, or a
 *     {@code redo} request is invalid
 * @throws IOException propagated from the calls made here
 * @throws InterruptedException propagated from the calls made here
 */
protected boolean processZKCmd(MyCommandOptions co) throws CliException, IOException, InterruptedException {
  String[] args = co.getArgArray();
  String cmd = co.getCommand();
  if (args.length < 1) {
    usage();
    throw new MalformedCommandException("No command entered");
  }

  if (!commandMap.containsKey(cmd)) {
    usage();
    throw new CommandNotFoundException("Command not found " + cmd);
  }

  boolean watch = false;
  LOG.debug("Processing " + cmd);


  if (cmd.equals("quit")) {
    zk.close();
    System.exit(exitCode);
  } else if (cmd.equals("redo") && args.length >= 2) {
    // args[1] is the history index to replay; Integer.decode also accepts hex/octal forms.
    Integer i = Integer.decode(args[1]);
    if (commandCount <= i || i < 0) { // don't allow redoing this redo
      throw new MalformedCommandException("Command index out of range");
    }
    cl.parseCommand(history.get(i));
    if (cl.getCommand().equals("redo")) {
      throw new MalformedCommandException("No redoing redos");
    }
    // Record the replayed command under the current index, then run it.
    history.put(commandCount, history.get(i));
    processCmd(cl);
  } else if (cmd.equals("history")) {
    // Print the entries for indices commandCount-10 .. commandCount, skipping negatives.
    for (int i = commandCount - 10; i <= commandCount; ++i) {
      if (i < 0) continue;
      System.out.println(i + " - " + history.get(i));
    }
  } else if (cmd.equals("printwatches")) {
    // With no argument, report the current setting; otherwise only "on" enables it.
    if (args.length == 1) {
      System.out.println("printwatches is " + (printWatches ? "on" : "off"));
    } else {
      printWatches = args[1].equals("on");
    }
  } else if (cmd.equals("connect")) {
    // Connect to the given address, or to the current host field when none is given.
    if (args.length >= 2) {
      connectToZK(args[1]);
    } else {
      connectToZK(host);
    }
  }

  // Below commands all need a live connection
  if (zk == null || !zk.getState().isAlive()) {
    System.out.println("Not connected");
    return false;
  }

  // execute from commandMap
  CliCommand cliCmd = commandMapCli.get(cmd);
  if(cliCmd != null) {
    cliCmd.setZk(zk);
    watch = cliCmd.parse(args).exec();
  } else if (!commandMap.containsKey(cmd)) {
     // NOTE(review): unknown commands already throw at the containsKey check near the top,
     // so this branch looks unreachable unless commandMap changes in between — confirm.
     usage();
  }
  return watch;
}
posted @ 2023-02-12 16:19  tree6x7  阅读(36)  评论(0编辑  收藏  举报