Zookeeper选举Leader源码剖析
开始分析
【1】分析入口类做了什么
//org.apache.zookeeper.server.quorum包下QuorumPeerMain类 public static void main(String[] args) { QuorumPeerMain main = new QuorumPeerMain(); try { main.initializeAndRun(args); } catch (IllegalArgumentException e) {..} catch (ConfigException e) {..} catch (DatadirException e) {..} catch (AdminServerException e) {..} catch (Exception e) {..} ServiceUtils.requestSystemExit(ExitCode.EXECUTION_FINISHED.getValue()); } protected void initializeAndRun(String[] args) throws ConfigException, IOException, AdminServerException { QuorumPeerConfig config = new QuorumPeerConfig(); if (args.length == 1) { //解析配置文件加载到内存 //主要是调用了QuorumPeerConfig类#parse方法,解析逻辑在parseProperties方法 config.parse(args[0]); } //启动延时的定期清理快照数据文件 DatadirCleanupManager purgeMgr = new DatadirCleanupManager( config.getDataDir(), config.getDataLogDir(), config.getSnapRetainCount(), config.getPurgeInterval()); purgeMgr.start(); if (args.length == 1 && config.isDistributed()) { //集群的入口 runFromConfig(config); } else { //单机的入口 ZooKeeperServerMain.main(args); } }
【2】runFromConfig方法做了什么
/**
 * Builds the local QuorumPeer from the parsed configuration, starts it and
 * blocks until the peer thread terminates.
 *
 * The method is essentially one long, ordered sequence of "copy a value from
 * config onto quorumPeer" calls followed by initialize() / start() / join().
 *
 * @param config the parsed quorum configuration
 * @throws IOException if the metrics provider cannot be started (the
 *         MetricsProviderLifeCycleException is wrapped), or propagated from
 *         the calls below
 * @throws AdminServerException propagated from the calls below
 */
public void runFromConfig(QuorumPeerConfig config) throws IOException, AdminServerException {
    try {
        ManagedUtil.registerLog4jMBeans();
    } catch (JMException e) {
        // JMX registration is best-effort: log and carry on.
        LOG.warn("Unable to register log4j JMX control", e);
    }
    LOG.info("Starting quorum peer, myid=" + config.getServerId());
    final MetricsProvider metricsProvider;
    try {
        metricsProvider = MetricsProviderBootstrap.startMetricsProvider(
            config.getMetricsProviderClassName(),
            config.getMetricsProviderConfiguration());
    } catch (MetricsProviderLifeCycleException error) {
        throw new IOException("Cannot boot MetricsProvider " + config.getMetricsProviderClassName(), error);
    }
    try {
        ServerMetrics.metricsProviderInitialized(metricsProvider);
        ProviderRegistry.initialize();
        ServerCnxnFactory cnxnFactory = null;
        ServerCnxnFactory secureCnxnFactory = null;
        if (config.getClientPortAddress() != null) {
            // Create the server-side connection factory (client-facing transport).
            cnxnFactory = ServerCnxnFactory.createFactory();
            // Configure the listening address / limits, all taken from the config file.
            cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
        }
        if (config.getSecureClientPortAddress() != null) {
            // Same as above for the secure client port (last argument is true here).
            secureCnxnFactory = ServerCnxnFactory.createFactory();
            secureCnxnFactory.configure(config.getSecureClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), true);
        }
        // Build the local peer and copy the configuration values onto it.
        quorumPeer = getQuorumPeer();
        quorumPeer.setTxnFactory(new FileTxnSnapLog(config.getDataLogDir(), config.getDataDir()));
        quorumPeer.enableLocalSessions(config.areLocalSessionsEnabled());
        quorumPeer.enableLocalSessionsUpgrading(config.isLocalSessionsUpgradingEnabled());
        //quorumPeer.setQuorumPeers(config.getAllMembers());
        quorumPeer.setElectionType(config.getElectionAlg());
        quorumPeer.setMyid(config.getServerId());
        quorumPeer.setTickTime(config.getTickTime());
        quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
        quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
        quorumPeer.setInitLimit(config.getInitLimit());
        quorumPeer.setSyncLimit(config.getSyncLimit());
        quorumPeer.setConnectToLearnerMasterLimit(config.getConnectToLearnerMasterLimit());
        quorumPeer.setObserverMasterPort(config.getObserverMasterPort());
        quorumPeer.setConfigFileName(config.getConfigFilename());
        quorumPeer.setClientPortListenBacklog(config.getClientPortListenBacklog());
        // The in-memory database is built on top of the txn factory set a few lines above.
        quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
        quorumPeer.setQuorumVerifier(config.getQuorumVerifier(), false);
        if (config.getLastSeenQuorumVerifier() != null) {
            quorumPeer.setLastSeenQuorumVerifier(config.getLastSeenQuorumVerifier(), false);
        }
        quorumPeer.initConfigInZKDatabase();
        // Hand the connection factories created above to the peer as well.
        quorumPeer.setCnxnFactory(cnxnFactory);
        quorumPeer.setSecureCnxnFactory(secureCnxnFactory);
        quorumPeer.setSslQuorum(config.isSslQuorum());
        quorumPeer.setUsePortUnification(config.shouldUsePortUnification());
        quorumPeer.setLearnerType(config.getPeerType());
        quorumPeer.setSyncEnabled(config.getSyncEnabled());
        quorumPeer.setQuorumListenOnAllIPs(config.getQuorumListenOnAllIPs());
        if (config.sslQuorumReloadCertFiles) {
            quorumPeer.getX509Util().enableCertFileReloading();
        }
        quorumPeer.setMultiAddressEnabled(config.isMultiAddressEnabled());
        quorumPeer.setMultiAddressReachabilityCheckEnabled(config.isMultiAddressReachabilityCheckEnabled());
        quorumPeer.setMultiAddressReachabilityCheckTimeoutMs(config.getMultiAddressReachabilityCheckTimeoutMs());

        // sets quorum sasl authentication configurations
        // (the enabled flag must be set first: the check below reads it back from the peer)
        quorumPeer.setQuorumSaslEnabled(config.quorumEnableSasl);
        if (quorumPeer.isQuorumSaslAuthEnabled()) {
            quorumPeer.setQuorumServerSaslRequired(config.quorumServerRequireSasl);
            quorumPeer.setQuorumLearnerSaslRequired(config.quorumLearnerRequireSasl);
            quorumPeer.setQuorumServicePrincipal(config.quorumServicePrincipal);
            quorumPeer.setQuorumServerLoginContext(config.quorumServerLoginContext);
            quorumPeer.setQuorumLearnerLoginContext(config.quorumLearnerLoginContext);
        }
        quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
        quorumPeer.initialize();

        if (config.jvmPauseMonitorToRun) {
            quorumPeer.setJvmPauseMonitor(new JvmPauseMonitor(config));
        }

        // Start the peer, then block this thread until the peer thread ends.
        quorumPeer.start();
        ZKAuditProvider.addZKStartStopAuditLog();
        quorumPeer.join();
    } catch (InterruptedException e) {
        // warn, but generally this is ok
        LOG.warn("Quorum Peer interrupted", e);
    } finally {
        // Always stop the metrics provider; a failure here is only logged.
        try {
            metricsProvider.stop();
        } catch (Throwable error) {
            LOG.warn("Error while stopping metrics", error);
        }
    }
}
【3】通信对象的选择
//ServerCnxnFactory类#createFactory方法 //初始化通信对象 public static ServerCnxnFactory createFactory() throws IOException { //属性值展示:String ZOOKEEPER_SERVER_CNXN_FACTORY = "zookeeper.serverCnxnFactory" //官方推荐netty:则应该是ServerCnxnFactory类的子类NettyServerCnxnFactory String serverCnxnFactoryName = System.getProperty(ZOOKEEPER_SERVER_CNXN_FACTORY); if (serverCnxnFactoryName == null) { //但是默认是子类NIOServerCnxnFactory serverCnxnFactoryName = NIOServerCnxnFactory.class.getName(); } try { //利用反射进行初始化 ServerCnxnFactory serverCnxnFactory = (ServerCnxnFactory) Class.forName(serverCnxnFactoryName).getDeclaredConstructor().newInstance(); LOG.info("Using {} as server connection factory", serverCnxnFactoryName); return serverCnxnFactory; } catch (Exception e) { IOException ioe = new IOException("Couldn't instantiate " + serverCnxnFactoryName, e); throw ioe; } }
【4】内存数据库的设计
// Excerpt from DataTree (package org.apache.zookeeper.server): field declarations only.
// The node data itself lives in the field "final NodeHashMap nodes".
// NOTE(review): constructors and methods are omitted in this excerpt, which is why the
// blank final fields (nodes, digestCalculator) have no visible initializer.
public class DataTree {
    private static final Logger LOG = LoggerFactory.getLogger(DataTree.class);
    // Rate-limited logger; the second argument is 15 * 60 * 1000 (presumably milliseconds, i.e. 15 min - confirm).
    private final RateLogger RATE_LOGGER = new RateLogger(LOG, 15 * 60 * 1000);
    // This map provides a fast lookup to the data nodes.
    private final NodeHashMap nodes;
    private IWatchManager dataWatches;
    private IWatchManager childWatches;
    // Cached total size of the paths and data of all data nodes.
    private final AtomicLong nodeDataSize = new AtomicLong(0);
    // Root node path.
    private static final String rootZookeeper = "/";
    private static final String procZookeeper = Quotas.procZookeeper;
    // procZookeeper without its first character.
    private static final String procChildZookeeper = procZookeeper.substring(1);
    private static final String quotaZookeeper = Quotas.quotaZookeeper;
    // Remainder of quotaZookeeper after the procZookeeper prefix plus one separator character.
    private static final String quotaChildZookeeper = quotaZookeeper.substring(procZookeeper.length() + 1);
    private static final String configZookeeper = ZooDefs.CONFIG_NODE;
    // Remainder of configZookeeper after the procZookeeper prefix plus one separator character.
    private static final String configChildZookeeper = configZookeeper.substring(procZookeeper.length() + 1);
    private final PathTrie pTrie = new PathTrie();
    public static final int STAT_OVERHEAD_BYTES = (6 * 8) + (5 * 4);
    // Keyed by Long with a set of path strings as value
    // (presumably session id -> ephemeral node paths; verify against callers).
    private final Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<Long, HashSet<String>>();
    // Concurrent sets of strings (presumably paths of container nodes and TTL nodes; verify).
    private final Set<String> containers = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
    private final Set<String> ttls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
    private final ReferenceCountedACLCache aclCache = new ReferenceCountedACLCache();
    public static final int DIGEST_LOG_LIMIT = 1024;
    public static final int DIGEST_LOG_INTERVAL = 128;
    private ZxidDigest digestFromLoadedSnapshot;
    private volatile ZxidDigest lastProcessedZxidDigest;
    private boolean firstMismatchTxn = true;
    private final List<DigestWatcher> digestWatchers = new ArrayList<>();
    private LinkedList<ZxidDigest> digestLog = new LinkedList<>();
    private final DigestCalculator digestCalculator;
}

// Excerpt from DataNode: one node of the tree (field declarations only).
public class DataNode implements Record {
    private volatile long digest;
    // Indicates whether the digest of this node is up to date.
    volatile boolean digestCached;
    byte[] data;
    Long acl;
    public StatPersisted stat;
    // Starts as null; elements are strings (presumably child names; verify against the accessors).
    private Set<String> children = null;
    private static final Set<String> EMPTY_SET = Collections.emptySet();
}
【5】quorumPeer.start()节点启动方法又做了什么
@Override public synchronized void start() { if (!getView().containsKey(myid)) { throw new RuntimeException("My id " + myid + " not in the peer list"); } //加载快照文件数据到内存 loadDataBase(); //启动通信对象 startServerCnxnFactory(); try { //JettyAdminServer,启动内嵌Jetty服务,默认8080端口 adminServer.start(); } catch (AdminServerException e) { LOG.warn("Problem starting AdminServer", e); } //初始化选举数据 startLeaderElection(); startJvmPauseMonitor(); super.start(); } private void startServerCnxnFactory() { if (cnxnFactory != null) { //如果有配置netty通信,则NettyServerCnxnFactory类#start方法 cnxnFactory.start(); } if (secureCnxnFactory != null) { secureCnxnFactory.start(); } } //NettyServerCnxnFactory类#start方法 @Override public void start() { if (listenBacklog != -1) { bootstrap.option(ChannelOption.SO_BACKLOG, listenBacklog); } LOG.info("binding to port {}", localAddress); parentChannel = bootstrap.bind(localAddress).syncUninterruptibly().channel(); // Port changes after bind() if the original port was 0, update // localAddress to get the real port. localAddress = (InetSocketAddress) parentChannel.localAddress(); LOG.info("bound to port {}", getLocalPort()); } //选举数据构建 public synchronized void startLeaderElection() { try { if (getPeerState() == ServerState.LOOKING) { //构建选票,myid服务器id标记,最大的事务id,当前服务器的选举轮次 currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch()); } } catch (IOException e) { RuntimeException re = new RuntimeException(e.getMessage()); re.setStackTrace(e.getStackTrace()); throw re; } //确定选举算法,默认传的是3 this.electionAlg = createElectionAlgorithm(electionType); } //节点状态 public enum ServerState { LOOKING, //等待状态 FOLLOWING, //从节点 LEADING, //主节点 OBSERVING //观察状态 }
【6】选举算法分析(内含多层队列架构)
//选举算法分析(3.8版本已经将过时的算法去除了) protected Election createElectionAlgorithm(int electionAlgorithm) { Election le = null; //TODO: use a factory rather than a switch switch (electionAlgorithm) { case 1: throw new UnsupportedOperationException("Election Algorithm 1 is not supported."); case 2: throw new UnsupportedOperationException("Election Algorithm 2 is not supported."); case 3: QuorumCnxManager qcm = createCnxnManager(); QuorumCnxManager oldQcm = qcmRef.getAndSet(qcm); if (oldQcm != null) { LOG.warn("Clobbering already-set QuorumCnxManager (restarting leader election?)"); oldQcm.halt(); } QuorumCnxManager.Listener listener = qcm.listener; if (listener != null) { //启动监听线程 listener.start(); //构建收发消息线程 FastLeaderElection fle = new FastLeaderElection(this, qcm); fle.start(); le = fle; } else { LOG.error("Null listener when initializing cnx manager"); } break; default: assert false; } return le; }
【7】翻阅监听线程 listener 做了什么
// What the election listener thread does - the interesting part is run().

/**
 * Body of the listener thread: opens one ListenerHandler per election
 * address, runs them on a fixed thread pool and blocks until every handler
 * has finished, then closes them.
 */
@Override
public void run() {
    if (!shutdown) {
        LOG.debug("Listener thread started, myId: {}", self.getId());
        Set<InetSocketAddress> addresses;

        // Listen either on the wildcard addresses or on every configured election address.
        if (self.getQuorumListenOnAllIPs()) {
            addresses = self.getElectionAddress().getWildcardAddresses();
        } else {
            addresses = self.getElectionAddress().getAllAddresses();
        }
        // One count per address; each handler counts down when its run() ends.
        CountDownLatch latch = new CountDownLatch(addresses.size());
        // Build one ListenerHandler for every address.
        listenerHandlers = addresses.stream().map(address ->
                        new ListenerHandler(address, self.shouldUsePortUnification(), self.isSslQuorum(), latch))
                .collect(Collectors.toList());

        // Every ListenerHandler gets its own thread (pool sized to the number of addresses).
        final ExecutorService executor = Executors.newFixedThreadPool(addresses.size());
        try {
            listenerHandlers.forEach(executor::submit);
        } finally {
            // prevent executor's threads to leak after ListenerHandler tasks complete
            executor.shutdown();
        }

        try {
            // Wait until all handlers have finished.
            latch.await();
        } catch (InterruptedException ie) {..}
        finally {
            // Clean up for shutdown.
            for (ListenerHandler handler : listenerHandlers) {
                try {
                    handler.close();
                } catch (IOException ie) {...}
            }
        }
    }

    LOG.info("Leaving listener");
    if (!shutdown) {
        if (socketException.get()) {
            // After leaving listener thread, the host cannot join the quorum anymore,
            // this is a severe error that we cannot recover from, so we need to exit
            socketBindErrorHandler.run();
        }
    }
}

/**
 * Accept loop for a single election address. Owns the ServerSocket bound to
 * that address; instances are run on the listener's thread pool.
 */
class ListenerHandler implements Runnable, Closeable {
    private ServerSocket serverSocket;
    private InetSocketAddress address;
    private boolean portUnification;
    private boolean sslQuorum;
    private CountDownLatch latch;

    ListenerHandler(InetSocketAddress address, boolean portUnification, boolean sslQuorum,CountDownLatch latch) {
        this.address = address;
        this.portUnification = portUnification;
        this.sslQuorum = sslQuorum;
        this.latch = latch;
    }

    /**
     * Sleeps on acceptConnections().
     * Names the thread after the address, accepts connections until the loop
     * ends, closes the server socket and always counts the latch down.
     */
    @Override
    public void run() {
        try {
            Thread.currentThread().setName("ListenerHandler-" + address);
            // Bind and accept connections.
            acceptConnections();
            try {
                close();
            } catch (IOException e) {...}
        } catch (Exception e) {...}
        finally {
            latch.countDown();
        }
    }

    /** Closes the server socket if it exists and is still open. */
    @Override
    public synchronized void close() throws IOException {
        if (serverSocket != null && !serverSocket.isClosed()) {
            LOG.debug("Trying to close listeners: {}", serverSocket);
            serverSocket.close();
        }
    }

    /**
     * Sleeps on accept().
     * The outer loop (re)creates the server socket and retries after an
     * IOException, bounded by portBindMaxRetry (0 means no bound); the inner
     * loop accepts connections until shutdown.
     */
    private void acceptConnections() {
        int numRetries = 0;
        Socket client = null;

        while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
            try {
                // Create and bind the server socket.
                serverSocket = createNewServerSocket();
                LOG.info("{} is accepting connections now, my election bind port: {}", QuorumCnxManager.this.mySid, address.toString());
                while (!shutdown) {
                    try {
                        client = serverSocket.accept();
                        setSockOpts(client);
                        // Handle the incoming connection (asynchronously when quorum SASL auth is enabled).
                        if (quorumSaslAuthEnabled) {
                            receiveConnectionAsync(client);
                        } else {
                            receiveConnection(client);
                        }
                        // A successful accept resets the retry counter.
                        numRetries = 0;
                    } catch (SocketTimeoutException e) {...}
                }
            } catch (IOException e) {
                if (shutdown) {
                    break;
                }
                // Remember socket-level failures; the listener's run() checks this flag on exit.
                if (e instanceof SocketException) {
                    socketException.set(true);
                }
                numRetries++;
                try {
                    // Close the server socket and wait one second before the next attempt.
                    close();
                    Thread.sleep(1000);
                } catch (IOException ie) {...}
                catch (InterruptedException ie) {...}
                closeSocket(client);
            }
        }
        if (!shutdown) {...}
    }

    /**
     * Creates the server socket (TLS-capable, TLS-only or plain, depending on
     * portUnification / sslQuorum) and binds it to the election address.
     */
    private ServerSocket createNewServerSocket() throws IOException {
        ServerSocket socket;

        if (portUnification) {
            LOG.info("Creating TLS-enabled quorum server socket");
            socket = new UnifiedServerSocket(self.getX509Util(), true);
        } else if (sslQuorum) {
            LOG.info("Creating TLS-only quorum server socket");
            socket = new UnifiedServerSocket(self.getX509Util(), false);
        } else {
            socket = new ServerSocket();
        }

        socket.setReuseAddress(true);
        // Rebuild the address from host string and port before binding.
        address = new InetSocketAddress(address.getHostString(), address.getPort());
        // Bind address and port.
        socket.bind(address);

        return socket;
    }
}
【7.1】receiveConnection方法怎么处理接收到的消息
/**
 * Handles a freshly accepted election connection: wraps the input stream and
 * delegates to handleConnection; the socket is closed on IOException.
 */
public void receiveConnection(final Socket sock) {
    DataInputStream din = null;
    try {
        din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
        handleConnection(sock, din);
    } catch (IOException e) {
        closeSocket(sock);
    }
}

/**
 * Reads the peer's identity from an accepted connection, authenticates it
 * and then decides, by comparing server ids, whether to keep this
 * connection or to replace it with one opened from this side.
 */
private void handleConnection(Socket sock, DataInputStream din) throws IOException {
    Long sid = null, protocolVersion = null;
    MultipleAddresses electionAddr = null;

    try {
        // Read one long from the stream: a non-negative value is the sender's server id,
        // a negative value is a protocol version followed by an initial message.
        protocolVersion = din.readLong();
        if (protocolVersion >= 0) { // this is a server id and not a protocol version
            sid = protocolVersion;
        } else {
            try {
                InitialMessage init = InitialMessage.parse(protocolVersion, din);
                sid = init.sid;
                if (!init.electionAddr.isEmpty()) {
                    electionAddr = new MultipleAddresses(init.electionAddr,
                            Duration.ofMillis(self.getMultiAddressReachabilityCheckTimeoutMs()));
                }
            } catch (InitialMessage.InitialMessageException ex) {
                closeSocket(sock);
                return;
            }
        }

        // Observers all announce OBSERVER_ID; give each one its own id from a decreasing counter.
        if (sid == QuorumPeer.OBSERVER_ID) {
            sid = observerCounter.getAndDecrement();
        }
    } catch (IOException e) {
        closeSocket(sock);
        return;
    }

    // do authenticating learner
    authServer.authenticate(sock, din);

    // Drop the redundant connection.
    // A socket is full duplex, and every server tries to connect to every other server, so a
    // pair of servers can end up with two channels (mine to it, its to me); only one is needed.
    // Rule visible here and in startConnection: the surviving connection is the one opened by
    // the server with the larger id.
    if (sid < self.getId()) {
        // The remote id is smaller than ours: stop any existing sender for it ...
        SendWorker sw = senderWorkerMap.get(sid);
        if (sw != null) {
            sw.finish();
        }

        // ... close the connection the remote side opened ...
        closeSocket(sock);

        // ... and open a connection from this server to the remote one instead.
        if (electionAddr != null) {
            connectOne(sid, electionAddr);
        } else {
            connectOne(sid);
        }
    }
    // Connection from ourselves (body elided in this excerpt; the article notes nothing needs to be done).
    else if (sid == self.getId()) {...}
    else {
        // The remote id is larger than ours: keep the connection it opened to us.
        SendWorker sw = new SendWorker(sock, sid);
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        // Replace a previous sender for the same server, if any.
        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null) {
            vsw.finish();
        }

        // Register the new sender and make sure a send queue exists for this server.
        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));

        sw.start();
        rw.start();
    }
}

/**
 * Opens a connection from this server to server sid unless one already exists.
 *
 * @return true if a sender already exists, otherwise the result of initiateConnectionAsync
 */
synchronized boolean connectOne(long sid, MultipleAddresses electionAddr) {
    // Is there already a connection (a registered sender) for this server?
    if (senderWorkerMap.get(sid) != null) {
        // With several election addresses, optionally re-validate that the socket is still reachable.
        if (self.isMultiAddressEnabled() && electionAddr.size() > 1 && self.isMultiAddressReachabilityCheckEnabled()) {
            senderWorkerMap.get(sid).asyncValidateIfSocketIsStillReachable();
        }
        return true;
    }
    // Otherwise start connecting.
    return initiateConnectionAsync(electionAddr, sid);
}

/**
 * Schedules a connection attempt on the connection executor.
 *
 * @return true if an attempt for sid is already in progress or was scheduled,
 *         false if scheduling failed
 */
public boolean initiateConnectionAsync(final MultipleAddresses electionAddr, final Long sid) {
    // add() returns false when sid is already in the set: an attempt is in flight.
    if (!inprogressConnections.add(sid)) {
        return true;
    }
    try {
        connectionExecutor.execute(new QuorumConnectionReqThread(electionAddr, sid));
        connectionThreadCnt.incrementAndGet();
    } catch (Throwable e) {
        // Scheduling failed: undo the in-progress marker.
        inprogressConnections.remove(sid);
        return false;
    }
    return true;
}

// QuorumConnectionReqThread#run
/** Performs the connection attempt and always clears the in-progress marker afterwards. */
@Override
public void run() {
    try {
        initiateConnection(electionAddr, sid);
    } finally {
        inprogressConnections.remove(sid);
    }
}

/**
 * Actually establishes the socket connection (SSL or plain) to the remote
 * election address and then runs the connection handshake. The socket is
 * closed on any failure.
 */
public void initiateConnection(final MultipleAddresses electionAddr, final Long sid) {
    Socket sock = null;
    try {
        if (self.isSslQuorum()) {
            sock = self.getX509Util().createSSLSocket();
        } else {
            sock = SOCKET_FACTORY.get();
        }
        setSockOpts(sock);
        // cnxTO is passed as the connect timeout.
        sock.connect(electionAddr.getReachableOrOne(), cnxTO);
        if (sock instanceof SSLSocket) {
            SSLSocket sslSock = (SSLSocket) sock;
            sslSock.startHandshake();
        }
    } catch (X509Exception e) {
        closeSocket(sock);
        return;
    } catch (UnresolvedAddressException | IOException e) {
        closeSocket(sock);
        return;
    }

    try {
        startConnection(sock, sid);
    } catch (IOException e) {
        closeSocket(sock);
    }
}

/**
 * Initiator side of the handshake: sends protocol version, own id and own
 * election address(es), authenticates, and then keeps the connection only if
 * the remote id is not larger than ours.
 *
 * @return true if the connection was kept and its workers were started
 */
private boolean startConnection(Socket sock, Long sid) throws IOException {
    DataOutputStream dout = null;
    DataInputStream din = null;
    try {
        BufferedOutputStream buf = new BufferedOutputStream(sock.getOutputStream());
        dout = new DataOutputStream(buf);

        // V2 is used when multi-address is enabled, V1 otherwise.
        long protocolVersion = self.isMultiAddressEnabled() ? PROTOCOL_VERSION_V2 : PROTOCOL_VERSION_V1;
        dout.writeLong(protocolVersion);
        dout.writeLong(self.getId());

        // now we send our election address. For the new protocol version, we can send multiple addresses.
        Collection<InetSocketAddress> addressesToSend = protocolVersion == PROTOCOL_VERSION_V2
                ? self.getElectionAddress().getAllAddresses()
                : Arrays.asList(self.getElectionAddress().getOne());

        // Addresses are joined with '|' and sent as a length-prefixed byte array.
        String addr = addressesToSend.stream()
                .map(NetUtils::formatInetAddr).collect(Collectors.joining("|"));
        byte[] addr_bytes = addr.getBytes();
        dout.writeInt(addr_bytes.length);
        dout.write(addr_bytes);
        dout.flush();

        din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
    } catch (IOException e) {
        closeSocket(sock);
        return false;
    }

    // authenticate learner
    QuorumPeer.QuorumServer qps = self.getVotingView().get(sid);
    if (qps != null) {
        authLearner.authenticate(sock, qps.hostname);
    }

    // If lost the challenge, then drop the new connection
    // (mirror of handleConnection: a connection opened towards a larger id is not kept)
    if (sid > self.getId()) {
        closeSocket(sock);
    } else {
        SendWorker sw = new SendWorker(sock, sid);
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null) {
            vsw.finish();
        }

        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));

        sw.start();
        rw.start();

        return true;
    }
    return false;
}
【8】构建FastLeaderElection做了什么
//构建FastLeaderElection做了什么 public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager) { this.stop = false; this.manager = manager; starter(self, manager); } private void starter(QuorumPeer self, QuorumCnxManager manager) { this.self = self; proposedLeader = -1; proposedZxid = -1; sendqueue = new LinkedBlockingQueue<ToSend>(); recvqueue = new LinkedBlockingQueue<Notification>(); this.messenger = new Messenger(manager); } public static class Notification { public static final int CURRENTVERSION = 0x2; int version; long leader; long zxid; long electionEpoch; QuorumPeer.ServerState state; long sid; QuorumVerifier qv; long peerEpoch; } public static class ToSend { enum mType { crequest, challenge, notification, ack } long leader; long zxid; long electionEpoch; QuorumPeer.ServerState state; long sid; byte[] configData = dummyData; long peerEpoch; } Messenger(QuorumCnxManager manager) { this.ws = new WorkerSender(manager); this.wsThread = new Thread(this.ws, "WorkerSender[myid=" + self.getId() + "]"); this.wsThread.setDaemon(true); this.wr = new WorkerReceiver(manager); this.wrThread = new Thread(this.wr, "WorkerReceiver[myid=" + self.getId() + "]"); this.wrThread.setDaemon(true); } //FastLeaderElection的start()方法做了什么 public void start() { this.messenger.start(); } void start() { this.wsThread.start(); this.wrThread.start(); }
【9】分析发送工作者WorkerSender做了什么
//发送工作者WorkerSender做了什么 class WorkerSender extends ZooKeeperThread { volatile boolean stop; QuorumCnxManager manager; WorkerSender(QuorumCnxManager manager) { super("WorkerSender"); this.stop = false; this.manager = manager; } public void run() { while (!stop) { try { ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS); if (m == null) { continue; } process(m); } catch (InterruptedException e) { break; } } } void process(ToSend m) { ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), m.leader, m.zxid, m.electionEpoch, m.peerEpoch, m.configData); manager.toSend(m.sid, requestBuffer); } } //QuorumCnxManager类#toSend方法 public void toSend(Long sid, ByteBuffer b) { //If sending message to myself, then simply enqueue it (loopback). if (this.mySid == sid) { b.position(0); addToRecvQueue(new Message(b.duplicate(), sid)); //Otherwise send to the corresponding thread to send. } else { //应用层的发送队列数组,每个服务器对应一个队列,用他们的机器ID作为下标. BlockingQueue<ByteBuffer> bq = queueSendMap.computeIfAbsent(sid, serverId -> new CircularBlockingQueue<>(SEND_CAPACITY)); addToSendQueue(bq, b); connectOne(sid); } }
【10】分析接收工作者WorkerReceiver做了什么
//接收工作者 class WorkerReceiver extends ZooKeeperThread { volatile boolean stop; QuorumCnxManager manager; WorkerReceiver(QuorumCnxManager manager) { super("WorkerReceiver"); this.stop = false; this.manager = manager; } public void run() { Message response; while (!stop) { // Sleeps on receive try { response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS); if (response == null) { continue; } final int capacity = response.buffer.capacity(); // The current protocol and two previous generations all send at least 28 bytes if (capacity < 28) { continue; } // this is the backwardCompatibility mode in place before ZK-107 // It is for a version of the protocol in which we didn't send peer epoch // With peer epoch and version the message became 40 bytes boolean backCompatibility28 = (capacity == 28); // this is the backwardCompatibility mode for no version information boolean backCompatibility40 = (capacity == 40); response.buffer.clear(); // Instantiate Notification and set its attributes Notification n = new Notification(); int rstate = response.buffer.getInt(); long rleader = response.buffer.getLong(); long rzxid = response.buffer.getLong(); long relectionEpoch = response.buffer.getLong(); long rpeerepoch; int version = 0x0; QuorumVerifier rqv = null; try { if (!backCompatibility28) { rpeerepoch = response.buffer.getLong(); if (!backCompatibility40) { version = response.buffer.getInt(); } else {...} } else { rpeerepoch = ZxidUtils.getEpochFromZxid(rzxid); } // check if we have a version that includes config. If so extract config info from message. if (version > 0x1) { int configLength = response.buffer.getInt(); // we want to avoid errors caused by the allocation of a byte array with negative length // (causing NegativeArraySizeException) or huge length (causing e.g. 
OutOfMemoryError) if (configLength < 0 || configLength > capacity) { throw new IOException(...); } byte[] b = new byte[configLength]; response.buffer.get(b); synchronized (self) { try { rqv = self.configFromString(new String(b, UTF_8)); QuorumVerifier curQV = self.getQuorumVerifier(); if (rqv.getVersion() > curQV.getVersion()) { if (self.getPeerState() == ServerState.LOOKING) { self.processReconfig(rqv, null, null, false); if (!rqv.equals(curQV)) { self.shuttingDownLE = true; self.getElectionAlg().shutdown(); break; } } else {...} } } catch (IOException | ConfigException e) {...} } } else {...} } catch (BufferUnderflowException | IOException e) { continue; } /* * If it is from a non-voting server (such as an observer or * a non-voting follower), respond right away. */ if (!validVoter(response.sid)) { Vote current = self.getCurrentVote(); QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, current.getId(), current.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, current.getPeerEpoch(), qv.toString().getBytes(UTF_8)); sendqueue.offer(notmsg); } else { // Receive new message // State of peer that sent this message QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING; switch (rstate) { case 0: ackstate = QuorumPeer.ServerState.LOOKING; break; case 1: ackstate = QuorumPeer.ServerState.FOLLOWING; break; case 2: ackstate = QuorumPeer.ServerState.LEADING; break; case 3: ackstate = QuorumPeer.ServerState.OBSERVING; break; default: continue; } n.leader = rleader; n.zxid = rzxid; n.electionEpoch = relectionEpoch; n.state = ackstate; n.sid = response.sid; n.peerEpoch = rpeerepoch; n.version = version; n.qv = rqv; //如果这个服务器正处于looking状态,那么发送提议leader if (self.getPeerState() == QuorumPeer.ServerState.LOOKING) { recvqueue.offer(n); /* * Send a notification back if the peer that sent this * message is also looking and its logical clock is * lagging behind. 
*/ if ((ackstate == QuorumPeer.ServerState.LOOKING) && (n.electionEpoch < logicalclock.get())) { Vote v = getVote(); QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, v.getId(), v.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, v.getPeerEpoch(), qv.toString().getBytes()); sendqueue.offer(notmsg); } } else { //反之,如果选举结束了,则将自己服务器记录的leader信息发送回给对方 Vote current = self.getCurrentVote(); if (ackstate == QuorumPeer.ServerState.LOOKING) { if (self.leader != null) { if (leadingVoteSet != null) { self.leader.setLeadingVoteSet(leadingVoteSet); leadingVoteSet = null; } self.leader.reportLookingSid(response.sid); } QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, current.getId(), current.getZxid(), current.getElectionEpoch(), self.getPeerState(), response.sid, current.getPeerEpoch(), qv.toString().getBytes()); sendqueue.offer(notmsg); } } } } catch (InterruptedException e) {...} } } }
【11】第【5】步中 quorumPeer.start() 最后调用了父类的 start() 来启动线程;由于 QuorumPeer 本身就是一个线程(父类继承自 Thread),所以核心逻辑在 run 方法里面
/**
 * Main loop of the quorum peer thread. While running, it dispatches on the
 * current peer state: LOOKING runs leader election, OBSERVING / FOLLOWING /
 * LEADING run the respective role until it ends, after which the role object
 * is shut down and the server state is updated.
 *
 * Bodies shown as {...} and the "....." after the JMX bean creation are
 * elided in this excerpt.
 */
@Override
public void run() {
    updateThreadName();

    // JMX / monitoring registration (abridged in this excerpt).
    try {
        jmxQuorumBean = new QuorumBean(this);
        .....
    } catch (Exception e) {
        jmxQuorumBean = null;
    }

    try {
        // Main logic.
        while (running) {
            // Remember when the peer became unavailable, if not already recorded.
            if (unavailableStartTime == 0) {
                unavailableStartTime = Time.currentElapsedTime();
            }

            switch (getPeerState()) {
            case LOOKING:
                LOG.info("LOOKING");
                ServerMetrics.getMetrics().LOOKING_COUNT.add(1);

                if (Boolean.getBoolean("readonlymode.enabled")) {
                    // Read-only mode enabled: prepare a read-only server that is started only if
                    // the peer is still LOOKING after a grace period.
                    final ReadOnlyZooKeeperServer roZk = new ReadOnlyZooKeeperServer(logFactory, this, this.zkDb);

                    Thread roZkMgr = new Thread() {
                        public void run() {
                            try {
                                // lower-bound grace period to 2 secs
                                sleep(Math.max(2000, tickTime));
                                if (ServerState.LOOKING.equals(getPeerState())) {
                                    roZk.startup();
                                }
                            } catch (InterruptedException e) {...}
                            catch (Exception e) {...}
                        }
                    };
                    try {
                        roZkMgr.start();
                        reconfigFlagClear();
                        // The election was shut down (WorkerReceiver sets this flag): rebuild it first.
                        if (shuttingDownLE) {
                            shuttingDownLE = false;
                            startLeaderElection();
                        }
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        setPeerState(ServerState.LOOKING);
                    } finally {
                        // Election finished or failed: cancel the grace-period thread and stop the read-only server.
                        roZkMgr.interrupt();
                        roZk.shutdown();
                    }
                } else {
                    try {
                        reconfigFlagClear();
                        if (shuttingDownLE) {
                            shuttingDownLE = false;
                            startLeaderElection();
                        }
                        // Run the election and store the resulting vote as the current vote.
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception", e);
                        setPeerState(ServerState.LOOKING);
                    }
                }
                break;
            case OBSERVING:
                try {
                    LOG.info("OBSERVING");
                    setObserver(makeObserver(logFactory));
                    observer.observeLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    // NOTE(review): observer is dereferenced unconditionally here; this looks like it
                    // assumes makeObserver/setObserver succeeded - confirm.
                    observer.shutdown();
                    setObserver(null);
                    updateServerState();

                    // Add delay jitter before we switch to LOOKING
                    // state to reduce the load of ObserverMaster
                    if (isRunning()) {
                        Observer.waitForObserverElectionDelay();
                    }
                }
                break;
            case FOLLOWING:
                try {
                    LOG.info("FOLLOWING");
                    setFollower(makeFollower(logFactory));
                    follower.followLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    follower.shutdown();
                    setFollower(null);
                    updateServerState();
                }
                break;
            case LEADING:
                LOG.info("LEADING");
                try {
                    setLeader(makeLeader(logFactory));
                    leader.lead();
                    setLeader(null);
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    // leader is still set only when lead() did not return normally.
                    if (leader != null) {
                        leader.shutdown("Forcing shutdown");
                        setLeader(null);
                    }
                    updateServerState();
                }
                break;
            }
        }
    } finally {
        // The loop has ended: unregister all JMX beans and clear the references.
        LOG.warn("QuorumPeer main thread exited");
        MBeanRegistry instance = MBeanRegistry.getInstance();
        instance.unregister(jmxQuorumBean);
        instance.unregister(jmxLocalPeerBean);

        for (RemotePeerBean remotePeerBean : jmxRemotePeerBean.values()) {
            instance.unregister(remotePeerBean);
        }

        jmxQuorumBean = null;
        jmxLocalPeerBean = null;
        jmxRemotePeerBean = null;
    }
}
【12】分析核心的选举流程代码
// FastLeaderElection#lookForLeader
// Core election algorithm: bumps the logical clock, proposes a vote, broadcasts it, and then
// exchanges notifications until a vote is settled or the peer leaves the LOOKING state.
// Returns the final vote, or null when the loop ends without one (state changed or stop set).
public Vote lookForLeader() throws InterruptedException {
    // Monitoring (JMX) registration is elided in the article.
    try {...} catch (Exception e) {...}

    self.start_fle = Time.currentElapsedTime();
    try {
        // Votes received in the current election round, keyed by sender sid.
        Map<Long, Vote> recvset = new HashMap<Long, Vote>();

        // Votes reported by peers in FOLLOWING/LEADING state; filled by the
        // received*Notification helpers.
        Map<Long, Vote> outofelection = new HashMap<Long, Vote>();

        int notTimeout = minNotificationInterval;

        synchronized (this) {
            // Advance the election epoch (logical clock).
            logicalclock.incrementAndGet();
            // Reset the proposal from the init values (per the article: on a fresh start this
            // means voting for itself).
            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
        }

        // Broadcast the current proposal.
        sendNotifications();

        SyncedLearnerTracker voteSet = null;

        // Exchange notifications in this loop until a leader is found.
        while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
            // Wait up to notTimeout ms for a vote sent by another peer.
            Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);

            // Nothing arrived within the timeout. Examples from the article: 8 notifications
            // were broadcast but only 3 replies came back because of network problems, so the
            // 4th poll is empty; or all replies arrived but the election is not finished yet,
            // so the next poll is empty too. Either way the peer must keep soliciting votes
            // until a leader is elected.
            if (n == null) {
                // haveDelivered() (shown further down) returns true when at least one
                // per-peer send queue is empty.
                if (manager.haveDelivered()) {
                    // Re-send the proposal so that peers answer again.
                    sendNotifications();
                } else {
                    // Try to (re)connect to every server in the ensemble.
                    manager.connectAll();
                }

                // Exponential back-off: double the poll timeout, capped at maxNotificationInterval.
                notTimeout = Math.min(notTimeout << 1, maxNotificationInterval);

                // Only when the quorum verifier is a QuorumOracleMaj: the election may end here
                // if revalidateVoteset(...) accepts the current vote set.
                if (self.getQuorumVerifier() instanceof QuorumOracleMaj
                    && self.getQuorumVerifier().revalidateVoteset(voteSet, notTimeout != minNotificationInterval)) {
                    setPeerState(proposedLeader, voteSet);
                    Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
                    leaveInstance(endVote);
                    return endVote;
                }
            }
            // Both the sender's sid and the sid of the leader it recommends must pass validVoter().
            else if (validVoter(n.sid) && validVoter(n.leader)) {
                switch (n.state) {
                case LOOKING:
                    // Skip the notification when either side reports a zxid of -1.
                    if (getInitLastLoggedZxid() == -1) {
                        break;
                    }
                    if (n.zxid == -1) {
                        break;
                    }
                    // The sender's election epoch is ahead of ours (per the article: e.g. this
                    // node was down for a while and fell behind). Adopt the newer epoch, discard
                    // the votes collected so far, and restart the comparison from our init values.
                    if (n.electionEpoch > logicalclock.get()) {
                        logicalclock.set(n.electionEpoch);
                        recvset.clear();
                        if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
                        }
                        sendNotifications();
                    }
                    // The sender's election epoch is behind ours: the vote is stale, ignore it.
                    else if (n.electionEpoch < logicalclock.get()) {
                        break;
                    }
                    // Same epoch: compare the received vote with our current proposal.
                    else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
                        // Only when the received vote wins do we adopt it and broadcast again.
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                        sendNotifications();
                    }

                    // Record the sender's vote (overwrites any earlier vote from the same sid).
                    recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

                    // Build the tracker for our current proposal from the recorded votes.
                    voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));

                    // hasAllQuorums(): presumably true once a quorum supports the current proposal.
                    if (voteSet.hasAllQuorums()) {

                        // A quorum alone does not finalize the result yet. Keep polling with a
                        // timeout of finalizeWait (200 ms according to the article; the constant is
                        // defined outside this excerpt). If a vote that beats our proposal arrives,
                        // put it back on the queue and break out so the outer loop compares again.
                        while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
                            if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
                                recvqueue.put(n);
                                break;
                            }
                        }

                        // No further notification arrived within finalizeWait: the election ends.
                        // Per the article, setPeerState(...) switches this peer to LEADING when the
                        // elected leader id equals its own id and to FOLLOWING otherwise; the
                        // method then leaves the election and returns the final vote.
                        if (n == null) {
                            setPeerState(proposedLeader, voteSet);
                            Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }
                    break;
                case OBSERVING:
                    // Notifications from observers are ignored.
                    break;
                case FOLLOWING:
                    // The sender already follows a leader; a non-null result ends the election.
                    Vote resultFN = receivedFollowingNotification(recvset, outofelection, voteSet, n);
                    if (resultFN == null) {
                        break;
                    } else {
                        return resultFN;
                    }
                case LEADING:
                    // The sender claims to be the leader; a non-null result ends the election.
                    Vote resultLN = receivedLeadingNotification(recvset, outofelection, voteSet, n);
                    if (resultLN == null) {
                        break;
                    } else {
                        return resultLN;
                    }
                default:
                    break;
                }
            } else {
                // Handling of invalid leader / sender ids is elided in the article.
                if (!validVoter(n.leader)) {...}
                if (!validVoter(n.sid)) {...}
            }
        }
        return null;
    } finally {
        // Monitoring-related cleanup is omitted in the article.
    }
}

// Broadcasts the current proposal.
private void sendNotifications() {
    // One message per sid returned by getCurrentAndNextConfigVoters()
    // (per the article: the nodes that may take part in the election).
    for (long sid : self.getCurrentAndNextConfigVoters()) {
        QuorumVerifier qv = self.getQuorumVerifier();
        ToSend notmsg = new ToSend(
            ToSend.mType.notification,
            proposedLeader,
            proposedZxid,
            logicalclock.get(), // current election epoch (logical clock)
            QuorumPeer.ServerState.LOOKING, // state advertised for this node
            sid,
            proposedEpoch,
            qv.toString().getBytes(UTF_8));

        // Enqueue on sendqueue; the loop adds one entry per target, and (per the article)
        // WorkerSender later dispatches them.
        sendqueue.offer(notmsg);
    }
}

// Delivery check, invoked as manager.haveDelivered() from lookForLeader().
boolean haveDelivered() {
    for (BlockingQueue<ByteBuffer> queue : queueSendMap.values()) {
        final int queueSize = queue.size();
        // Return true as soon as one queue is empty; the remaining queues are not inspected.
        // The article reads an empty queue as evidence that this server's connection to the
        // ensemble is working.
        if (queueSize == 0) {
            return true;
        }
    }
    // Every queue still holds messages (also reached when queueSendMap is empty); the article
    // reads this as the server having lost contact with the ensemble.
    return false;
}

// Vote comparison: returns true when the new vote beats the current one.
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    // A candidate whose weight is 0 according to the quorum verifier never wins.
    if (self.getQuorumVerifier().getWeight(newId) == 0) {
        return false;
    }
    // The new vote wins when, in this order of precedence:
    //   1. its epoch is higher;
    //   2. the epochs are equal and its zxid is higher;
    //   3. epoch and zxid are both equal and its server id is higher.
    return ((newEpoch > curEpoch)
            || ((newEpoch == curEpoch)
                && ((newZxid > curZxid)
                    || ((newZxid == curZxid)
                        && (newId > curId)))));
}
【13】上面的流程已经能够合理地选出Leader。此时如果有新的节点加入,它收到的通知状态会是FOLLOWING或者LEADING,由下面两个方法处理
//接收工作者WorkerReceiver会判断自身是否还在选举阶段,不是则会将自己服务器记录的leader信息发送回给对方 //则此时新加的机器节点的选票状态会是FOLLOWING或者LEADING private Vote receivedLeadingNotification(Map<Long, Vote> recvset, Map<Long, Vote> outofelection, SyncedLearnerTracker voteSet, Notification n) { Vote result = receivedFollowingNotification(recvset, outofelection, voteSet, n); if (result == null) { if (self.getQuorumVerifier().getNeedOracle() && !self.getQuorumVerifier().askOracle()) { setPeerState(n.leader, voteSet); Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } else { return null; } } else { return result; } } private Vote receivedFollowingNotification(Map<Long, Vote> recvset, Map<Long, Vote> outofelection, SyncedLearnerTracker voteSet, Notification n) { if (n.electionEpoch == logicalclock.get()) { recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); voteSet = getVoteTracker(recvset, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); if (voteSet.hasAllQuorums() && checkLeader(recvset, n.leader, n.electionEpoch)) { setPeerState(n.leader, voteSet); Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } } outofelection.put(n.sid, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); voteSet = getVoteTracker(outofelection, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); if (voteSet.hasAllQuorums() && checkLeader(outofelection, n.leader, n.electionEpoch)) { synchronized (this) { logicalclock.set(n.electionEpoch); setPeerState(n.leader, voteSet); } Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } return null; }
【14】总结:整个选举流程涉及到有价值的点在于:
1.对于多余的socket连接进行了关闭【但个人觉得,为什么不在监听者那里就进行判断呢?这样可以减少连接,而不是建立了又断开、反复折腾;可能还有我不清楚的点】
2.采用了多层队列架构(异步来优化性能),分别为
构建FastLeaderElection,持有两个LinkedBlockingQueue: sendqueue recvqueue 持有messenger,是经过Messenger类包装过的QuorumCnxManager messenger的持有: //两个线程 WorkerSender,负责从sendqueue拿到数据,然后去queueSendMap找对应的sid的队列塞入数据 WorkerReceiver,负责从manager的recvQueue中拿数据传到recvqueue //持有一个队列 recvQueue //对应每个socket连接都会有 RecvWorker,负责将DataInputStream里面的数据转而传到messenger的recvQueue队列 SendWorker,负责将各自队列里面的数据写入 DataOutputStream 而他们的存放 SendWorker持有RecvWorker senderWorkerMap{ ... [sid0->SendWorker], [sid1->SendWorker] } //队列集合 queueSendMap{ ... [sid0->CircularBlockingQueue], [sid1->CircularBlockingQueue] } //数据传输的流为 DataOutputStream DataInputStream
启动或leader宕机选举leader流程
leader选举多层队列架构
【1】整个zookeeper选举底层可以分为选举应用层和消息传输层。应用层有自己的队列,统一接收和发送选票;传输层也设计了自己的队列,并且按目标机器分别设置了发送队列,避免给各台机器发送消息时相互影响:比如某台机器出问题导致发送不成功,也不会影响对正常机器的消息发送。
Leader选举源码流程图