//org.apache.zookeeper.server.quorum包下QuorumPeerMain类 public static void main(String[] args) { QuorumPeerMain main = new QuorumPeerMain(); try { main.initializeAndRun(args); } catch (IllegalArgumentException e) {..} catch (ConfigException e) {..} catch (DatadirException e) {..} catch (AdminServerException e) {..} catch (Exception e) {..} ServiceUtils.requestSystemExit(ExitCode.EXECUTION_FINISHED.getValue()); } protected void initializeAndRun(String[] args) throws ConfigException, IOException, AdminServerException { QuorumPeerConfig config = new QuorumPeerConfig(); if (args.length == 1) { //解析配置文件加载到内存 //主要是调用了QuorumPeerConfig类#parse方法,解析逻辑在parseProperties方法 config.parse(args[0]); } //启动延时的定期清理快照数据文件 DatadirCleanupManager purgeMgr = new DatadirCleanupManager( config.getDataDir(), config.getDataLogDir(), config.getSnapRetainCount(), config.getPurgeInterval()); purgeMgr.start(); if (args.length == 1 && config.isDistributed()) { //集群的入口 runFromConfig(config); } else { //单机的入口 ZooKeeperServerMain.main(args); } }
/**
 * Builds a QuorumPeer from the parsed configuration, starts it and blocks until
 * the peer thread terminates ({@code quorumPeer.join()}).
 * The metrics provider started at the beginning is always stopped in the
 * {@code finally} block.
 *
 * @param config the parsed quorum configuration
 * @throws IOException if the metrics provider cannot be started (the
 *         MetricsProviderLifeCycleException is wrapped as the cause)
 */
public void runFromConfig(QuorumPeerConfig config) throws IOException, AdminServerException {
    try {
        ManagedUtil.registerLog4jMBeans();
    } catch (JMException e) {
        LOG.warn("Unable to register log4j JMX control", e);
    }

    LOG.info("Starting quorum peer, myid=" + config.getServerId());
    final MetricsProvider metricsProvider;
    try {
        metricsProvider = MetricsProviderBootstrap.startMetricsProvider(
            config.getMetricsProviderClassName(),
            config.getMetricsProviderConfiguration());
    } catch (MetricsProviderLifeCycleException error) {
        throw new IOException("Cannot boot MetricsProvider " + config.getMetricsProviderClassName(), error);
    }
    try {
        ServerMetrics.metricsProviderInitialized(metricsProvider);
        ProviderRegistry.initialize();
        ServerCnxnFactory cnxnFactory = null;
        ServerCnxnFactory secureCnxnFactory = null;

        if (config.getClientPortAddress() != null) {
            // Create the server-side connection factory.
            cnxnFactory = ServerCnxnFactory.createFactory();
            // Configure the listening address, taken from the configuration file.
            cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
        }

        if (config.getSecureClientPortAddress() != null) {
            secureCnxnFactory = ServerCnxnFactory.createFactory();
            secureCnxnFactory.configure(config.getSecureClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), true);
        }

        // Build the local peer and copy the configuration values into it.
        quorumPeer = getQuorumPeer();
        quorumPeer.setTxnFactory(new FileTxnSnapLog(config.getDataLogDir(), config.getDataDir()));
        quorumPeer.enableLocalSessions(config.areLocalSessionsEnabled());
        quorumPeer.enableLocalSessionsUpgrading(config.isLocalSessionsUpgradingEnabled());
        //quorumPeer.setQuorumPeers(config.getAllMembers());
        quorumPeer.setElectionType(config.getElectionAlg());
        quorumPeer.setMyid(config.getServerId());
        quorumPeer.setTickTime(config.getTickTime());
        quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
        quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
        quorumPeer.setInitLimit(config.getInitLimit());
        quorumPeer.setSyncLimit(config.getSyncLimit());
        quorumPeer.setConnectToLearnerMasterLimit(config.getConnectToLearnerMasterLimit());
        quorumPeer.setObserverMasterPort(config.getObserverMasterPort());
        quorumPeer.setConfigFileName(config.getConfigFilename());
        quorumPeer.setClientPortListenBacklog(config.getClientPortListenBacklog());
        quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
        quorumPeer.setQuorumVerifier(config.getQuorumVerifier(), false);
        if (config.getLastSeenQuorumVerifier() != null) {
            quorumPeer.setLastSeenQuorumVerifier(config.getLastSeenQuorumVerifier(), false);
        }
        quorumPeer.initConfigInZKDatabase();
        // Hand the connection factories to the local peer as well.
        quorumPeer.setCnxnFactory(cnxnFactory);
        quorumPeer.setSecureCnxnFactory(secureCnxnFactory);
        quorumPeer.setSslQuorum(config.isSslQuorum());
        quorumPeer.setUsePortUnification(config.shouldUsePortUnification());
        quorumPeer.setLearnerType(config.getPeerType());
        quorumPeer.setSyncEnabled(config.getSyncEnabled());
        quorumPeer.setQuorumListenOnAllIPs(config.getQuorumListenOnAllIPs());
        if (config.sslQuorumReloadCertFiles) {
            quorumPeer.getX509Util().enableCertFileReloading();
        }
        quorumPeer.setMultiAddressEnabled(config.isMultiAddressEnabled());
        quorumPeer.setMultiAddressReachabilityCheckEnabled(config.isMultiAddressReachabilityCheckEnabled());
        quorumPeer.setMultiAddressReachabilityCheckTimeoutMs(config.getMultiAddressReachabilityCheckTimeoutMs());

        // sets quorum sasl authentication configurations
        quorumPeer.setQuorumSaslEnabled(config.quorumEnableSasl);
        if (quorumPeer.isQuorumSaslAuthEnabled()) {
            quorumPeer.setQuorumServerSaslRequired(config.quorumServerRequireSasl);
            quorumPeer.setQuorumLearnerSaslRequired(config.quorumLearnerRequireSasl);
            quorumPeer.setQuorumServicePrincipal(config.quorumServicePrincipal);
            quorumPeer.setQuorumServerLoginContext(config.quorumServerLoginContext);
            quorumPeer.setQuorumLearnerLoginContext(config.quorumLearnerLoginContext);
        }
        quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
        quorumPeer.initialize();

        if (config.jvmPauseMonitorToRun) {
            quorumPeer.setJvmPauseMonitor(new JvmPauseMonitor(config));
        }

        // Start the peer.
        quorumPeer.start();
        ZKAuditProvider.addZKStartStopAuditLog();
        // Block here until the peer thread ends.
        quorumPeer.join();
    } catch (InterruptedException e) {
        // warn, but generally this is ok
        LOG.warn("Quorum Peer interrupted", e);
    } finally {
        try {
            metricsProvider.stop();
        } catch (Throwable error) {
            LOG.warn("Error while stopping metrics", error);
        }
    }
}
//ServerCnxnFactory类#createFactory方法 //初始化通信对象 public static ServerCnxnFactory createFactory() throws IOException { //属性值展示:String ZOOKEEPER_SERVER_CNXN_FACTORY = "zookeeper.serverCnxnFactory" //官方推荐netty:则应该是ServerCnxnFactory类的子类NettyServerCnxnFactory String serverCnxnFactoryName = System.getProperty(ZOOKEEPER_SERVER_CNXN_FACTORY); if (serverCnxnFactoryName == null) { //但是默认是子类NIOServerCnxnFactory serverCnxnFactoryName = NIOServerCnxnFactory.class.getName(); } try { //利用反射进行初始化 ServerCnxnFactory serverCnxnFactory = (ServerCnxnFactory) Class.forName(serverCnxnFactoryName).getDeclaredConstructor().newInstance(); LOG.info("Using {} as server connection factory", serverCnxnFactoryName); return serverCnxnFactory; } catch (Exception e) { IOException ioe = new IOException("Couldn't instantiate " + serverCnxnFactoryName, e); throw ioe; } }
// DataTree class, package org.apache.zookeeper.server
// The node data is held in: final NodeHashMap nodes;
/** Field excerpt of DataTree; methods are not shown in this excerpt. */
public class DataTree {
    private static final Logger LOG = LoggerFactory.getLogger(DataTree.class);
    private final RateLogger RATE_LOGGER = new RateLogger(LOG, 15 * 60 * 1000);

    // This map provides a fast lookup of the data nodes.
    private final NodeHashMap nodes;
    private IWatchManager dataWatches;
    private IWatchManager childWatches;

    // Caches the total size of the paths and data of all data nodes.
    private final AtomicLong nodeDataSize = new AtomicLong(0);

    // The root node.
    private static final String rootZookeeper = "/";
    private static final String procZookeeper = Quotas.procZookeeper;
    private static final String procChildZookeeper = procZookeeper.substring(1);
    private static final String quotaZookeeper = Quotas.quotaZookeeper;
    private static final String quotaChildZookeeper = quotaZookeeper.substring(procZookeeper.length() + 1);
    private static final String configZookeeper = ZooDefs.CONFIG_NODE;
    private static final String configChildZookeeper = configZookeeper.substring(procZookeeper.length() + 1);

    private final PathTrie pTrie = new PathTrie();
    public static final int STAT_OVERHEAD_BYTES = (6 * 8) + (5 * 4);

    // Keyed by a Long; presumably the owning session id -- TODO confirm.
    private final Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<Long, HashSet<String>>();
    private final Set<String> containers = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
    private final Set<String> ttls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
    private final ReferenceCountedACLCache aclCache = new ReferenceCountedACLCache();

    public static final int DIGEST_LOG_LIMIT = 1024;
    public static final int DIGEST_LOG_INTERVAL = 128;
    private ZxidDigest digestFromLoadedSnapshot;
    private volatile ZxidDigest lastProcessedZxidDigest;
    private boolean firstMismatchTxn = true;
    private final List<DigestWatcher> digestWatchers = new ArrayList<>();
    private LinkedList<ZxidDigest> digestLog = new LinkedList<>();
    private final DigestCalculator digestCalculator;
}

/** Field excerpt of DataNode, the element type stored in the tree. */
public class DataNode implements Record {
    private volatile long digest;
    // Indicates whether this node's digest is up to date.
    volatile boolean digestCached;
    byte[] data;
    Long acl;
    public StatPersisted stat;
    private Set<String> children = null;
    private static final Set<String> EMPTY_SET = Collections.emptySet();
}
@Override public synchronized void start() { if (!getView().containsKey(myid)) { throw new RuntimeException("My id " + myid + " not in the peer list"); } //加载快照文件数据到内存 loadDataBase(); //启动通信对象 startServerCnxnFactory(); try { //JettyAdminServer,启动内嵌Jetty服务,默认8080端口 adminServer.start(); } catch (AdminServerException e) { LOG.warn("Problem starting AdminServer", e); } //初始化选举数据 startLeaderElection(); startJvmPauseMonitor(); super.start(); } private void startServerCnxnFactory() { if (cnxnFactory != null) { //如果有配置netty通信,则NettyServerCnxnFactory类#start方法 cnxnFactory.start(); } if (secureCnxnFactory != null) { secureCnxnFactory.start(); } } //NettyServerCnxnFactory类#start方法 @Override public void start() { if (listenBacklog != -1) { bootstrap.option(ChannelOption.SO_BACKLOG, listenBacklog); } LOG.info("binding to port {}", localAddress); parentChannel = bootstrap.bind(localAddress).syncUninterruptibly().channel(); // Port changes after bind() if the original port was 0, update // localAddress to get the real port. localAddress = (InetSocketAddress) parentChannel.localAddress(); LOG.info("bound to port {}", getLocalPort()); } //选举数据构建 public synchronized void startLeaderElection() { try { if (getPeerState() == ServerState.LOOKING) { //构建选票,myid服务器id标记,最大的事务id,当前服务器的选举轮次 currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch()); } } catch (IOException e) { RuntimeException re = new RuntimeException(e.getMessage()); re.setStackTrace(e.getStackTrace()); throw re; } //确定选举算法,默认传的是3 this.electionAlg = createElectionAlgorithm(electionType); } //节点状态 public enum ServerState { LOOKING, //等待状态 FOLLOWING, //从节点 LEADING, //主节点 OBSERVING //观察状态 }
//选举算法分析(3.8版本已经将过时的算法去除了) protected Election createElectionAlgorithm(int electionAlgorithm) { Election le = null; //TODO: use a factory rather than a switch switch (electionAlgorithm) { case 1: throw new UnsupportedOperationException("Election Algorithm 1 is not supported."); case 2: throw new UnsupportedOperationException("Election Algorithm 2 is not supported."); case 3: QuorumCnxManager qcm = createCnxnManager(); QuorumCnxManager oldQcm = qcmRef.getAndSet(qcm); if (oldQcm != null) { LOG.warn("Clobbering already-set QuorumCnxManager (restarting leader election?)"); oldQcm.halt(); } QuorumCnxManager.Listener listener = qcm.listener; if (listener != null) { //启动监听线程 listener.start(); //构建收发消息线程 FastLeaderElection fle = new FastLeaderElection(this, qcm); fle.start(); le = fle; } else { LOG.error("Null listener when initializing cnx manager"); } break; default: assert false; } return le; }
【7】翻阅监听线程 listener 做了什么
// What the listener thread does -- the interesting part is run()
/**
 * Listener thread body: creates one ListenerHandler per election address, runs
 * them on a fixed thread pool, waits on a latch until all handlers have finished,
 * and then closes every handler. If the listener leaves without a shutdown having
 * been requested and a socket exception was recorded, the bind error handler runs.
 */
@Override
public void run() {
    if (!shutdown) {
        LOG.debug("Listener thread started, myId: {}", self.getId());
        Set<InetSocketAddress> addresses;

        if (self.getQuorumListenOnAllIPs()) {
            addresses = self.getElectionAddress().getWildcardAddresses();
        } else {
            addresses = self.getElectionAddress().getAllAddresses();
        }

        CountDownLatch latch = new CountDownLatch(addresses.size());
        // Build one ListenerHandler for each address.
        listenerHandlers = addresses.stream().map(address ->
                        new ListenerHandler(address, self.shouldUsePortUnification(), self.isSslQuorum(), latch))
                .collect(Collectors.toList());

        // Each ListenerHandler is processed by its own thread (thread pool sized to the address count).
        final ExecutorService executor = Executors.newFixedThreadPool(addresses.size());
        try {
            listenerHandlers.forEach(executor::submit);
        } finally {
            // prevent executor's threads to leak after ListenerHandler tasks complete
            executor.shutdown();
        }

        try {
            // Each handler counts the latch down in its finally block.
            latch.await();
        } catch (InterruptedException ie) {..}
        finally {
            // Clean up for shutdown.
            for (ListenerHandler handler : listenerHandlers) {
                try {
                    handler.close();
                } catch (IOException ie) {...}
            }
        }
    }

    LOG.info("Leaving listener");
    if (!shutdown) {
        if (socketException.get()) {
            // After leaving listener thread, the host cannot join the quorum anymore,
            // this is a severe error that we cannot recover from, so we need to exit
            socketBindErrorHandler.run();
        }
    }
}

/** Accepts election connections on a single address. */
class ListenerHandler implements Runnable, Closeable {
    private ServerSocket serverSocket;
    private InetSocketAddress address;
    private boolean portUnification;
    private boolean sslQuorum;
    private CountDownLatch latch;

    ListenerHandler(InetSocketAddress address, boolean portUnification, boolean sslQuorum,CountDownLatch latch) {
        this.address = address;
        this.portUnification = portUnification;
        this.sslQuorum = sslQuorum;
        this.latch = latch;
    }

    /**
     * Sleeps on acceptConnections().
     */
    @Override
    public void run() {
        try {
            Thread.currentThread().setName("ListenerHandler-" + address);
            // Accept incoming connections.
            acceptConnections();
            try {
                close();
            } catch (IOException e) {...}
        } catch (Exception e) {...}
        finally {
            // Signal the listener thread that this handler is done.
            latch.countDown();
        }
    }

    /** Closes the server socket if it exists and is still open. */
    @Override
    public synchronized void close() throws IOException {
        if (serverSocket != null && !serverSocket.isClosed()) {
            LOG.debug("Trying to close listeners: {}", serverSocket);
            serverSocket.close();
        }
    }

    /**
     * Sleeps on accept().
     */
    private void acceptConnections() {
        int numRetries = 0;
        Socket client = null;

        // portBindMaxRetry == 0 means the retry count is not limited by this condition.
        while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
            try {
                // Create the server socket.
                serverSocket = createNewServerSocket();
                LOG.info("{} is accepting connections now, my election bind port: {}", QuorumCnxManager.this.mySid, address.toString());
                while (!shutdown) {
                    try {
                        client = serverSocket.accept();
                        setSockOpts(client);
                        // Handle the connection message.
                        if (quorumSaslAuthEnabled) {
                            receiveConnectionAsync(client);
                        } else {
                            receiveConnection(client);
                        }
                        numRetries = 0;
                    } catch (SocketTimeoutException e) {...}
                }
            } catch (IOException e) {
                if (shutdown) {
                    break;
                }
                if (e instanceof SocketException) {
                    socketException.set(true);
                }
                numRetries++;
                try {
                    close();
                    Thread.sleep(1000);
                } catch (IOException ie) {...}
                catch (InterruptedException ie) {...}
                closeSocket(client);
            }
        }
        if (!shutdown) {...}
    }

    /** Creates a plain or unified (TLS-capable) server socket and binds it to {@code address}. */
    private ServerSocket createNewServerSocket() throws IOException {
        ServerSocket socket;

        if (portUnification) {
            LOG.info("Creating TLS-enabled quorum server socket");
            socket = new UnifiedServerSocket(self.getX509Util(), true);
        } else if (sslQuorum) {
            LOG.info("Creating TLS-only quorum server socket");
            socket = new UnifiedServerSocket(self.getX509Util(), false);
        } else {
            socket = new ServerSocket();
        }

        socket.setReuseAddress(true);
        address = new InetSocketAddress(address.getHostString(), address.getPort());
        // Bind the address and port.
        socket.bind(address);

        return socket;
    }
}
/**
 * Handles a freshly accepted election connection; the socket is closed if the
 * input stream cannot be set up or handling fails with an IOException.
 */
public void receiveConnection(final Socket sock) {
    DataInputStream din = null;
    try {
        din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
        handleConnection(sock, din);
    } catch (IOException e) {
        closeSocket(sock);
    }
}

/**
 * Reads the remote server id (directly, or from an InitialMessage when the first
 * long is negative), authenticates the peer, and then keeps exactly one
 * connection per server pair: the one initiated by the server with the larger id.
 */
private void handleConnection(Socket sock, DataInputStream din) throws IOException {
    Long sid = null, protocolVersion = null;
    MultipleAddresses electionAddr = null;

    try {
        // Read a long from the input stream (it may actually be the server id).
        protocolVersion = din.readLong();
        if (protocolVersion >= 0) { // this is a server id and not a protocol version
            sid = protocolVersion;
        } else {
            try {
                InitialMessage init = InitialMessage.parse(protocolVersion, din);
                sid = init.sid;
                if (!init.electionAddr.isEmpty()) {
                    electionAddr = new MultipleAddresses(init.electionAddr,
                            Duration.ofMillis(self.getMultiAddressReachabilityCheckTimeoutMs()));
                }
            } catch (InitialMessage.InitialMessageException ex) {
                closeSocket(sock);
                return;
            }
        }

        if (sid == QuorumPeer.OBSERVER_ID) {
            // Replace the shared observer id with a value taken from a decrementing counter.
            sid = observerCounter.getAndDecrement();
        }
    } catch (IOException e) {
        closeSocket(sock);
        return;
    }

    // do authenticating learner
    authServer.authenticate(sock, din);

    // Close the unnecessary connection.
    // A socket is full duplex and every server tries to connect to every other one,
    // so two channels can exist per pair (the one we opened and the one the peer opened).
    // One of the two is redundant.
    if (sid < self.getId()) {
        // The remote id is smaller than ours.
        SendWorker sw = senderWorkerMap.get(sid);
        if (sw != null) {
            sw.finish();
        }

        // Close the current (incoming) connection.
        closeSocket(sock);

        // Open a connection from this node to the remote node instead.
        if (electionAddr != null) {
            connectOne(sid, electionAddr);
        } else {
            connectOne(sid);
        }
    }
    // Connection from ourselves: body elided in this excerpt.
    else if (sid == self.getId()) {...}
    else {
        // The remote id is larger than ours:
        // keep using the connection the remote node opened to us.
        SendWorker sw = new SendWorker(sock, sid);
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null) {
            vsw.finish();
        }

        // Update senderWorkerMap and queueSendMap.
        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));

        sw.start();
        rw.start();
    }
}

// Open a connection from this node to the remote node
/**
 * @return {@code true} if a sender worker already exists for {@code sid};
 *         otherwise the result of {@link #initiateConnectionAsync}
 */
synchronized boolean connectOne(long sid, MultipleAddresses electionAddr) {
    // Check whether the connection already exists.
    if (senderWorkerMap.get(sid) != null) {
        if (self.isMultiAddressEnabled() && electionAddr.size() > 1 && self.isMultiAddressReachabilityCheckEnabled()) {
            senderWorkerMap.get(sid).asyncValidateIfSocketIsStillReachable();
        }
        return true;
    }
    // Initiate the connection.
    return initiateConnectionAsync(electionAddr, sid);
}

/**
 * Schedules a connection attempt on {@code connectionExecutor} unless one is
 * already in progress for {@code sid}.
 *
 * @return {@code false} only if submitting the task failed
 */
public boolean initiateConnectionAsync(final MultipleAddresses electionAddr, final Long sid) {
    if (!inprogressConnections.add(sid)) {
        // An attempt for this sid is already in flight.
        return true;
    }
    try {
        connectionExecutor.execute(new QuorumConnectionReqThread(electionAddr, sid));
        connectionThreadCnt.incrementAndGet();
    } catch (Throwable e) {
        inprogressConnections.remove(sid);
        return false;
    }
    return true;
}

// QuorumConnectionReqThread#run
@Override
public void run() {
    try {
        initiateConnection(electionAddr, sid);
    } finally {
        // Always clear the in-progress marker, whether or not the attempt succeeded.
        inprogressConnections.remove(sid);
    }
}

// Actually establishes the socket connection
/** Connects (optionally over TLS) to the peer and hands the socket to {@code startConnection}. */
public void initiateConnection(final MultipleAddresses electionAddr, final Long sid) {
    Socket sock = null;
    try {
        if (self.isSslQuorum()) {
            sock = self.getX509Util().createSSLSocket();
        } else {
            sock = SOCKET_FACTORY.get();
        }
        setSockOpts(sock);
        sock.connect(electionAddr.getReachableOrOne(), cnxTO);
        if (sock instanceof SSLSocket) {
            SSLSocket sslSock = (SSLSocket) sock;
            sslSock.startHandshake();
        }
    } catch (X509Exception e) {
        closeSocket(sock);
        return;
    } catch (UnresolvedAddressException | IOException e) {
        closeSocket(sock);
        return;
    }

    try {
        startConnection(sock, sid);
    } catch (IOException e) {
        closeSocket(sock);
    }
}

/**
 * Sends the initial handshake (protocol version, own id, own election
 * address(es)), authenticates, and then either drops the connection (remote id is
 * larger than ours) or registers send/receive workers for it.
 *
 * @return {@code true} if the connection was kept and its workers were started
 */
private boolean startConnection(Socket sock, Long sid) throws IOException {
    DataOutputStream dout = null;
    DataInputStream din = null;
    try {
        BufferedOutputStream buf = new BufferedOutputStream(sock.getOutputStream());
        dout = new DataOutputStream(buf);

        long protocolVersion = self.isMultiAddressEnabled() ? PROTOCOL_VERSION_V2 : PROTOCOL_VERSION_V1;
        dout.writeLong(protocolVersion);
        dout.writeLong(self.getId());

        // now we send our election address. For the new protocol version, we can send multiple addresses.
        Collection<InetSocketAddress> addressesToSend = protocolVersion == PROTOCOL_VERSION_V2
                ? self.getElectionAddress().getAllAddresses()
                : Arrays.asList(self.getElectionAddress().getOne());

        String addr = addressesToSend.stream()
                .map(NetUtils::formatInetAddr).collect(Collectors.joining("|"));
        // NOTE(review): getBytes() uses the platform default charset -- confirm it matches
        // the charset used by the receiving side when parsing the address list.
        byte[] addr_bytes = addr.getBytes();
        dout.writeInt(addr_bytes.length);
        dout.write(addr_bytes);
        dout.flush();

        din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
    } catch (IOException e) {
        closeSocket(sock);
        return false;
    }

    // authenticate learner
    QuorumPeer.QuorumServer qps = self.getVotingView().get(sid);
    if (qps != null) {
        authLearner.authenticate(sock, qps.hostname);
    }

    // If lost the challenge, then drop the new connection
    if (sid > self.getId()) {
        closeSocket(sock);
    } else {
        SendWorker sw = new SendWorker(sock, sid);
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null) {
            vsw.finish();
        }

        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));

        sw.start();
        rw.start();

        return true;
    }
    return false;
}
//构建FastLeaderElection做了什么 public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager) { this.stop = false; this.manager = manager; starter(self, manager); } private void starter(QuorumPeer self, QuorumCnxManager manager) { this.self = self; proposedLeader = -1; proposedZxid = -1; sendqueue = new LinkedBlockingQueue<ToSend>(); recvqueue = new LinkedBlockingQueue<Notification>(); this.messenger = new Messenger(manager); } public static class Notification { public static final int CURRENTVERSION = 0x2; int version; long leader; long zxid; long electionEpoch; QuorumPeer.ServerState state; long sid; QuorumVerifier qv; long peerEpoch; } public static class ToSend { enum mType { crequest, challenge, notification, ack } long leader; long zxid; long electionEpoch; QuorumPeer.ServerState state; long sid; byte[] configData = dummyData; long peerEpoch; } Messenger(QuorumCnxManager manager) { this.ws = new WorkerSender(manager); this.wsThread = new Thread(this.ws, "WorkerSender[myid=" + self.getId() + "]"); this.wsThread.setDaemon(true); this.wr = new WorkerReceiver(manager); this.wrThread = new Thread(this.wr, "WorkerReceiver[myid=" + self.getId() + "]"); this.wrThread.setDaemon(true); } //FastLeaderElection的start()方法做了什么 public void start() { this.messenger.start(); } void start() { this.wsThread.start(); this.wrThread.start(); }
//发送工作者WorkerSender做了什么 class WorkerSender extends ZooKeeperThread { volatile boolean stop; QuorumCnxManager manager; WorkerSender(QuorumCnxManager manager) { super("WorkerSender"); this.stop = false; this.manager = manager; } public void run() { while (!stop) { try { ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS); if (m == null) { continue; } process(m); } catch (InterruptedException e) { break; } } } void process(ToSend m) { ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), m.leader, m.zxid, m.electionEpoch, m.peerEpoch, m.configData); manager.toSend(m.sid, requestBuffer); } } //QuorumCnxManager类#toSend方法 public void toSend(Long sid, ByteBuffer b) { //If sending message to myself, then simply enqueue it (loopback). if (this.mySid == sid) { b.position(0); addToRecvQueue(new Message(b.duplicate(), sid)); //Otherwise send to the corresponding thread to send. } else { //应用层的发送队列数组,每个服务器对应一个队列,用他们的机器ID作为下标. BlockingQueue<ByteBuffer> bq = queueSendMap.computeIfAbsent(sid, serverId -> new CircularBlockingQueue<>(SEND_CAPACITY)); addToSendQueue(bq, b); connectOne(sid); } }
//接收工作者 class WorkerReceiver extends ZooKeeperThread { volatile boolean stop; QuorumCnxManager manager; WorkerReceiver(QuorumCnxManager manager) { super("WorkerReceiver"); this.stop = false; this.manager = manager; } public void run() { Message response; while (!stop) { // Sleeps on receive try { response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS); if (response == null) { continue; } final int capacity = response.buffer.capacity(); // The current protocol and two previous generations all send at least 28 bytes if (capacity < 28) { continue; } // this is the backwardCompatibility mode in place before ZK-107 // It is for a version of the protocol in which we didn't send peer epoch // With peer epoch and version the message became 40 bytes boolean backCompatibility28 = (capacity == 28); // this is the backwardCompatibility mode for no version information boolean backCompatibility40 = (capacity == 40); response.buffer.clear(); // Instantiate Notification and set its attributes Notification n = new Notification(); int rstate = response.buffer.getInt(); long rleader = response.buffer.getLong(); long rzxid = response.buffer.getLong(); long relectionEpoch = response.buffer.getLong(); long rpeerepoch; int version = 0x0; QuorumVerifier rqv = null; try { if (!backCompatibility28) { rpeerepoch = response.buffer.getLong(); if (!backCompatibility40) { version = response.buffer.getInt(); } else {...} } else { rpeerepoch = ZxidUtils.getEpochFromZxid(rzxid); } // check if we have a version that includes config. If so extract config info from message. if (version > 0x1) { int configLength = response.buffer.getInt(); // we want to avoid errors caused by the allocation of a byte array with negative length // (causing NegativeArraySizeException) or huge length (causing e.g. 
OutOfMemoryError) if (configLength < 0 || configLength > capacity) { throw new IOException(...); } byte[] b = new byte[configLength]; response.buffer.get(b); synchronized (self) { try { rqv = self.configFromString(new String(b, UTF_8)); QuorumVerifier curQV = self.getQuorumVerifier(); if (rqv.getVersion() > curQV.getVersion()) { if (self.getPeerState() == ServerState.LOOKING) { self.processReconfig(rqv, null, null, false); if (!rqv.equals(curQV)) { self.shuttingDownLE = true; self.getElectionAlg().shutdown(); break; } } else {...} } } catch (IOException | ConfigException e) {...} } } else {...} } catch (BufferUnderflowException | IOException e) { continue; } /* * If it is from a non-voting server (such as an observer or * a non-voting follower), respond right away. */ if (!validVoter(response.sid)) { Vote current = self.getCurrentVote(); QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, current.getId(), current.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, current.getPeerEpoch(), qv.toString().getBytes(UTF_8)); sendqueue.offer(notmsg); } else { // Receive new message // State of peer that sent this message QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING; switch (rstate) { case 0: ackstate = QuorumPeer.ServerState.LOOKING; break; case 1: ackstate = QuorumPeer.ServerState.FOLLOWING; break; case 2: ackstate = QuorumPeer.ServerState.LEADING; break; case 3: ackstate = QuorumPeer.ServerState.OBSERVING; break; default: continue; } n.leader = rleader; n.zxid = rzxid; n.electionEpoch = relectionEpoch; n.state = ackstate; n.sid = response.sid; n.peerEpoch = rpeerepoch; n.version = version; n.qv = rqv; //如果这个服务器正处于looking状态,那么发送提议leader if (self.getPeerState() == QuorumPeer.ServerState.LOOKING) { recvqueue.offer(n); /* * Send a notification back if the peer that sent this * message is also looking and its logical clock is * lagging behind. 
*/ if ((ackstate == QuorumPeer.ServerState.LOOKING) && (n.electionEpoch < logicalclock.get())) { Vote v = getVote(); QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, v.getId(), v.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, v.getPeerEpoch(), qv.toString().getBytes()); sendqueue.offer(notmsg); } } else { //反之,如果选举结束了,则将自己服务器记录的leader信息发送回给对方 Vote current = self.getCurrentVote(); if (ackstate == QuorumPeer.ServerState.LOOKING) { if (self.leader != null) { if (leadingVoteSet != null) { self.leader.setLeadingVoteSet(leadingVoteSet); leadingVoteSet = null; } self.leader.reportLookingSid(response.sid); } QuorumVerifier qv = self.getQuorumVerifier(); ToSend notmsg = new ToSend( ToSend.mType.notification, current.getId(), current.getZxid(), current.getElectionEpoch(), self.getPeerState(), response.sid, current.getPeerEpoch(), qv.toString().getBytes()); sendqueue.offer(notmsg); } } } } catch (InterruptedException e) {...} } } }
/**
 * Main loop of the quorum peer thread. While {@code running} is true it
 * dispatches on the current peer state: LOOKING runs leader election, OBSERVING /
 * FOLLOWING / LEADING create the matching role object and run it until it
 * returns or fails, after which the role is shut down and the server state is
 * updated. JMX beans are unregistered when the loop exits.
 */
@Override
public void run() {
    updateThreadName();

    // Monitoring (JMX) setup; partly elided in this excerpt.
    try {
        jmxQuorumBean = new QuorumBean(this);
        .....
    } catch (Exception e) {
        jmxQuorumBean = null;
    }

    try {
        // Main logic.
        while (running) {
            if (unavailableStartTime == 0) {
                unavailableStartTime = Time.currentElapsedTime();
            }

            switch (getPeerState()) {
            case LOOKING:
                LOG.info("LOOKING");
                ServerMetrics.getMetrics().LOOKING_COUNT.add(1);

                if (Boolean.getBoolean("readonlymode.enabled")) {
                    final ReadOnlyZooKeeperServer roZk = new ReadOnlyZooKeeperServer(logFactory, this, this.zkDb);

                    // Starts the read-only server after a grace period if the peer is still LOOKING.
                    Thread roZkMgr = new Thread() {
                        public void run() {
                            try {
                                // lower-bound grace period to 2 secs
                                sleep(Math.max(2000, tickTime));
                                if (ServerState.LOOKING.equals(getPeerState())) {
                                    roZk.startup();
                                }
                            } catch (InterruptedException e) {...}
                            catch (Exception e) {...}
                        }
                    };
                    try {
                        roZkMgr.start();
                        reconfigFlagClear();
                        if (shuttingDownLE) {
                            shuttingDownLE = false;
                            startLeaderElection();
                        }
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        setPeerState(ServerState.LOOKING);
                    } finally {
                        roZkMgr.interrupt();
                        roZk.shutdown();
                    }
                } else {
                    try {
                        reconfigFlagClear();
                        if (shuttingDownLE) {
                            shuttingDownLE = false;
                            startLeaderElection();
                        }
                        // Run the election and record the resulting vote as the current vote.
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception", e);
                        setPeerState(ServerState.LOOKING);
                    }
                }
                break;
            case OBSERVING:
                try {
                    LOG.info("OBSERVING");
                    setObserver(makeObserver(logFactory));
                    observer.observeLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    observer.shutdown();
                    setObserver(null);
                    updateServerState();

                    // Add delay jitter before we switch to LOOKING
                    // state to reduce the load of ObserverMaster
                    if (isRunning()) {
                        Observer.waitForObserverElectionDelay();
                    }
                }
                break;
            case FOLLOWING:
                try {
                    LOG.info("FOLLOWING");
                    setFollower(makeFollower(logFactory));
                    follower.followLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    follower.shutdown();
                    setFollower(null);
                    updateServerState();
                }
                break;
            case LEADING:
                LOG.info("LEADING");
                try {
                    setLeader(makeLeader(logFactory));
                    leader.lead();
                    setLeader(null);
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    // leader is still set only if lead() did not return normally.
                    if (leader != null) {
                        leader.shutdown("Forcing shutdown");
                        setLeader(null);
                    }
                    updateServerState();
                }
                break;
            }
        }
    } finally {
        LOG.warn("QuorumPeer main thread exited");
        MBeanRegistry instance = MBeanRegistry.getInstance();
        instance.unregister(jmxQuorumBean);
        instance.unregister(jmxLocalPeerBean);

        for (RemotePeerBean remotePeerBean : jmxRemotePeerBean.values()) {
            instance.unregister(remotePeerBean);
        }

        jmxQuorumBean = null;
        jmxLocalPeerBean = null;
        jmxRemotePeerBean = null;
    }
}
// FastLeaderElection#lookForLeader
// The core election algorithm
/**
 * Runs one election round: bumps the logical clock, proposes this server,
 * broadcasts the proposal and then exchanges notifications until a leader is
 * determined or the peer stops LOOKING.
 *
 * @return the final vote, or {@code null} if the loop ended without a decision
 * @throws InterruptedException if interrupted while waiting on the receive queue
 */
public Vote lookForLeader() throws InterruptedException {
    // Monitoring part omitted.
    try {...} catch (Exception e) {...}

    self.start_fle = Time.currentElapsedTime();
    try {
        // Votes received in the current election round, keyed by sender id.
        Map<Long, Vote> recvset = new HashMap<Long, Vote>();
        // Votes from servers reporting FOLLOWING/LEADING, keyed by sender id.
        Map<Long, Vote> outofelection = new HashMap<Long, Vote>();

        int notTimeout = minNotificationInterval;

        synchronized (this) {
            // Increment the election epoch (logical clock).
            logicalclock.incrementAndGet();
            // Update the proposal (initially this server votes for itself).
            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
        }

        // Send out the vote.
        sendNotifications();

        SyncedLearnerTracker voteSet = null;

        // Exchange notifications in this loop until a leader is found.
        while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
            // Receive a vote sent by another server.
            Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);

            // Cases in which nothing could be taken from recvqueue:
            //  - e.g. 8 notifications were broadcast but, due to the network, only 3 answers
            //    arrived, so the fourth poll returns nothing;
            //  - all 8 answers arrived but the election is not finished yet, so the next poll
            //    returns nothing either.
            // To keep receiving votes from other servers until a leader is chosen, we either
            // resend or reconnect below.
            if (n == null) {
                // Check whether messages have been delivered (see haveDelivered()).
                if (manager.haveDelivered()) {
                    // Resend the notifications so that answers can be received again.
                    sendNotifications();
                } else {
                    // Reconnect to every server of the ensemble.
                    manager.connectAll();
                }

                // Exponential back-off of the poll timeout, capped at maxNotificationInterval.
                notTimeout = Math.min(notTimeout << 1, maxNotificationInterval);

                if (self.getQuorumVerifier() instanceof QuorumOracleMaj
                    && self.getQuorumVerifier().revalidateVoteset(voteSet, notTimeout != minNotificationInterval)) {
                    setPeerState(proposedLeader, voteSet);
                    Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
                    leaveInstance(endVote);
                    return endVote;
                }
            }
            // Validate the sender's server id and
            // the server id of the leader proposed by this notification.
            else if (validVoter(n.sid) && validVoter(n.leader)) {
                switch (n.state) {
                case LOOKING:
                    if (getInitLastLoggedZxid() == -1) {
                        break;
                    }
                    if (n.zxid == -1) {
                        break;
                    }
                    // Is the election epoch of the received vote larger than our own?
                    // This covers a server that was down and therefore lags behind the others.
                    if (n.electionEpoch > logicalclock.get()) {
                        logicalclock.set(n.electionEpoch);
                        recvset.clear();
                        if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
                        }
                        sendNotifications();
                    }
                    // A vote from an older election epoch than ours is useless and dropped.
                    else if (n.electionEpoch < logicalclock.get()) {
                        break;
                    }
                    // Same epoch: compare the votes.
                    else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
                        // Only when the received vote is better than ours do we adopt it and re-broadcast.
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                        sendNotifications();
                    }

                    // Record the received vote.
                    recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

                    // Build the vote tracker for our current proposal.
                    voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));

                    // Does a quorum support the current proposal?
                    if (voteSet.hasAllQuorums()) {
                        // Even so, the proposed leader is not necessarily final yet.
                        // Wait up to finalizeWait (200 ms according to the original note) for further
                        // messages; if a better vote arrives, put it back into the queue so that the
                        // next loop iteration compares it again.
                        while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
                            if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
                                recvqueue.put(n);
                                break;
                            }
                        }

                        // No further vote arrived: the leader is final and the election ends.
                        // Update the peer state for the elected leader (per the original note:
                        // LEADING if the elected id is our own, otherwise FOLLOWING)
                        // and leave the election.
                        if (n == null) {
                            setPeerState(proposedLeader, voteSet);
                            Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }
                    break;
                case OBSERVING:
                    break;
                case FOLLOWING:
                    Vote resultFN = receivedFollowingNotification(recvset, outofelection, voteSet, n);
                    if (resultFN == null) {
                        break;
                    } else {
                        return resultFN;
                    }
                case LEADING:
                    Vote resultLN = receivedLeadingNotification(recvset, outofelection, voteSet, n);
                    if (resultLN == null) {
                        break;
                    } else {
                        return resultLN;
                    }
                default:
                    break;
                }
            } else {
                if (!validVoter(n.leader)) {...}
                if (!validVoter(n.sid)) {...}
            }
        }
        return null;
    } finally {
        // Monitoring related code omitted.
    }
}

// Sends the vote
/** Queues the current proposal as a notification for every voter of the current and next configuration. */
private void sendNotifications() {
    // Iterate over the servers that take part in the election and queue a vote for each.
    for (long sid : self.getCurrentAndNextConfigVoters()) {
        QuorumVerifier qv = self.getQuorumVerifier();
        ToSend notmsg = new ToSend(
            ToSend.mType.notification,
            proposedLeader,
            proposedZxid,
            logicalclock.get(), // current election epoch
            QuorumPeer.ServerState.LOOKING, // state of this server
            sid,
            proposedEpoch,
            qv.toString().getBytes(UTF_8));

        // Put it into sendqueue (one entry per voter); the WorkerSender dispatches them.
        sendqueue.offer(notmsg);
    }
}

// Delivery check
/**
 * @return {@code true} as soon as any per-server send queue is empty;
 *         {@code false} if every queue still holds messages (or there are no queues)
 */
boolean haveDelivered() {
    for (BlockingQueue<ByteBuffer> queue : queueSendMap.values()) {
        final int queueSize = queue.size();
        // One empty queue is enough to return true; the remaining queues are not inspected.
        // Per the original note, one empty queue is taken as evidence that this server's
        // connection to the ensemble works.
        if (queueSize == 0) {
            return true;
        }
    }
    // Only when no queue is empty is this server considered to have lost contact with the ensemble.
    return false;
}

// Vote comparison
/**
 * Decides whether the new vote beats the current one.
 *
 * @return {@code false} if the new leader candidate has weight 0; otherwise
 *         {@code true} if the new vote wins the ordering described below
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    if (self.getQuorumVerifier().getWeight(newId) == 0) {
        return false;
    }
    // 1. the higher epoch wins;
    // 2. same epoch: the larger zxid wins;
    // 3. same epoch and same zxid: the larger server id wins.
    return ((newEpoch > curEpoch)
            || ((newEpoch == curEpoch)
                && ((newZxid > curZxid)
                    || ((newZxid == curZxid)
                        && (newId > curId)))));
}
//接收工作者WorkerReceiver会判断自身是否还在选举阶段,不是则会将自己服务器记录的leader信息发送回给对方 //则此时新加的机器节点的选票状态会是FOLLOWING或者LEADING private Vote receivedLeadingNotification(Map<Long, Vote> recvset, Map<Long, Vote> outofelection, SyncedLearnerTracker voteSet, Notification n) { Vote result = receivedFollowingNotification(recvset, outofelection, voteSet, n); if (result == null) { if (self.getQuorumVerifier().getNeedOracle() && !self.getQuorumVerifier().askOracle()) { setPeerState(n.leader, voteSet); Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } else { return null; } } else { return result; } } private Vote receivedFollowingNotification(Map<Long, Vote> recvset, Map<Long, Vote> outofelection, SyncedLearnerTracker voteSet, Notification n) { if (n.electionEpoch == logicalclock.get()) { recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); voteSet = getVoteTracker(recvset, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); if (voteSet.hasAllQuorums() && checkLeader(recvset, n.leader, n.electionEpoch)) { setPeerState(n.leader, voteSet); Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } } outofelection.put(n.sid, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); voteSet = getVoteTracker(outofelection, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); if (voteSet.hasAllQuorums() && checkLeader(outofelection, n.leader, n.electionEpoch)) { synchronized (this) { logicalclock.set(n.electionEpoch); setPeerState(n.leader, voteSet); } Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } return null; }
构建 FastLeaderElection 时,它持有两个 LinkedBlockingQueue:sendqueue 和 recvqueue;
同时持有 messenger,即经过 Messenger 类包装过的 QuorumCnxManager。
messenger 的持有:
//两个线程
WorkerSender,负责从 sendqueue 拿到数据,然后去 queueSendMap 找到对应 sid 的队列并塞入数据
WorkerReceiver,负责从 manager 的 recvQueue 中拿数据,传到 recvqueue
//持有一个队列
recvQueue
//对应每个 socket 连接都会有
RecvWorker,负责将 DataInputStream 里面的数据读出,转而传到 manager 的 recvQueue 队列
SendWorker,负责将各自队列里面的数据写入 DataOutputStream
而它们的存放方式如下(SendWorker 持有 RecvWorker):
senderWorkerMap{ ... [sid0->SendWorker], [sid1->SendWorker] }
//队列集合
queueSendMap{ ... [sid0->CircularBlockingQueue], [sid1->CircularBlockingQueue] }
//数据传输的流为
DataOutputStream DataInputStream
