Apache Kafka Source Code Analysis – ReplicaManager
If the controller is the master, handling cluster-wide concerns such as leader election and partition reassignment,
then ReplicaManager is the worker that carries out replica management on each broker.
Its main operations include:
stopReplica
getOrCreatePartition
getLeaderReplicaIfLocal
getReplica
readMessageSets
becomeLeaderOrFollower
StopReplicaCommand
The handling is straightforward: stop the fetcher threads for the affected partitions and, when requested, delete the partition directories.
stopReplicas
def stopReplicas(stopReplicaRequest: StopReplicaRequest): (mutable.Map[TopicAndPartition, Short], Short) = {
  replicaStateChangeLock synchronized { // take the lock
    val responseMap = new collection.mutable.HashMap[TopicAndPartition, Short]
    if(stopReplicaRequest.controllerEpoch < controllerEpoch) { // check the epoch to reject stale requests
      (responseMap, ErrorMapping.StaleControllerEpochCode)
    } else {
      controllerEpoch = stopReplicaRequest.controllerEpoch // update the epoch
      // First stop fetchers for all partitions, then stop the corresponding replicas
      replicaFetcherManager.removeFetcherForPartitions(stopReplicaRequest.partitions.map(r =>
        TopicAndPartition(r.topic, r.partition))) // stop the fetcher threads for these partitions via the FetcherManager first
      for(topicAndPartition <- stopReplicaRequest.partitions){
        val errorCode = stopReplica(topicAndPartition.topic, topicAndPartition.partition,
          stopReplicaRequest.deletePartitions) // delegate to stopReplica
        responseMap.put(topicAndPartition, errorCode)
      }
      (responseMap, ErrorMapping.NoError)
    }
  }
}
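The epoch check at the top of stopReplicas is a general fencing pattern: any request carrying a controllerEpoch older than the latest one this broker has seen must come from a deposed controller and is rejected as stale. Below is a minimal, self-contained sketch of just that pattern; the Request type and handle function are illustrative, not Kafka classes.

object EpochFencingSketch extends App {
  final case class Request(controllerEpoch: Int, payload: String)

  var controllerEpoch = 0

  def handle(req: Request): Either[String, String] =
    synchronized { // serialize state changes, like replicaStateChangeLock above
      if (req.controllerEpoch < controllerEpoch)
        Left(s"StaleControllerEpoch: request epoch ${req.controllerEpoch} < local $controllerEpoch")
      else {
        controllerEpoch = req.controllerEpoch // remember the newest epoch seen
        Right(s"processed '${req.payload}' at epoch $controllerEpoch")
      }
    }

  println(handle(Request(1, "stop replica a"))) // accepted, local epoch becomes 1
  println(handle(Request(0, "stop replica b"))) // rejected: stale epoch
}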
stopReplica
def stopReplica(topic: String, partitionId: Int, deletePartition: Boolean): Short = {
  getPartition(topic, partitionId) match {
    case Some(partition) =>
      leaderPartitionsLock synchronized {
        leaderPartitions -= partition
      }
      if(deletePartition) { // only when deletePartition = true is the partition actually deleted
        val removedPartition = allPartitions.remove((topic, partitionId))
        if (removedPartition != null)
          removedPartition.delete() // this will delete the local log
      }
    case None => // do nothing if replica no longer exists. This can happen during delete topic retries
  }
  ErrorMapping.NoError // returned into stopReplicas' responseMap (error handling is elided in this excerpt)
}
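To make the deletePartition flag concrete, here is a toy model (all names and paths hypothetical) showing that stopping without delete keeps the partition and its log around for a possible restart, while delete removes it entirely:

object StopReplicaSketch extends App {
  val allPartitions = scala.collection.mutable.Map(
    ("orders", 0) -> "log dir /data/orders-0",
    ("orders", 1) -> "log dir /data/orders-1")

  def stopReplica(topic: String, partitionId: Int, deletePartition: Boolean): Unit =
    if (deletePartition) // without delete, the entry (and its on-disk log) stays
      allPartitions.remove((topic, partitionId)).foreach(dir => println(s"deleted $dir"))

  stopReplica("orders", 0, deletePartition = false) // fetcher stopped, data kept
  stopReplica("orders", 1, deletePartition = true)  // data removed
  println(allPartitions.keys)                       // only ("orders", 0) remains
}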
LeaderAndISRCommand
becomeLeaderOrFollower
The method performs epoch and validity checks, splits the partitions into those for which this broker becomes leader and those for which it becomes follower, and then calls makeLeaders and makeFollowers respectively.
def becomeLeaderOrFollower(leaderAndISRRequest: LeaderAndIsrRequest): (collection.Map[(String, Int), Short], Short) = {
  replicaStateChangeLock synchronized { // take the lock
    val responseMap = new collection.mutable.HashMap[(String, Int), Short]
    if(leaderAndISRRequest.controllerEpoch < controllerEpoch) { // check the request epoch
      (responseMap, ErrorMapping.StaleControllerEpochCode)
    } else {
      val controllerId = leaderAndISRRequest.controllerId
      val correlationId = leaderAndISRRequest.correlationId
      controllerEpoch = leaderAndISRRequest.controllerEpoch

      // First check partition's leader epoch
      // the request epoch was checked above, but the leader epoch inside each partitionStateInfo must be checked as well
      val partitionState = new HashMap[Partition, PartitionStateInfo]()
      leaderAndISRRequest.partitionStateInfos.foreach{ case ((topic, partitionId), partitionStateInfo) =>
        val partition = getOrCreatePartition(topic, partitionId, partitionStateInfo.replicationFactor) // get or create the partition
        val partitionLeaderEpoch = partition.getLeaderEpoch()
        // If the leader epoch is valid record the epoch of the controller that made the leadership decision.
        // This is useful while updating the isr to maintain the decision maker controller's epoch in the zookeeper path
        if (partitionLeaderEpoch < partitionStateInfo.leaderIsrAndControllerEpoch.leaderAndIsr.leaderEpoch) {
          // the local partitionLeaderEpoch must be smaller than the leaderEpoch in the request, otherwise the request is stale
          if(partitionStateInfo.allReplicas.contains(config.brokerId)) // check whether this partition is assigned to the current broker
            partitionState.put(partition, partitionStateInfo)
        } else {
          // Received invalid LeaderAndIsr request
          // Otherwise record the error code in response
          responseMap.put((topic, partitionId), ErrorMapping.StaleLeaderEpochCode)
        }
      }

      val partitionsTobeLeader = partitionState
        .filter{ case (partition, partitionStateInfo) => partitionStateInfo.leaderIsrAndControllerEpoch.leaderAndIsr.leader == config.brokerId}
      val partitionsToBeFollower = (partitionState -- partitionsTobeLeader.keys)

      if (!partitionsTobeLeader.isEmpty)
        makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, leaderAndISRRequest.correlationId, responseMap)
      if (!partitionsToBeFollower.isEmpty)
        makeFollowers(controllerId, controllerEpoch, partitionsToBeFollower, leaderAndISRRequest.leaders, leaderAndISRRequest.correlationId, responseMap)

      // we initialize highwatermark thread after the first leaderisrrequest. This ensures that all the partitions
      // have been completely populated before starting the checkpointing there by avoiding weird race conditions
      if (!hwThreadInitialized) {
        startHighWaterMarksCheckPointThread() // start the HighWaterMarksCheckPointThread
        hwThreadInitialized = true
      }
      replicaFetcherManager.shutdownIdleFetcherThreads()
      (responseMap, ErrorMapping.NoError)
    }
  }
}
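The split into partitionsTobeLeader and partitionsToBeFollower is plain collection filtering: a partition goes to the leader set when the leader broker id in the request equals this broker's id, and everything else becomes a follower. A small sketch under simplified, hypothetical types:

object LeaderFollowerSplitSketch extends App {
  final case class PartitionStateInfo(leader: Int) // illustrative stand-in
  val brokerId = 1

  val partitionState = Map(
    "topicA-0" -> PartitionStateInfo(leader = 1),
    "topicA-1" -> PartitionStateInfo(leader = 2),
    "topicB-0" -> PartitionStateInfo(leader = 1))

  val toBeLeader   = partitionState.filter { case (_, info) => info.leader == brokerId }
  val toBeFollower = partitionState -- toBeLeader.keys

  println(s"leader for:   ${toBeLeader.keys.toList.sorted}")   // topicA-0, topicB-0
  println(s"follower for: ${toBeFollower.keys.toList.sorted}") // topicA-1
}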
makeLeaders
Stop the fetchers for these partitions, call partition.makeLeader on each, and add them to the leaderPartitions set.
/*
 * Make the current broker to become leader for a given set of partitions by:
 *
 * 1. Stop fetchers for these partitions
 * 2. Update the partition metadata in cache
 * 3. Add these partitions to the leader partitions set
 *
 * If an unexpected error is thrown in this function, it will be propagated to KafkaApis where
 * the error message will be set on each partition since we do not know which partition caused it
 * TODO: the above may need to be fixed later
 */
private def makeLeaders(controllerId: Int, epoch: Int,
                        partitionState: Map[Partition, PartitionStateInfo],
                        correlationId: Int, responseMap: mutable.Map[(String, Int), Short]) = {
  try {
    // First stop fetchers for all the partitions
    replicaFetcherManager.removeFetcherForPartitions(partitionState.keySet.map(new TopicAndPartition(_)))
    // Update the partition information to be the leader
    partitionState.foreach{ case (partition, partitionStateInfo) =>
      partition.makeLeader(controllerId, partitionStateInfo, correlationId)}
    // Finally add these partitions to the list of partitions for which the leader is the current broker
    leaderPartitionsLock synchronized {
      leaderPartitions ++= partitionState.keySet
    }
  } catch {
    case e: Throwable => throw e // error logging elided in this excerpt; the exception propagates to KafkaApis
  }
}
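One consequence of maintaining the leaderPartitions set is that getLeaderReplicaIfLocal (from the operation list above) reduces to a membership check: produce requests are only served for partitions this broker currently leads. A rough sketch with illustrative types:

object LeaderSetSketch extends App {
  val leaderPartitions = scala.collection.mutable.Set("topicA-0")

  def getLeaderReplicaIfLocal(tp: String): Option[String] =
    if (leaderPartitions.contains(tp)) Some(s"local leader replica of $tp")
    else None // the caller reports NotLeaderForPartition to the client

  println(getLeaderReplicaIfLocal("topicA-0")) // Some(local leader replica of topicA-0)
  println(getLeaderReplicaIfLocal("topicA-1")) // None: this broker is not the leader
}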
makeFollowers
Besides removing these partitions from leaderPartitions and marking the replicas as followers,
a follower must truncate its log to the highWatermark and then start a fetcher to catch up with the leader.
/*
 * Make the current broker to become follower for a given set of partitions by:
 *
 * 1. Remove these partitions from the leader partitions set.
 * 2. Mark the replicas as followers so that no more data can be added from the producer clients.
 * 3. Stop fetchers for these partitions so that no more data can be added by the replica fetcher threads.
 * 4. Truncate the log and checkpoint offsets for these partitions.
 * 5. If the broker is not shutting down, add the fetcher to the new leaders.
 *
 * The ordering of doing these steps make sure that the replicas in transition will not
 * take any more messages before checkpointing offsets so that all messages before the checkpoint
 * are guaranteed to be flushed to disks
 *
 * If an unexpected error is thrown in this function, it will be propagated to KafkaApis where
 * the error message will be set on each partition since we do not know which partition caused it
 */
private def makeFollowers(controllerId: Int, epoch: Int,
                          partitionState: Map[Partition, PartitionStateInfo],
                          leaders: Set[Broker], correlationId: Int,
                          responseMap: mutable.Map[(String, Int), Short]) {
  try {
    leaderPartitionsLock synchronized {
      leaderPartitions --= partitionState.keySet
    }
    partitionState.foreach{ case (partition, leaderIsrAndControllerEpoch) =>
      partition.makeFollower(controllerId, leaderIsrAndControllerEpoch, leaders, correlationId)}
    replicaFetcherManager.removeFetcherForPartitions(partitionState.keySet.map(new TopicAndPartition(_)))
    logManager.truncateTo(partitionState.map{ case(partition, leaderISRAndControllerEpoch) =>
      // truncate this replica's log to the highWatermark, since only committed data is guaranteed to be consistent with the leader
      new TopicAndPartition(partition) -> partition.getOrCreateReplica().highWatermark
    })
    if (!isShuttingDown.get()) { // only add fetchers if this broker is not shutting down
      val partitionAndOffsets = mutable.Map[TopicAndPartition, BrokerAndInitialOffset]()
      partitionState.foreach { case (partition, partitionStateInfo) =>
        val leader = partitionStateInfo.leaderIsrAndControllerEpoch.leaderAndIsr.leader // find this partition's leader
        leaders.find(_.id == leader) match {
          case Some(leaderBroker) =>
            partitionAndOffsets.put(new TopicAndPartition(partition), // fetch starts from this replica's logEndOffset
              BrokerAndInitialOffset(leaderBroker, partition.getReplica().get.logEndOffset))
          case None => // leader broker unavailable; the fetcher is added on a later LeaderAndIsr request
        }
      }
      replicaFetcherManager.addFetcherForPartitions(partitionAndOffsets) // start fetchers to catch up with the leader
    }
  } catch {
    case e: Throwable => throw e // error logging elided in this excerpt; the exception propagates to KafkaApis
  }
}
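Step 4 (truncate to the highWatermark) is the subtle one: entries past the HW were never committed and may disagree with the new leader's log, so the follower discards them and re-fetches from the HW. A toy model of the truncation, with illustrative values:

object TruncateToHwSketch extends App {
  var log = Vector(10L, 11L, 12L, 13L, 14L) // offsets this follower has appended
  val highWatermark = 12L                   // offsets below this are committed

  // drop everything at or beyond the HW; the log end offset becomes the HW
  log = log.takeWhile(_ < highWatermark)

  println(s"log after truncation: $log")          // Vector(10, 11)
  println(s"resume fetching from offset $highWatermark")
}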
checkpointHighWatermarks
The highWatermark is crucial for every replica: it is the only marker of how much of the log is known to be consistent with the leader, so even if a broker crashes, recovery simply resumes catching up from the highWatermark.
That is why the highWatermarks must be checkpointed to disk.
/**
 * Flushes the highwatermark value for all partitions to the highwatermark file
 */
def checkpointHighWatermarks() {
  val replicas = allPartitions.values.map(_.getReplica(config.brokerId)).collect{case Some(replica) => replica}
  val replicasByDir = replicas.filter(_.log.isDefined).groupBy(_.log.get.dir.getParent)
  for((dir, reps) <- replicasByDir) {
    val hwms = reps.map(r => (new TopicAndPartition(r) -> r.highWatermark)).toMap
    try {
      highWatermarkCheckpoints(dir).write(hwms)
    } catch {
      case e: IOException =>
        fatal("Error writing to highwatermark file: ", e)
        Runtime.getRuntime().halt(1)
    }
  }
}
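For reference, the checkpoint written here is a small plain-text file per log directory: a version line, an entry count, then one "topic partition offset" line per replica. The sketch below reproduces that layout (it follows the 0.8.x OffsetCheckpoint source as I understand it; treat it as an illustration rather than a spec):

import java.io.{File, PrintWriter}
import scala.io.Source

object HwCheckpointSketch extends App {
  def write(file: File, hwms: Map[(String, Int), Long]): Unit = {
    val w = new PrintWriter(file)
    try {
      w.println(0)         // version
      w.println(hwms.size) // number of entries
      hwms.foreach { case ((topic, partition), hw) => w.println(s"$topic $partition $hw") }
    } finally w.close()
  }

  val f = File.createTempFile("replication-offset-checkpoint", ".tmp")
  write(f, Map(("orders", 0) -> 42L, ("orders", 1) -> 7L))
  Source.fromFile(f).getLines().foreach(println)
}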