Apache Kafka Source Code Analysis – Replica and Partition
Replica
For the local replica, it needs to track highWatermarkValue, which marks how much of the log is currently committed.
For remote replicas, it needs to track logEndOffsetValue and the time of its last update.
package kafka.cluster

class Replica(val brokerId: Int,
              val partition: Partition,
              time: Time = SystemTime,
              initialHighWatermarkValue: Long = 0L,
              val log: Option[Log] = None) extends Logging {
  // only defined in local replica
  private[this] var highWatermarkValue: AtomicLong = new AtomicLong(initialHighWatermarkValue)
  // only used for remote replica; logEndOffsetValue for local replica is kept in log
  private[this] var logEndOffsetValue = new AtomicLong(ReplicaManager.UnknownLogEndOffset)
  private[this] var logEndOffsetUpdateTimeMsValue: AtomicLong = new AtomicLong(time.milliseconds)
  val topic = partition.topic
  val partitionId = partition.partitionId
}
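The local/remote asymmetry shows up in the accessors: a local replica's LEO lives in its Log, while a remote replica's LEO is the cached field updated as the leader handles fetch requests. A minimal sketch of how these guards look inside Replica (simplified; the exact exception message is an assumption, the shape follows the fields above):

def isLocal: Boolean = log.isDefined

// LEO may only be assigned for remote replicas; the local LEO is read through to the Log
def logEndOffset_=(newLogEndOffset: Long) {
  if (isLocal)
    throw new KafkaException("Should not set logEndOffset for the local replica %d of [%s,%d]"
      .format(brokerId, topic, partitionId))
  logEndOffsetValue.set(newLogEndOffset)
  logEndOffsetUpdateTimeMsValue.set(time.milliseconds) // remember when this follower last made progress
}

def logEndOffset: Long =
  if (isLocal) log.get.logEndOffset // local: always the live value from the Log
  else logEndOffsetValue.get()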
Partition
Mainly responsible for managing the leader, the ISR, and the AR.
package kafka.cluster

/**
 * Data structure that represents a topic partition. The leader maintains the AR, ISR, CUR, RAR
 */
class Partition(val topic: String,
                val partitionId: Int,
                var replicationFactor: Int,
                time: Time,
                val replicaManager: ReplicaManager) extends Logging with KafkaMetricsGroup {
  private val localBrokerId = replicaManager.config.brokerId // this broker's id
  private val logManager = replicaManager.logManager
  private val zkClient = replicaManager.zkClient
  var leaderReplicaIdOpt: Option[Int] = None // id of the leader replica
  var inSyncReplicas: Set[Replica] = Set.empty[Replica] // ISR
  private val assignedReplicaMap = new Pool[Int, Replica] // AR, always a superset of the ISR
  private val leaderIsrUpdateLock = new Object
  private var zkVersion: Int = LeaderAndIsr.initialZKVersion
  private var leaderEpoch: Int = LeaderAndIsr.initialLeaderEpoch - 1
  /* Epoch of the controller that last changed the leader. This needs to be initialized correctly upon broker startup.
   * One way of doing that is through the controller's start replica state change command. When a new broker starts up
   * the controller sends it a start replica command containing the leader for each partition that the broker hosts.
   * In addition to the leader, the controller can also send the epoch of the controller that elected the leader for
   * each partition. */
  private var controllerEpoch: Int = KafkaController.InitialControllerEpoch - 1
}
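The methods below (getOrCreateReplica, makeLeader, ...) lean on a few small helpers over the AR pool. A plausible sketch of their shape (the names are taken from the calls below, the bodies are assumptions):

def getReplica(replicaId: Int = localBrokerId): Option[Replica] =
  Option(assignedReplicaMap.get(replicaId)) // None if the broker has no such replica yet

def addReplicaIfNotExists(replica: Replica) =
  assignedReplicaMap.putIfNotExists(replica.brokerId, replica)

def assignedReplicas(): Set[Replica] =
  assignedReplicaMap.values.toSet // snapshot of the AR

def removeReplica(replicaId: Int) =
  assignedReplicaMap.remove(replicaId)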
getOrCreateReplica
def getOrCreateReplica(replicaId: Int = localBrokerId): Replica = {
  val replicaOpt = getReplica(replicaId)
  replicaOpt match {
    case Some(replica) => replica
    case None => // need to create it
      if (isReplicaLocal(replicaId)) {
        val config = LogConfig.fromProps(logManager.defaultConfig.toProps,
                                         AdminUtils.fetchTopicConfig(zkClient, topic))
        val log = logManager.createLog(TopicAndPartition(topic, partitionId), config) // create the log file
        val checkpoint = replicaManager.highWatermarkCheckpoints(log.dir.getParent) // try to read the HW checkpoint
        val offsetMap = checkpoint.read
        if (!offsetMap.contains(TopicAndPartition(topic, partitionId)))
          warn("No checkpointed highwatermark is found for partition [%s,%d]".format(topic, partitionId))
        // initial offset is the smaller of the checkpointed HW and the logEndOffset
        val offset = offsetMap.getOrElse(TopicAndPartition(topic, partitionId), 0L).min(log.logEndOffset)
        val localReplica = new Replica(replicaId, this, time, offset, Some(log)) // create the Replica object
        addReplicaIfNotExists(localReplica) // add it to the AR
      } else { // remote replica, just create the object
        val remoteReplica = new Replica(replicaId, this, time)
        addReplicaIfNotExists(remoteReplica)
      }
      getReplica(replicaId).get
  }
}
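Why min()? The HW checkpoint is only flushed periodically, so after a crash the local log may end below the last checkpointed HW (e.g. after recovery truncation), and the HW can never point past what is actually in the log. A toy illustration with hypothetical numbers:

val checkpointedHW = 120L // last flushed HW checkpoint
val leo = 100L            // local log ends earlier, e.g. after truncation on recovery
val initialHW = checkpointedHW.min(leo) // => 100: the HW is capped by the LEO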
makeLeader
/**
 * Make the local replica the leader by resetting LogEndOffset for remote replicas (there could be old LogEndOffset
 * from the time when this broker was the leader last time) and setting the new leader and ISR
 */
def makeLeader(controllerId: Int, partitionStateInfo: PartitionStateInfo, correlationId: Int): Boolean = {
  leaderIsrUpdateLock synchronized {
    val allReplicas = partitionStateInfo.allReplicas // ids of all assigned replicas
    val leaderIsrAndControllerEpoch = partitionStateInfo.leaderIsrAndControllerEpoch
    val leaderAndIsr = leaderIsrAndControllerEpoch.leaderAndIsr
    // record the epoch of the controller that made the leadership decision. This is useful while updating the isr
    // to maintain the decision maker controller's epoch in the zookeeper path
    controllerEpoch = leaderIsrAndControllerEpoch.controllerEpoch // updated on every leadership decision
    // add replicas that are new; existing ones are simply looked up
    allReplicas.foreach(replica => getOrCreateReplica(replica))
    val newInSyncReplicas = leaderAndIsr.isr.map(r => getOrCreateReplica(r)).toSet // build the new ISR
    // remove assigned replicas that have been removed by the controller
    (assignedReplicas().map(_.brokerId) -- allReplicas).foreach(removeReplica(_))
    // reset LogEndOffset for remote replicas
    assignedReplicas.foreach(r => if (r.brokerId != localBrokerId) r.logEndOffset = ReplicaManager.UnknownLogEndOffset)
    inSyncReplicas = newInSyncReplicas
    leaderEpoch = leaderAndIsr.leaderEpoch
    zkVersion = leaderAndIsr.zkVersion
    leaderReplicaIdOpt = Some(localBrokerId)
    // we may need to increment high watermark, using min(LEO) over the new ISR
    maybeIncrementLeaderHW(getReplica().get)
    true
  }
}
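The reset to UnknownLogEndOffset matters for the HW: a stale LEO left over from this broker's previous term as leader could otherwise let maybeIncrementLeaderHW advance the HW past data the followers no longer hold. With the sentinel in place, the HW stays put until every remote ISR member has fetched at least once. A toy illustration, assuming UnknownLogEndOffset is a negative sentinel (hypothetical numbers):

// LEOs across the ISR right after makeLeader
val leos = Set(350L /* leader */, -1L /* follower, just reset */)
// the min is the sentinel, which is never greater than the old HW, so the HW does not move
assert(leos.min == -1L)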
maybeIncrementLeaderHW
Replaces the current HW with the minimum LEO across all in-sync replicas (the ISR includes the leader itself), but only if that minimum is larger than the current HW.
/**
 * There is no need to acquire the leaderIsrUpdate lock here since all callers of this private API acquire that lock
 * @param leaderReplica
 */
private def maybeIncrementLeaderHW(leaderReplica: Replica) {
  val allLogEndOffsets = inSyncReplicas.map(_.logEndOffset)
  val newHighWatermark = allLogEndOffsets.min
  val oldHighWatermark = leaderReplica.highWatermark
  if (newHighWatermark > oldHighWatermark)
    leaderReplica.highWatermark = newHighWatermark // advance the HW
  // otherwise the HW is left unchanged (the original logs a debug message in this branch)
}
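A worked example with hypothetical offsets: if the ISR's LEOs are 5, 3 and 4, the new HW is 3, i.e. everything below the slowest in-sync member counts as committed:

val isrLeos = Set(5L, 3L, 4L) // LEOs of the leader and two in-sync followers
val newHW = isrLeos.min       // => 3: offsets 0..2 are committed and visible to consumers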
makeFollower
/**
 * Make the local replica the follower by setting the new leader and setting the ISR to empty
 */
def makeFollower(controllerId: Int, partitionStateInfo: PartitionStateInfo, leaders: Set[Broker], correlationId: Int): Boolean = {
  leaderIsrUpdateLock synchronized {
    val allReplicas = partitionStateInfo.allReplicas
    val leaderIsrAndControllerEpoch = partitionStateInfo.leaderIsrAndControllerEpoch
    val leaderAndIsr = leaderIsrAndControllerEpoch.leaderAndIsr
    val newLeaderBrokerId: Int = leaderAndIsr.leader // id of the new leader
    // record the epoch of the controller that made the leadership decision. This is useful while updating the isr
    // to maintain the decision maker controller's epoch in the zookeeper path
    controllerEpoch = leaderIsrAndControllerEpoch.controllerEpoch
    // TODO: Delete leaders from LeaderAndIsrRequest in 0.8.1
    leaders.find(_.id == newLeaderBrokerId) match {
      case Some(leaderBroker) =>
        // add replicas that are new
        allReplicas.foreach(r => getOrCreateReplica(r))
        // remove assigned replicas that have been removed by the controller
        (assignedReplicas().map(_.brokerId) -- allReplicas).foreach(removeReplica(_))
        inSyncReplicas = Set.empty[Replica] // clear the ISR; only the leader maintains it
        leaderEpoch = leaderAndIsr.leaderEpoch
        zkVersion = leaderAndIsr.zkVersion
        leaderReplicaIdOpt = Some(newLeaderBrokerId)
      case None => // we should not come here
    }
    true
  }
}
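For context, makeLeader and makeFollower are both driven by a LeaderAndIsrRequest from the controller; roughly, the caller in ReplicaManager dispatches on whether the assigned leader is this broker. A simplified sketch of that control flow (not the exact code):

// for each partition carried in the LeaderAndIsrRequest
if (partitionStateInfo.leaderIsrAndControllerEpoch.leaderAndIsr.leader == localBrokerId)
  partition.makeLeader(controllerId, partitionStateInfo, correlationId)   // we are the new leader
else
  partition.makeFollower(controllerId, partitionStateInfo, leaders, correlationId) // follow someone else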
maybeShrinkIsr
Evicts stuck followers and slow followers from the ISR.
def maybeShrinkIsr(replicaMaxLagTimeMs: Long, replicaMaxLagMessages: Long) {
  leaderIsrUpdateLock synchronized {
    leaderReplicaIfLocal() match {
      case Some(leaderReplica) =>
        val outOfSyncReplicas = getOutOfSyncReplicas(leaderReplica, replicaMaxLagTimeMs, replicaMaxLagMessages) // find the out-of-sync replicas
        if (outOfSyncReplicas.size > 0) {
          val newInSyncReplicas = inSyncReplicas -- outOfSyncReplicas // drop outOfSyncReplicas from the ISR
          // update ISR in zk and in cache
          updateIsr(newInSyncReplicas)
          // we may need to increment high watermark since ISR could be down to 1
          maybeIncrementLeaderHW(leaderReplica)
          replicaManager.isrShrinkRate.mark()
        }
      case None => // do nothing if no longer leader
    }
  }
}

def getOutOfSyncReplicas(leaderReplica: Replica, keepInSyncTimeMs: Long, keepInSyncMessages: Long): Set[Replica] = {
  /**
   * there are two cases that need to be handled here -
   * 1. Stuck followers: If the leo of the replica hasn't been updated for keepInSyncTimeMs ms,
   *    the follower is stuck and should be removed from the ISR
   * 2. Slow followers: If the leo of the slowest follower is behind the leo of the leader by keepInSyncMessages, the
   *    follower is not catching up and should be removed from the ISR
   **/
  val leaderLogEndOffset = leaderReplica.logEndOffset
  val candidateReplicas = inSyncReplicas - leaderReplica
  // Case 1 above
  val stuckReplicas = candidateReplicas.filter(r => (time.milliseconds - r.logEndOffsetUpdateTimeMs) > keepInSyncTimeMs)
  if (stuckReplicas.size > 0)
    debug("Stuck replicas for partition [%s,%d] are %s".format(topic, partitionId, stuckReplicas.map(_.brokerId).mkString(",")))
  // Case 2 above
  val slowReplicas = candidateReplicas.filter(r => r.logEndOffset >= 0 && (leaderLogEndOffset - r.logEndOffset) > keepInSyncMessages)
  if (slowReplicas.size > 0)
    debug("Slow replicas for partition [%s,%d] are %s".format(topic, partitionId, slowReplicas.map(_.brokerId).mkString(",")))
  stuckReplicas ++ slowReplicas
}
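maybeShrinkIsr is invoked periodically by ReplicaManager, with the two thresholds coming from the broker configs replica.lag.time.max.ms and replica.lag.max.messages. A toy illustration of the two eviction rules with hypothetical numbers:

val now = 1000000L       // current time in ms
val leaderLeo = 50000L   // leader's LEO
// Stuck: follower's LEO last moved 15s ago, threshold 10s -> evicted
val stuck = (now - 985000L) > 10000L   // true
// Slow: follower is 5000 messages behind the leader, threshold 4000 -> evicted
val slow = (leaderLeo - 45000L) > 4000L // true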