Apache Kafka Source Code Analysis – Replica and Partition
Replica
A local replica needs to track highWatermarkValue, the offset up to which data has been committed.
A remote replica needs to track logEndOffsetValue and the time it was last updated.
package kafka.cluster

class Replica(val brokerId: Int,
              val partition: Partition,
              time: Time = SystemTime,
              initialHighWatermarkValue: Long = 0L,
              val log: Option[Log] = None) extends Logging {
  // only defined in a local replica
  private[this] var highWatermarkValue: AtomicLong = new AtomicLong(initialHighWatermarkValue)
  // only used for a remote replica; the logEndOffsetValue of a local replica is kept in the log
  private[this] var logEndOffsetValue = new AtomicLong(ReplicaManager.UnknownLogEndOffset)
  private[this] var logEndOffsetUpdateTimeMsValue: AtomicLong = new AtomicLong(time.milliseconds)
  val topic = partition.topic
  val partitionId = partition.partitionId
}
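The excerpt omits the accessors, which dispatch on whether the replica is local; a sketch of what they roughly look like (reconstructed from the fields above, not copied verbatim from the source):

// Sketch of the elided accessors: a local replica reads its LEO from the log,
// a remote replica from the cached logEndOffsetValue; the HW can only be set
// on a local replica.
def isLocal: Boolean = log.isDefined

def logEndOffset: Long =
  if (isLocal) log.get.logEndOffset else logEndOffsetValue.get()

def logEndOffset_=(newLogEndOffset: Long) {
  if (isLocal)
    throw new KafkaException("Should not set log end offset on local replica %d for partition [%s,%d]"
      .format(brokerId, topic, partitionId))
  logEndOffsetValue.set(newLogEndOffset)
  logEndOffsetUpdateTimeMsValue.set(time.milliseconds) // remember when the LEO was last refreshed
}

def logEndOffsetUpdateTimeMs: Long = logEndOffsetUpdateTimeMsValue.get()

def highWatermark: Long = highWatermarkValue.get()

def highWatermark_=(newHighWatermark: Long) {
  if (!isLocal)
    throw new KafkaException("Should not set high watermark on remote replica %d for partition [%s,%d]"
      .format(brokerId, topic, partitionId))
  highWatermarkValue.set(newHighWatermark)
}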
Partition
Primarily manages the leader, the ISR, and the AR (assigned replicas).
package kafka.cluster

/**
 * Data structure that represents a topic partition. The leader maintains the AR, ISR, CUR, RAR
 */
class Partition(val topic: String,
                val partitionId: Int,
                var replicationFactor: Int,
                time: Time,
                val replicaManager: ReplicaManager) extends Logging with KafkaMetricsGroup {
  private val localBrokerId = replicaManager.config.brokerId // this broker's id
  private val logManager = replicaManager.logManager
  private val zkClient = replicaManager.zkClient
  var leaderReplicaIdOpt: Option[Int] = None // id of the leader replica
  var inSyncReplicas: Set[Replica] = Set.empty[Replica] // ISR
  private val assignedReplicaMap = new Pool[Int,Replica] // AR; always a superset of (or equal to) the ISR
  private val leaderIsrUpdateLock = new Object
  private var zkVersion: Int = LeaderAndIsr.initialZKVersion
  private var leaderEpoch: Int = LeaderAndIsr.initialLeaderEpoch - 1
  /* Epoch of the controller that last changed the leader. This needs to be initialized correctly upon broker startup.
   * One way of doing that is through the controller's start replica state change command. When a new broker starts up
   * the controller sends it a start replica command containing the leader for each partition that the broker hosts.
   * In addition to the leader, the controller can also send the epoch of the controller that elected the leader for
   * each partition. */
  private var controllerEpoch: Int = KafkaController.InitialControllerEpoch - 1
}
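The methods below lean on a few small helpers that this excerpt omits; a sketch of them, reconstructed rather than quoted from the source:

// Sketch of the omitted helpers used in the methods below (not verbatim):
def getReplica(replicaId: Int = localBrokerId): Option[Replica] =
  Option(assignedReplicaMap.get(replicaId))

def assignedReplicas(): Set[Replica] = assignedReplicaMap.values.toSet

def addReplicaIfNotExists(replica: Replica) =
  assignedReplicaMap.putIfNotExists(replica.brokerId, replica)

def removeReplica(replicaId: Int) = assignedReplicaMap.remove(replicaId)

def isReplicaLocal(replicaId: Int): Boolean = replicaId == localBrokerId

// Some(leader replica) only when this broker is the leader of the partition
def leaderReplicaIfLocal(): Option[Replica] =
  leaderReplicaIdOpt match {
    case Some(id) if id == localBrokerId => getReplica(localBrokerId)
    case _ => None
  }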
getOrCreateReplica
def getOrCreateReplica(replicaId: Int = localBrokerId): Replica = {
  val replicaOpt = getReplica(replicaId)
  replicaOpt match {
    case Some(replica) => replica
    case None => // the replica does not exist yet, so create it
      if (isReplicaLocal(replicaId)) {
        val config = LogConfig.fromProps(logManager.defaultConfig.toProps, AdminUtils.fetchTopicConfig(zkClient, topic))
        val log = logManager.createLog(TopicAndPartition(topic, partitionId), config) // create the log file
        val checkpoint = replicaManager.highWatermarkCheckpoints(log.dir.getParent) // try to read the HW checkpoint
        val offsetMap = checkpoint.read
        if (!offsetMap.contains(TopicAndPartition(topic, partitionId)))
          warn("No checkpointed highwatermark is found for partition [%s,%d]".format(topic, partitionId))
        val offset = offsetMap.getOrElse(TopicAndPartition(topic, partitionId), 0L).min(log.logEndOffset) // the smaller of the checkpointed HW and the logEndOffset
        val localReplica = new Replica(replicaId, this, time, offset, Some(log)) // create the Replica object
        addReplicaIfNotExists(localReplica) // add it to the AR
      } else { // remote replica: just create the object
        val remoteReplica = new Replica(replicaId, this, time)
        addReplicaIfNotExists(remoteReplica)
      }
      getReplica(replicaId).get
  }
}
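replicaManager.highWatermarkCheckpoints maps each log directory to its HW checkpoint file (typically named replication-offset-checkpoint): a plain-text file holding a version line, an entry count, then one `topic partition offset` triple per line. A minimal standalone reader, for illustration only (readHighWatermarks is a hypothetical helper, not the actual checkpoint class):

import scala.io.Source
import kafka.common.TopicAndPartition

// Hypothetical reader for the HW checkpoint format; the real class does the
// equivalent with more validation. Layout: version, entry count, then
// "topic partition offset" per line.
def readHighWatermarks(path: String): Map[TopicAndPartition, Long] = {
  val lines = Source.fromFile(path).getLines().toSeq
  val count = lines(1).trim.toInt // lines(0) is the format version
  lines.drop(2).take(count).map { line =>
    val Array(topic, partition, offset) = line.trim.split(" ")
    TopicAndPartition(topic, partition.toInt) -> offset.toLong
  }.toMap
}

Capping the checkpointed HW at log.logEndOffset guards against a checkpoint that runs ahead of the actual log, for example after the log was truncated by an unclean shutdown.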
makeLeader
/**
 * Make the local replica the leader by resetting LogEndOffset for remote replicas (there could be old LogEndOffset from the time when this broker was the leader last time)
 * and setting the new leader and ISR
 */
def makeLeader(controllerId: Int,
               partitionStateInfo: PartitionStateInfo, correlationId: Int): Boolean = {
  leaderIsrUpdateLock synchronized {
    val allReplicas = partitionStateInfo.allReplicas // all replica ids for this partition
    val leaderIsrAndControllerEpoch = partitionStateInfo.leaderIsrAndControllerEpoch
    val leaderAndIsr = leaderIsrAndControllerEpoch.leaderAndIsr
    // record the epoch of the controller that made the leadership decision. This is useful while updating the isr
    // to maintain the decision maker controller's epoch in the zookeeper path
    controllerEpoch = leaderIsrAndControllerEpoch.controllerEpoch // refreshed on every leadership decision
    // add replicas that are new
    allReplicas.foreach(replica => getOrCreateReplica(replica)) // materialize all replica objects, creating any new ones
    val newInSyncReplicas = leaderAndIsr.isr.map(r => getOrCreateReplica(r)).toSet // build the new ISR
    // remove assigned replicas that have been removed by the controller
    (assignedReplicas().map(_.brokerId) -- allReplicas).foreach(removeReplica(_)) // drop AR entries that are no longer assigned
    // reset LogEndOffset for remote replicas
    assignedReplicas.foreach(r => if (r.brokerId != localBrokerId) r.logEndOffset = ReplicaManager.UnknownLogEndOffset) // clear the LEO of every remote replica
    inSyncReplicas = newInSyncReplicas
    leaderEpoch = leaderAndIsr.leaderEpoch
    zkVersion = leaderAndIsr.zkVersion
    leaderReplicaIdOpt = Some(localBrokerId)
    // we may need to increment high watermark
    maybeIncrementLeaderHW(getReplica().get) // advance the HW to min(LEO over the ISR) if that is larger
    true
  }
}
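Resetting remote LEOs matters because the leader relearns them from follower fetch requests, and those updates are what let the HW advance again. Condensed into one hypothetical method here (the real logic is spread across ReplicaManager and Partition in the 0.8-era source):

// Hypothetical, condensed sketch of the leader-side handling of a follower fetch:
def recordFollowerPosition(replicaId: Int, offset: Long) {
  leaderIsrUpdateLock synchronized {
    // refresh the follower's LEO (which also updates its logEndOffsetUpdateTimeMs)
    getOrCreateReplica(replicaId).logEndOffset = offset
    // a caught-up follower may rejoin the ISR; either way the HW may now advance
    leaderReplicaIfLocal().foreach(maybeIncrementLeaderHW)
  }
}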
maybeIncrementLeaderHW
Advance the HW to the minimum LEO across all in-sync replicas, but only if that minimum exceeds the current HW.
/**
 * There is no need to acquire the leaderIsrUpdate lock here since all callers of this private API acquire that lock
 * @param leaderReplica the leader replica of this partition
 */
private def maybeIncrementLeaderHW(leaderReplica: Replica) {
  val allLogEndOffsets = inSyncReplicas.map(_.logEndOffset)
  val newHighWatermark = allLogEndOffsets.min
  val oldHighWatermark = leaderReplica.highWatermark
  if (newHighWatermark > oldHighWatermark)
    leaderReplica.highWatermark = newHighWatermark // advance the HW
  // otherwise the HW is left unchanged
}
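A quick worked example with hypothetical offsets:

// Hypothetical LEOs for a three-replica ISR: leader at 8, followers at 5 and 7.
val allLogEndOffsets = Set(8L, 5L, 7L)
val newHighWatermark = allLogEndOffsets.min // 5: offsets below 5 exist on every ISR replica
// If the old HW was 4, it advances to 5 and one more message becomes committed.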
makeFollower
/**
 * Make the local replica a follower by recording the new leader and clearing the ISR
 */
def makeFollower(controllerId: Int,
                 partitionStateInfo: PartitionStateInfo,
                 leaders: Set[Broker], correlationId: Int): Boolean = {
  leaderIsrUpdateLock synchronized {
    val allReplicas = partitionStateInfo.allReplicas
    val leaderIsrAndControllerEpoch = partitionStateInfo.leaderIsrAndControllerEpoch
    val leaderAndIsr = leaderIsrAndControllerEpoch.leaderAndIsr
    val newLeaderBrokerId: Int = leaderAndIsr.leader // id of the new leader
    // record the epoch of the controller that made the leadership decision. This is useful while updating the isr
    // to maintain the decision maker controller's epoch in the zookeeper path
    controllerEpoch = leaderIsrAndControllerEpoch.controllerEpoch
    // TODO: Delete leaders from LeaderAndIsrRequest in 0.8.1
    leaders.find(_.id == newLeaderBrokerId) match {
      case Some(leaderBroker) =>
        // add replicas that are new
        allReplicas.foreach(r => getOrCreateReplica(r))
        // remove assigned replicas that have been removed by the controller
        (assignedReplicas().map(_.brokerId) -- allReplicas).foreach(removeReplica(_))
        inSyncReplicas = Set.empty[Replica] // clear the ISR; only the leader maintains it
        leaderEpoch = leaderAndIsr.leaderEpoch
        zkVersion = leaderAndIsr.zkVersion
        leaderReplicaIdOpt = Some(newLeaderBrokerId)
      case None => // we should not come here
    }
    true
  }
}
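What the excerpt does not show is what happens around this call for a new follower: ReplicaManager stops any old fetcher, truncates the local log back to the HW (messages above it were never committed and may differ from the new leader's log), and starts fetching from the new leader. Approximately (a sketch of the surrounding 0.8-era flow, not verbatim source):

// Approximate sketch of the surrounding ReplicaManager flow for a new follower:
val localReplica = getOrCreateReplica()
replicaFetcherManager.removeFetcher(topic, partitionId) // stop fetching from the old leader
// discard uncommitted messages that may diverge from the new leader's log
logManager.truncateTo(Map(TopicAndPartition(topic, partitionId) -> localReplica.highWatermark))
// resume replication from the new leader, starting at the local LEO
replicaFetcherManager.addFetcher(topic, partitionId, localReplica.logEndOffset, leaderBroker)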
maybeShrinkIsr
Removes stuck followers and slow followers from the ISR.
def maybeShrinkIsr(replicaMaxLagTimeMs: Long, replicaMaxLagMessages: Long) {
  leaderIsrUpdateLock synchronized {
    leaderReplicaIfLocal() match {
      case Some(leaderReplica) =>
        val outOfSyncReplicas = getOutOfSyncReplicas(leaderReplica, replicaMaxLagTimeMs, replicaMaxLagMessages) // find the out-of-sync replicas
        if (outOfSyncReplicas.size > 0) {
          val newInSyncReplicas = inSyncReplicas -- outOfSyncReplicas // drop them from the ISR
          // update ISR in zk and in cache
          updateIsr(newInSyncReplicas)
          // we may need to increment high watermark since ISR could be down to 1
          maybeIncrementLeaderHW(leaderReplica)
          replicaManager.isrShrinkRate.mark()
        }
      case None => // do nothing if no longer leader
    }
  }
}
def getOutOfSyncReplicas(leaderReplica: Replica, keepInSyncTimeMs: Long, keepInSyncMessages: Long): Set[Replica] = {
  /**
   * there are two cases that need to be handled here -
   * 1. Stuck followers: If the leo of the replica hasn't been updated for keepInSyncTimeMs ms,
   *    the follower is stuck and should be removed from the ISR
   * 2. Slow followers: If the leo of the slowest follower is behind the leo of the leader by keepInSyncMessages, the
   *    follower is not catching up and should be removed from the ISR
   **/
  val leaderLogEndOffset = leaderReplica.logEndOffset
  val candidateReplicas = inSyncReplicas - leaderReplica
  // Case 1 above
  val stuckReplicas = candidateReplicas.filter(r => (time.milliseconds - r.logEndOffsetUpdateTimeMs) > keepInSyncTimeMs)
  if (stuckReplicas.size > 0)
    debug("Stuck replicas for partition [%s,%d] are %s".format(topic, partitionId, stuckReplicas.map(_.brokerId).mkString(",")))
  // Case 2 above
  val slowReplicas = candidateReplicas.filter(r => r.logEndOffset >= 0 && (leaderLogEndOffset - r.logEndOffset) > keepInSyncMessages)
  if (slowReplicas.size > 0)
    debug("Slow replicas for partition [%s,%d] are %s".format(topic, partitionId, slowReplicas.map(_.brokerId).mkString(",")))
  stuckReplicas ++ slowReplicas
}
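maybeShrinkIsr is not invoked inline on the request path; ReplicaManager runs it on a timer. Roughly (a sketch assuming the 0.8-era scheduler and config names; the broker properties behind them are replica.lag.time.max.ms and replica.lag.max.messages):

import java.util.concurrent.TimeUnit

// Sketch: ReplicaManager schedules periodic ISR evaluation at startup
def startup() {
  // evaluate ISR membership periodically rather than on every fetch
  scheduler.schedule("isr-expiration", maybeShrinkIsr, period = config.replicaLagTimeMaxMs, unit = TimeUnit.MILLISECONDS)
}

private def maybeShrinkIsr(): Unit = {
  allPartitions.values.foreach(
    partition => partition.maybeShrinkIsr(config.replicaLagTimeMaxMs, config.replicaLagMaxMessages))
}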