KAFKA series: [19] Describe the Kafka consumer client architecture?

Hi everyone, this is a blog that keeps on learning for the sake of a dream. This series records my study notes and hands-on experience with Kafka, and I hope it helps you. As usual, the table of contents takes the form of questions and answers, like a mock interview.


Preface

In this post we walk through the overall architecture of the Kafka consumer client. By "overall architecture" I mean the consumer's core path: initialization, message consumption, offset commits, and heartbeats.
Tips: the source code below is from kafka-1.1.0.


Initialization

Initialization, as the name suggests: let's start with the constructor of the KafkaConsumer class.

    private KafkaConsumer(ConsumerConfig config,
                          Deserializer<K> keyDeserializer,
                          Deserializer<V> valueDeserializer) {
        try {
            // basic configuration items
            // ...
            List<ConsumerInterceptor<K, V>> interceptorList = (List) (new ConsumerConfig(userProvidedConfigs, false)).getConfiguredInstances(ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG,
                    ConsumerInterceptor.class);
            // interceptor / deserializer module initialization
            this.interceptors = new ConsumerInterceptors<>(interceptorList);
            if (keyDeserializer == null) {
                this.keyDeserializer = config.getConfiguredInstance(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                        Deserializer.class);
                this.keyDeserializer.configure(config.originals(), true);
            } else {
                config.ignore(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG);
                this.keyDeserializer = keyDeserializer;
            }
            if (valueDeserializer == null) {
                this.valueDeserializer = config.getConfiguredInstance(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                        Deserializer.class);
                this.valueDeserializer.configure(config.originals(), false);
            } else {
                config.ignore(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG);
                this.valueDeserializer = valueDeserializer;
            }
            ClusterResourceListeners clusterResourceListeners = configureClusterResourceListeners(keyDeserializer, valueDeserializer, reporters, interceptorList);
            this.metadata = new Metadata(retryBackoffMs, config.getLong(ConsumerConfig.METADATA_MAX_AGE_CONFIG),
                    true, false, clusterResourceListeners);
            List<InetSocketAddress> addresses = ClientUtils.parseAndValidateAddresses(config.getList(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG));
            this.metadata.update(Cluster.bootstrap(addresses), Collections.<String>emptySet(), 0);
            String metricGrpPrefix = "consumer";
            ConsumerMetrics metricsRegistry = new ConsumerMetrics(metricsTags.keySet(), "consumer");
            ChannelBuilder channelBuilder = ClientUtils.createChannelBuilder(config);

            IsolationLevel isolationLevel = IsolationLevel.valueOf(
                    config.getString(ConsumerConfig.ISOLATION_LEVEL_CONFIG).toUpperCase(Locale.ROOT));
            Sensor throttleTimeSensor = Fetcher.throttleTimeSensor(metrics, metricsRegistry.fetcherMetrics);

            int heartbeatIntervalMs = config.getInt(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG);
            // network module
            NetworkClient netClient = new NetworkClient(
                    new Selector(config.getLong(ConsumerConfig.CONNECTIONS_MAX_IDLE_MS_CONFIG), metrics, time, metricGrpPrefix, channelBuilder, logContext),
                    this.metadata,
                    clientId,
                    100, // a fixed large enough value will suffice for max in-flight requests
                    config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG),
                    config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MAX_MS_CONFIG),
                    config.getInt(ConsumerConfig.SEND_BUFFER_CONFIG),
                    config.getInt(ConsumerConfig.RECEIVE_BUFFER_CONFIG),
                    config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG),
                    time,
                    true,
                    new ApiVersions(),
                    throttleTimeSensor,
                    logContext);
            // wrap the shared network client in a ConsumerNetworkClient
            this.client = new ConsumerNetworkClient(
                    logContext,
                    netClient,
                    metadata,
                    time,
                    retryBackoffMs,
                    config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG),
                    heartbeatIntervalMs); //Will avoid blocking an extended period of time to prevent heartbeat thread starvation
            OffsetResetStrategy offsetResetStrategy = OffsetResetStrategy.valueOf(config.getString(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG).toUpperCase(Locale.ROOT));
            this.subscriptions = new SubscriptionState(offsetResetStrategy);
            this.assignors = config.getConfiguredInstances(
                    ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG,
                    PartitionAssignor.class);
            // coordinator component
            this.coordinator = new ConsumerCoordinator(logContext,
                    this.client,
                    groupId,
                    config.getInt(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG), // default: 5 minutes
                    config.getInt(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG), // default: 10 s
                    heartbeatIntervalMs,
                    assignors,
                    this.metadata,
                    this.subscriptions,
                    metrics,
                    metricGrpPrefix,
                    this.time,
                    retryBackoffMs,
                    config.getBoolean(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG),
                    config.getInt(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG),
                    this.interceptors,
                    config.getBoolean(ConsumerConfig.EXCLUDE_INTERNAL_TOPICS_CONFIG),
                    config.getBoolean(ConsumerConfig.LEAVE_GROUP_ON_CLOSE_CONFIG));
            // message fetch component
            this.fetcher = new Fetcher<>(
                    logContext,
                    this.client,
                    config.getInt(ConsumerConfig.FETCH_MIN_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.FETCH_MAX_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG),
                    config.getInt(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.MAX_POLL_RECORDS_CONFIG),
                    config.getBoolean(ConsumerConfig.CHECK_CRCS_CONFIG),
                    this.keyDeserializer,
                    this.valueDeserializer,
                    this.metadata,
                    this.subscriptions,
                    metrics,
                    metricsRegistry.fetcherMetrics,
                    this.time,
                    this.retryBackoffMs,
                    this.requestTimeoutMs,
                    isolationLevel);

            config.logUnused();
            AppInfoParser.registerAppInfo(JMX_PREFIX, clientId, metrics);

            log.debug("Kafka consumer initialized");
        } catch (Throwable t) {
            // call close methods if internal objects are already constructed
            // this is to prevent resource leak. see KAFKA-2121
            close(0, true);
            // now propagate the exception
            throw new KafkaException("Failed to construct kafka consumer", t);
        }
    }

The code above is long, but it really boils down to five parts:

  • Basic parameter setup: clientId, groupId, and so on
  • Interceptor / deserializer initialization (interceptors / deserializer)
  • Network module initialization (NetworkClient)
  • Coordinator module initialization (ConsumerCoordinator)
  • Fetch component initialization (Fetcher)

For this section it is enough to roughly know what happens during initialization, which components are created, and what each of them is responsible for.
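
As a point of reference for where these five pieces show up in user code, here is a minimal construction sketch (not taken from the post's code: the bootstrap address, group id, and class name are placeholders, and the three timeouts simply restate the defaults mentioned above):

    import java.util.Properties;
    import org.apache.kafka.clients.consumer.ConsumerConfig;
    import org.apache.kafka.clients.consumer.KafkaConsumer;
    import org.apache.kafka.common.serialization.StringDeserializer;

    public class ConsumerInitDemo {
        public static void main(String[] args) {
            Properties props = new Properties();
            props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder address
            props.put(ConsumerConfig.GROUP_ID_CONFIG, "demo-group");              // placeholder group id
            props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
            props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
            props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "10000");         // default 10 s
            props.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, "300000");      // default 5 minutes
            props.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, "3000");       // default 3 s

            // the constructor walked through above runs here: interceptors, deserializers,
            // NetworkClient, ConsumerCoordinator and Fetcher are all wired up at this point
            try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
                // subscribe + poll, covered in the next sections
            }
        }
    }

Everything that follows in this post either happens inside this constructor or is driven by calls on the resulting consumer instance.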


Message consumption

Subscribing

After initializing the KafkaConsumer, and before we can consume anything, we need to subscribe to the target topics. There are three ways to do it:

    consumer.subscribe(Collections.singletonList(this.topic)); // most common: just pass a list of topics
    consumer.assign(partitions);                                // pin a fixed set of partitions; consumer-group semantics are lost
    consumer.subscribe(pattern, callback);                      // match topics by regex, with a rebalance callback

Of the three, the first is without doubt the most common: as the name says, you just provide a list of topics.
The second drops the consumer-group semantics, so it is no longer bound by the group's partition-assignment rules and lets the number of consumers exceed the number of partitions; the trade-off is that you lose the failover a rebalance would give you, and after consuming via assign, checking the group's progress with kafka-consumer-groups.sh can behave oddly.
The third is rarely used and appears only in fairly special scenarios: it lets you match the topic list flexibly, but apart from that there is nothing special about it. A small sketch of the rebalance callback follows.
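
For that third mode, the callback is just a ConsumerRebalanceListener. Here is a small illustrative sketch (assuming a consumer instance like the one above, plus imports for Pattern, Collection, ConsumerRebalanceListener and TopicPartition; the regex and the print statement are made up for the example). Committing inside onPartitionsRevoked is a common way to hand off progress cleanly before partitions move away:

    consumer.subscribe(Pattern.compile("order-.*"), new ConsumerRebalanceListener() {
        @Override
        public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
            // called before the rebalance takes these partitions away: commit what we have processed so far
            consumer.commitSync();
        }

        @Override
        public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
            // called once the rebalance has finished, with the partitions we now own
            System.out.println("newly assigned: " + partitions);
        }
    });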

Consuming

Plenty of people have doubts about how Kafka consumption is supposed to look; the two most common questions are:

  • Do we really just write a while (true) loop to consume? — Yes, in most cases that is exactly what you do; for something a bit more graceful, use a control flag instead of a hard-coded true (see the sketch below).
  • What should the timeout in poll(timeout) be? — First understand what the parameter means: for this poll, if none of the assigned partitions has unconsumed messages, the request waits on the broker for up to that long; if a producer writes to one of those partitions within that window, the response returns right away and fetching continues, otherwise an empty record set comes back after the timeout and the next round of poll begins.
    Once the meaning is clear, you realize the parameter has little effect on your consumption logic; it only controls how long the request is parked on the broker, so anything from 100 ms to 1000 ms is usually fine. One thing to watch out for: if there are a lot of downstream consumers, say hundreds or thousands of instances, and they all use a very short timeout such as 10 ms, the broker threads driving the timing wheel get busy and you see a puzzling symptom: the producers aren't writing anything, yet broker CPU sits at 20%~40%. Follow the usual CPU-troubleshooting steps and you end up at the threads handling the delayedConsume delayed tasks.
    The point of that real-world case is that the parameter barely affects the client but does affect the broker, so in most situations 500 ms or 1000 ms is good enough.
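
Putting those two answers together, a typical consumption loop looks like the sketch below (assuming the consumer from the earlier sketch and the usual imports for Collections, AtomicBoolean, ConsumerRecord and ConsumerRecords; the topic name is a placeholder, running is whatever shutdown flag your application uses, and 500 ms is just the middle-of-the-road timeout discussed above):

    consumer.subscribe(Collections.singletonList("demo-topic"));   // placeholder topic
    AtomicBoolean running = new AtomicBoolean(true);                // flip to false on shutdown, instead of a bare while (true)
    while (running.get()) {
        // waits on the broker for at most 500 ms when no data is available yet
        ConsumerRecords<String, String> records = consumer.poll(500);
        for (ConsumerRecord<String, String> record : records) {
            // business logic goes here
            System.out.printf("partition=%d offset=%d value=%s%n",
                    record.partition(), record.offset(), record.value());
        }
    }
    consumer.close();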

With those two key questions out of the way, let's look at the corresponding source:

    @Override
    public ConsumerRecords<K, V> poll(long timeout) {
        acquireAndEnsureOpen();
        try {
            if (timeout < 0)
                throw new IllegalArgumentException("Timeout must not be negative");

            if (this.subscriptions.hasNoSubscriptionOrUserAssignment())
                throw new IllegalStateException("Consumer is not subscribed to any topics or assigned any partitions");

            // poll for new data until the timeout expires
            long start = time.milliseconds();
            long remaining = timeout;
            do {
                Map<TopicPartition, List<ConsumerRecord<K, V>>> records = pollOnce(remaining);
                if (!records.isEmpty()) {
                    // before returning the fetched records, we can send off the next round of fetches
                    // and avoid block waiting for their responses to enable pipelining while the user
                    // is handling the fetched records.
                    //
                    // NOTE: since the consumed position has already been updated, we must not allow
                    // wakeups or any other errors to be triggered prior to returning the fetched records.
                    if (fetcher.sendFetches() > 0 || client.hasPendingRequests())
                        client.pollNoWakeup();
                    // run the interceptor chain on the records before returning them
                    return this.interceptors.onConsume(new ConsumerRecords<>(records));
                }

                long elapsed = time.milliseconds() - start;
                remaining = timeout - elapsed;
            } while (remaining > 0);

            return ConsumerRecords.empty();
        } finally {
            release();
        }
    }

As you can see, the core call is pollOnce(remaining):

    /**
     * Do one round of polling. In addition to checking for new data, this does any needed offset commits
     * (if auto-commit is enabled), and offset resets (if an offset reset policy is defined).
     * @param timeout The maximum time to block in the underlying call to {@link ConsumerNetworkClient#poll(long)}.
     * @return The fetched records (may be empty)
     */
    private Map<TopicPartition, List<ConsumerRecord<K, V>>> pollOnce(long timeout) {
        client.maybeTriggerWakeup();

        long startMs = time.milliseconds();
        // activate the coordinator component
        coordinator.poll(startMs, timeout);

        // Lookup positions of assigned partitions
        boolean hasAllFetchPositions = updateFetchPositions();

        // if data is available already, return it immediately
        // if data is already available (in the local cache), return it immediately
        Map<TopicPartition, List<ConsumerRecord<K, V>>> records = fetcher.fetchedRecords();
        if (!records.isEmpty())
            return records;

        // send any new fetches (won't resend pending fetches)
        // if the cache is empty, send FETCH requests to pull data
        fetcher.sendFetches();

        long nowMs = time.milliseconds();
        long remainingTimeMs = Math.max(0, timeout - (nowMs - startMs));
        long pollTimeout = Math.min(coordinator.timeToNextPoll(nowMs), remainingTimeMs);

        // We do not want to be stuck blocking in poll if we are missing some positions
        // since the offset lookup may be backing off after a failure
        if (!hasAllFetchPositions && pollTimeout > retryBackoffMs)
            pollTimeout = retryBackoffMs;
        // after queueing the requests above, client.poll does the actual network I/O
        client.poll(pollTimeout, nowMs, new PollCondition() {
            // shouldBlock returns true when the FETCH requests have not yet brought back any data
            @Override
            public boolean shouldBlock() {
                // since a fetch might be completed by the background thread, we need this poll condition
                // to ensure that we do not block unnecessarily in poll()
                return !fetcher.hasCompletedFetches();
            }
        });

        // after the long poll, we should check whether the group needs to rebalance
        // prior to returning data so that the group can stabilize faster
        if (coordinator.needRejoin())
            return Collections.emptyMap();
        // read from the cache again
        return fetcher.fetchedRecords();
    }

This function does three main things; let's look at them one by one:

  • Activate the coordinator
  • Pull data from the cache and return it
  • If the cache has no data, send Fetch requests, buffer the responses in the cache, and finally pull from the cache and return.

Activating the coordinator is the critical step.
From the code below we can see it performs quite a few key actions:

  • Locate the coordinator, which boils down to sending a Find_Coordinator request
  • Activate the consumer group, mainly by starting the heartbeat thread and sending a Join_Group request
  • Finally, check whether an auto commit is due

There is a lot of information packed into this function, so let's condense it to make it easier to remember:
1. The heartbeat thread is not started during initialization; it is started when the coordinator module is activated on the first poll. That makes sense, since heartbeats are exchanged with the coordinator node.
2. Metadata is refreshed after the consumer group has been activated, so the consumer also establishes its connections to the relevant nodes on the first poll.
3. Auto commit is implemented at the entry of pollOnce: the first thing every poll does is check whether the progress of the previous batch needs to be committed automatically.

   /**
     * Poll for coordinator events. This ensures that the coordinator is known and that the consumer
     * has joined the group (if it is using group management). This also handles periodic offset commits
     * if they are enabled.
     *
     * @param now current time in milliseconds
     */
    public void poll(long now, long remainingMs) {
        // run callbacks for offset commits that have already completed
        invokeCompletedOffsetCommitCallbacks();
        // if partitions are auto-assigned (i.e. not assign mode), follow the consumer-group logic
        if (subscriptions.partitionsAutoAssigned()) {
            // make sure the coordinator is known
            if (coordinatorUnknown()) {
                ensureCoordinatorReady();
                now = time.milliseconds();
            }
            // if partition or subscription information has changed, we need to REJOIN
            if (needRejoin()) {
                // due to a race condition between the initial metadata fetch and the initial rebalance,
                // we need to ensure that the metadata is fresh before joining initially. This ensures
                // that we have matched the pattern against the cluster's topics at least once before joining.
                // for pattern subscriptions, make sure the metadata is fresh before joining
                if (subscriptions.hasPatternSubscription())
                    client.ensureFreshMetadata();
                // 1. double-check the coordinator
                // 2. start the heartbeatThread
                // 3. send the JOIN_GROUP request
                ensureActiveGroup();
                now = time.milliseconds();
            }
            // check the heartbeat thread state
            pollHeartbeat(now);
        } else {
            // For manually assigned partitions, if there are no ready nodes, await metadata.
            // If connections to all nodes fail, wakeups triggered while attempting to send fetch
            // requests result in polls returning immediately, causing a tight loop of polls. Without
            // the wakeup, poll() with no channels would block for the timeout, delaying re-connection.
            // awaitMetadataUpdate() initiates new connections with configured backoff and avoids the busy loop.
            // When group management is used, metadata wait is already performed for this scenario as
            // coordinator is unknown, hence this check is not required.
            if (metadata.updateRequested() && !client.hasReadyNodes()) {
                boolean metadataUpdated = client.awaitMetadataUpdate(remainingMs);
                if (!metadataUpdated && !client.hasReadyNodes())
                    return;
                now = time.milliseconds();
            }
        }
        // check whether an auto commit is due
        maybeAutoCommitOffsetsAsync(now);
    }

    protected synchronized boolean ensureCoordinatorReady(long startTimeMs, long timeoutMs) {
        long remainingMs = timeoutMs;

        while (coordinatorUnknown()) {
            RequestFuture<Void> future = lookupCoordinator();
            client.poll(future, remainingMs);

            if (future.failed()) {
                if (future.isRetriable()) {
                    remainingMs = timeoutMs - (time.milliseconds() - startTimeMs);
                    if (remainingMs <= 0)
                        break;

                    log.debug("Coordinator discovery failed, refreshing metadata");
                    client.awaitMetadataUpdate(remainingMs);
                } else
                    throw future.exception();
            } else if (coordinator != null && client.connectionFailed(coordinator)) {
                // we found the coordinator, but the connection has failed, so mark
                // it dead and backoff before retrying discovery
                markCoordinatorUnknown();
                time.sleep(retryBackoffMs);
            }

            remainingMs = timeoutMs - (time.milliseconds() - startTimeMs);
            if (remainingMs <= 0)
                break;
        }

        return !coordinatorUnknown();
    }

    public void ensureActiveGroup() {
        // always ensure that the coordinator is ready because we may have been disconnected
        // when sending heartbeats and does not necessarily require us to rejoin the group.
        ensureCoordinatorReady();
        startHeartbeatThreadIfNeeded();
        joinGroupIfNeeded();
    }

Next, let's see how records are pulled out of the cache. From the source we can tell the data is buffered in a ConcurrentLinkedQueue<CompletedFetch> completedFetches queue, managed as one CompletedFetch per partition.

    /**
     * Return the fetched records, empty the record buffer and update the consumed position.
     *
     * NOTE: returning empty records guarantees the consumed position are NOT updated.
     *
     * @return The fetched records per partition
     * @throws OffsetOutOfRangeException If there is OffsetOutOfRange error in fetchResponse and
     *         the defaultResetPolicy is NONE
     */
    public Map<TopicPartition, List<ConsumerRecord<K, V>>> fetchedRecords() {
        Map<TopicPartition, List<ConsumerRecord<K, V>>> fetched = new HashMap<>();
        int recordsRemaining = maxPollRecords;

        try {
            // pull at most max.poll.records records
            while (recordsRemaining > 0) {
                // take the next CompletedFetch out of completedFetches
                if (nextInLineRecords == null || nextInLineRecords.isFetched) {
                    CompletedFetch completedFetch = completedFetches.peek();
                    // nothing left in the queue, stop here
                    if (completedFetch == null) break;

                    nextInLineRecords = parseCompletedFetch(completedFetch);
                    completedFetches.poll();
                } else {
                    // convert the CompletedFetch into ConsumerRecords
                    List<ConsumerRecord<K, V>> records = fetchRecords(nextInLineRecords, recordsRemaining);
                    TopicPartition partition = nextInLineRecords.partition;
                    if (!records.isEmpty()) {
                        List<ConsumerRecord<K, V>> currentRecords = fetched.get(partition);
                        if (currentRecords == null) {
                            fetched.put(partition, records);
                        } else {
                            // this case shouldn't usually happen because we only send one fetch at a time per partition,
                            // but it might conceivably happen in some rare cases (such as partition leader changes).
                            // we have to copy to a new list because the old one may be immutable
                            List<ConsumerRecord<K, V>> newRecords = new ArrayList<>(records.size() + currentRecords.size());
                            newRecords.addAll(currentRecords);
                            newRecords.addAll(records);
                            fetched.put(partition, newRecords);
                        }
                        recordsRemaining -= records.size();
                    }
                }
            }
        } catch (KafkaException e) {
            if (fetched.isEmpty())
                throw e;
        }
        return fetched;
    }
	

    private List<ConsumerRecord<K, V>> fetchRecords(PartitionRecords partitionRecords, int maxRecords) {
        if (!subscriptions.isAssigned(partitionRecords.partition)) {
            // this can happen when a rebalance happened before fetched records are returned to the consumer's poll call
            log.debug("Not returning fetched records for partition {} since it is no longer assigned",
                    partitionRecords.partition);
        } else if (!subscriptions.isFetchable(partitionRecords.partition)) {
            // this can happen when a partition is paused before fetched records are returned to the consumer's
            // poll call or if the offset is being reset
            log.debug("Not returning fetched records for assigned partition {} since it is no longer fetchable",
                    partitionRecords.partition);
        } else {
            long position = subscriptions.position(partitionRecords.partition);
            // check that the in-memory nextFetchOffset matches the start offset of this fetched batch
            if (partitionRecords.nextFetchOffset == position) {
                List<ConsumerRecord<K, V>> partRecords = partitionRecords.fetchRecords(maxRecords);

                long nextOffset = partitionRecords.nextFetchOffset;
                log.trace("Returning fetched records at offset {} for assigned partition {} and update " +
                        "position to {}", position, partitionRecords.partition, nextOffset);
                // update this partition's consumed position to nextOffset.
                // After this fetch, the position read from partitionState.position is always curRecords.offset + 1,
                // so a manual async/sync commit right after poll() submits this whole batch up front and may
                // therefore lose messages if processing later fails.
                // Auto commit, by contrast, only fires after the previous batch has been consumed, so with
                // synchronous processing it can only cause duplicates, never loss; with asynchronous processing
                // both duplicate consumption and message loss are possible.
                subscriptions.position(partitionRecords.partition, nextOffset);

                Long partitionLag = subscriptions.partitionLag(partitionRecords.partition, isolationLevel);
                if (partitionLag != null)
                    this.sensors.recordPartitionLag(partitionRecords.partition, partitionLag);

                return partRecords;
            } else {
                // these records aren't next in line based on the last consumed position, ignore them
                // they must be from an obsolete request
                log.debug("Ignoring fetched records for {} at offset {} since the current position is {}",
                        partitionRecords.partition, partitionRecords.nextFetchOffset, position);
            }
        }

        partitionRecords.drain();
        return emptyList();
    }
	

    private static class CompletedFetch {
        private final TopicPartition partition;
        private final long fetchedOffset;
        private final FetchResponse.PartitionData partitionData;
        private final FetchResponseMetricAggregator metricAggregator;
        private final short responseVersion;

        private CompletedFetch(TopicPartition partition,
                               long fetchedOffset,
                               FetchResponse.PartitionData partitionData,
                               FetchResponseMetricAggregator metricAggregator,
                               short responseVersion) {
            this.partition = partition;
            this.fetchedOffset = fetchedOffset;
            this.partitionData = partitionData;
            this.metricAggregator = metricAggregator;
            this.responseVersion = responseVersion;
        }
    }

Now let's see how the fetched data ends up in completedFetches; it really does two things:
1. Work out which Node each fetch request should be sent to.
2. Send the request and, when the response arrives, wrap the data and enqueue it into completedFetches.

    public int sendFetches() {
        Map<Node, FetchSessionHandler.FetchRequestData> fetchRequestMap = prepareFetchRequests();
        for (Map.Entry<Node, FetchSessionHandler.FetchRequestData> entry : fetchRequestMap.entrySet()) {
            final Node fetchTarget = entry.getKey();
            final FetchSessionHandler.FetchRequestData data = entry.getValue();
            final FetchRequest.Builder request = FetchRequest.Builder
                    .forConsumer(this.maxWaitMs, this.minBytes, data.toSend())
                    .isolationLevel(isolationLevel)
                    .setMaxBytes(this.maxBytes)
                    .metadata(data.metadata())
                    .toForget(data.toForget());
            if (log.isDebugEnabled()) {
                log.debug("Sending {} {} to broker {}", isolationLevel, data.toString(), fetchTarget);
            }
            // fetch data from the target node via the network client,
            // then buffer the results in completedFetches
            client.send(fetchTarget, request)
                    .addListener(new RequestFutureListener<ClientResponse>() {
                        @Override
                        public void onSuccess(ClientResponse resp) {
                            FetchResponse response = (FetchResponse) resp.responseBody();
                            FetchSessionHandler handler = sessionHandlers.get(fetchTarget.id());
                            if (handler == null) {
                                log.error("Unable to find FetchSessionHandler for node {}. Ignoring fetch response.",
                                    fetchTarget.id());
                                return;
                            }
                            if (!handler.handleResponse(response)) {
                                return;
                            }

                            Set<TopicPartition> partitions = new HashSet<>(response.responseData().keySet());
                            FetchResponseMetricAggregator metricAggregator = new FetchResponseMetricAggregator(sensors, partitions);
                            // on success, buffer each partition's data in completedFetches
                            for (Map.Entry<TopicPartition, FetchResponse.PartitionData> entry : response.responseData().entrySet()) {
                                TopicPartition partition = entry.getKey();
                                long fetchOffset = data.sessionPartitions().get(partition).fetchOffset;
                                FetchResponse.PartitionData fetchData = entry.getValue();

                                log.debug("Fetch {} at offset {} for partition {} returned fetch data {}",
                                        isolationLevel, fetchOffset, partition, fetchData);
                                completedFetches.add(new CompletedFetch(partition, fetchOffset, fetchData, metricAggregator,
                                        resp.requestHeader().apiVersion()));
                            }

                            sensors.fetchLatency.record(resp.requestLatencyMs());
                        }

                        @Override
                        public void onFailure(RuntimeException e) {
                            FetchSessionHandler handler = sessionHandlers.get(fetchTarget.id());
                            if (handler != null) {
                                handler.handleError(e);
                            }
                        }
                    });
        }
        return fetchRequestMap.size();
    }

At this point the consumption logic is basically covered. In short:
1. Activate the coordinator, activate the consumer group, start the heartbeat thread, and check whether an auto commit is due.
2. Try to pull data from the completedFetches cache; if there is none, send fetch requests.
3. Wrap the fetch responses into the completedFetches queue, then pull from it once more and return the record set.


Offset commits

After we have processed a batch of messages, how do we let the broker know? The answer is offset commits.
There are three ways to commit (a usage sketch follows the list):

  • Auto commit: enable.auto.commit=true turns it on (it is the default), and offsets are committed every 5 s by default.
  • Asynchronous commit: immediately sends a Commit_Offset request without waiting for the response.
  • Synchronous commit: immediately sends a Commit_Offset request and blocks until the response comes back.
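
Before diving into the source, here is a rough usage sketch with auto commit switched off (enable.auto.commit=false); process(...) stands for a hypothetical business handler, and the pattern of asynchronous commits inside the loop plus one synchronous commit on shutdown is a common compromise between throughput and reliability:

    while (running.get()) {
        ConsumerRecords<String, String> records = consumer.poll(500);
        for (ConsumerRecord<String, String> record : records) {
            process(record);                                        // hypothetical business handler
        }
        // asynchronous commit: send the Commit_Offset request and keep polling without waiting for the response
        consumer.commitAsync(new OffsetCommitCallback() {
            @Override
            public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
                if (exception != null)
                    System.err.println("async commit failed for " + offsets + ": " + exception);
            }
        });
    }
    try {
        consumer.commitSync();                                      // one blocking commit before shutting down
    } finally {
        consumer.close();
    }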

Let's look at the corresponding source.

Auto commit

Three functions are involved; the most important one is allConsumed, which decides exactly which offset gets committed.
From the code we can see that what gets committed is each partition's latest position, and that position is set in the fetch path we looked at above (go back and check: it is the next offset recorded right after the batch was handed out). In other words, auto commit means that on every poll we check whether the auto-commit condition is met, and if so we commit the latest positions of the previously fetched batch.

    public void maybeAutoCommitOffsetsAsync(long now) {
        if (autoCommitEnabled && now >= nextAutoCommitDeadline) {
            this.nextAutoCommitDeadline = now + autoCommitIntervalMs;
            doAutoCommitOffsetsAsync();
        }
    }
	
    private void doAutoCommitOffsetsAsync() {
        // collect the consumed positions of all assigned partitions
        Map<TopicPartition, OffsetAndMetadata> allConsumedOffsets = subscriptions.allConsumed();
        log.debug("Sending asynchronous auto-commit of offsets {}", allConsumedOffsets);

        commitOffsetsAsync(allConsumedOffsets, new OffsetCommitCallback() {
            @Override
            public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
                if (exception != null) {
                    if (exception instanceof RetriableException) {
                        log.debug("Asynchronous auto-commit of offsets {} failed due to retriable error: {}", offsets,
                                exception);
                        nextAutoCommitDeadline = Math.min(time.milliseconds() + retryBackoffMs, nextAutoCommitDeadline);
                    } else {
                        log.warn("Asynchronous auto-commit of offsets {} failed: {}", offsets, exception.getMessage());
                    }
                } else {
                    log.debug("Completed asynchronous auto-commit of offsets {}", offsets);
                }
            }
        });
    }

    public Map<TopicPartition, OffsetAndMetadata> allConsumed() {
        Map<TopicPartition, OffsetAndMetadata> allConsumed = new HashMap<>();
        // read the position from each partition's state, i.e. the position recorded
        // right after the newest record fetched for that partition in the previous poll
        for (PartitionStates.PartitionState<TopicPartitionState> state : assignment.partitionStates()) {
            if (state.value().hasValidPosition())
                allConsumed.put(state.topicPartition(), new OffsetAndMetadata(state.value().position));
        }
        return allConsumed;
    }

Asynchronous / synchronous commits

Why put these two together? Because apart from whether they wait for the response, there is barely any difference.
From the code below we can see that, in their default (no-argument) form, both ultimately rely on the same allConsumed function, so every call also commits the group's latest fetched positions; don't overuse these default overloads.
Of course, both functions can also commit at the granularity of individual offsets via their other overloads. That protects message reliability, but throughput drops dramatically; have a look at those overloads if you're interested, and a small sketch follows below.
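
As mentioned, the other overloads let you commit at offset granularity. Below is a minimal sketch of the synchronous per-record variant (process(...) is still a hypothetical handler, and the usual imports for TopicPartition, OffsetAndMetadata and Collections are assumed); note the +1, because the committed offset is the position of the next message to be consumed:

    for (ConsumerRecord<String, String> record : records) {
        process(record);                                            // hypothetical business handler
        TopicPartition tp = new TopicPartition(record.topic(), record.partition());
        // commit the NEXT position to consume, i.e. the offset just processed plus one
        consumer.commitSync(Collections.singletonMap(tp, new OffsetAndMetadata(record.offset() + 1)));
    }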

    @Override
    public void commitAsync(OffsetCommitCallback callback) {
        acquireAndEnsureOpen();
        try {
            commitAsync(subscriptions.allConsumed(), callback);
        } finally {
            release();
        }
    }
	
    @Override
    public void commitSync() {
        acquireAndEnsureOpen();
        try {
            coordinator.commitOffsetsSync(subscriptions.allConsumed(), Long.MAX_VALUE);
        } finally {
            release();
        }
    }
	



    private void doCommitOffsetsAsync(final Map<TopicPartition, OffsetAndMetadata> offsets, final OffsetCommitCallback callback) {
        // send the offset-commit request
        RequestFuture<Void> future = sendOffsetCommitRequest(offsets);
        final OffsetCommitCallback cb = callback == null ? defaultOffsetCommitCallback : callback;
        future.addListener(new RequestFutureListener<Void>() {
            @Override
            public void onSuccess(Void value) {
                // interceptors also get a chance to act on a successful commit
                if (interceptors != null)
                    interceptors.onCommit(offsets);
                // queue the completed commit so its callback runs later on the caller thread
                completedOffsetCommits.add(new OffsetCommitCompletion(cb, offsets, null));
            }

            @Override
            public void onFailure(RuntimeException e) {
                Exception commitException = e;
                // retriable exception
                if (e instanceof RetriableException)
                    commitException = new RetriableCommitFailedException(e);
                // queue it as well, together with the exception
                completedOffsetCommits.add(new OffsetCommitCompletion(cb, offsets, commitException));
            }
        });
    }

    public boolean commitOffsetsSync(Map<TopicPartition, OffsetAndMetadata> offsets, long timeoutMs) {
        invokeCompletedOffsetCommitCallbacks();

        if (offsets.isEmpty())
            return true;

        long now = time.milliseconds();
        long startMs = now;
        long remainingMs = timeoutMs;
        do {
            if (coordinatorUnknown()) {
                if (!ensureCoordinatorReady(now, remainingMs))
                    return false;

                remainingMs = timeoutMs - (time.milliseconds() - startMs);
            }

            RequestFuture<Void> future = sendOffsetCommitRequest(offsets);
            client.poll(future, remainingMs);

            // We may have had in-flight offset commits when the synchronous commit began. If so, ensure that
            // the corresponding callbacks are invoked prior to returning in order to preserve the order that
            // the offset commits were applied.
            invokeCompletedOffsetCommitCallbacks();

            if (future.succeeded()) {
                if (interceptors != null)
                    interceptors.onCommit(offsets);
                return true;
            }

            if (future.failed() && !future.isRetriable())
                throw future.exception();

            time.sleep(retryBackoffMs);

            now = time.milliseconds();
            remainingMs = timeoutMs - (now - startMs);
        } while (remainingMs > 0);

        return false;
    }

Finally, after reading all this code, some of you will still ask: which commit style should I choose?
It really depends on the business; otherwise Kafka would not offer so many options.
If the business can tolerate duplicate messages, auto commit is enough.
If the business must not lose messages on the consumer side, use offset-level synchronous commits.
If the business wants fairly high reliability with acceptable throughput, the default asynchronous commit is a reasonable choice.
...
And so on; fundamentally it is a trade-off between reliability and throughput.


Heartbeat

Finally, let's look at the client-side heartbeat.
We saw above that the heartbeat thread is only started on the first poll; let's dig into the details.
The key takeaway is that the heartbeat thread is a daemon thread, so on a machine whose CPU is heavily loaded it may fail to get CPU time in time; the heartbeats then stop arriving and the consumer gets kicked out of the consumer group.

    private synchronized void startHeartbeatThreadIfNeeded() {
        if (heartbeatThread == null) {
            heartbeatThread = new HeartbeatThread();
            heartbeatThread.start();
        }
    }
	
	
public static final String HEARTBEAT_THREAD_PREFIX = "kafka-coordinator-heartbeat-thread";

    private class HeartbeatThread extends KafkaThread {
        private boolean enabled = false;
        private boolean closed = false;
        private AtomicReference<RuntimeException> failed = new AtomicReference<>(null);

        private HeartbeatThread() {
            super(HEARTBEAT_THREAD_PREFIX + (groupId.isEmpty() ? "" : " | " + groupId), true);
        }

    public KafkaThread(final String name, boolean daemon) {
        super(name);
        configureThread(name, daemon);
    }

    private void configureThread(final String name, boolean daemon) {
        setDaemon(daemon);
        setUncaughtExceptionHandler(new UncaughtExceptionHandler() {
            public void uncaughtException(Thread t, Throwable e) {
                log.error("Uncaught exception in thread '{}':", name, e);
            }
        });
    }

Now let's see what the heartbeat thread actually does:

  • First, it checks whether the heartbeat session has timed out; if so, the coordinator is marked unknown and the next poll triggers a rebalance.
  • Next, it checks whether the gap between two polls has exceeded max.poll.interval.ms (5 minutes by default); if so, it proactively sends a LeaveGroup request and the next poll triggers a rebalance.
  • Then it checks whether the heartbeat interval has elapsed; if not, it waits for the configured retry backoff (100 ms by default).
  • Finally, when the time has come, it sends the heartbeat request and registers the corresponding listener.

Beyond that flow, the most important part is how HeartbeatResponseHandler deals with the response:

  • No error: handle the response normally.
  • The coordinator is unavailable or is not the coordinator: run Find_Coordinator again.
  • The group is in the middle of a rebalance: request a rejoin, so a JoinGroup request is sent to re-enter the group.
  • Illegal generation: we have been out of touch long enough for everyone else's rebalance to finish, so reset the generation and rejoin.
  • Invalid memberId: same as above, reset the generation and rejoin.
  • Any other error is propagated to the caller.

        @Override
        public void run() {
            try {
                log.debug("Heartbeat thread started");
                while (true) {
                    synchronized (AbstractCoordinator.this) {
                        if (closed)
                            return;

                        if (!enabled) {
                            AbstractCoordinator.this.wait();
                            continue;
                        }

                        if (state != MemberState.STABLE) {
                            // the group is not stable (perhaps because we left the group or because the coordinator
                            // kicked us out), so disable heartbeats and wait for the main thread to rejoin.
                            disable();
                            continue;
                        }

                        client.pollNoWakeup();
                        long now = time.milliseconds();

                        if (coordinatorUnknown()) {
                            if (findCoordinatorFuture != null || lookupCoordinator().failed())
                                // the immediate future check ensures that we backoff properly in the case that no
                                // brokers are available to connect to.
                                AbstractCoordinator.this.wait(retryBackoffMs);
                        } else if (heartbeat.sessionTimeoutExpired(now)) {
                            // the session timeout has expired without seeing a successful heartbeat, so we should
                            // probably make sure the coordinator is still healthy.
                            markCoordinatorUnknown();
                        } else if (heartbeat.pollTimeoutExpired(now)) {
                            // the poll timeout has expired, which means that the foreground thread has stalled
                            // in between calls to poll(), so we explicitly leave the group.
                            maybeLeaveGroup();
                        } else if (!heartbeat.shouldHeartbeat(now)) {
                            // poll again after waiting for the retry backoff in case the heartbeat failed or the
                            // coordinator disconnected
                            AbstractCoordinator.this.wait(retryBackoffMs);
                        } else {
                            heartbeat.sentHeartbeat(now);

                            sendHeartbeatRequest().addListener(new RequestFutureListener<Void>() {
                                @Override
                                public void onSuccess(Void value) {
                                    synchronized (AbstractCoordinator.this) {
                                        heartbeat.receiveHeartbeat(time.milliseconds());
                                    }
                                }

                                @Override
                                public void onFailure(RuntimeException e) {
                                    synchronized (AbstractCoordinator.this) {
                                        if (e instanceof RebalanceInProgressException) {
                                            // it is valid to continue heartbeating while the group is rebalancing. This
                                            // ensures that the coordinator keeps the member in the group for as long
                                            // as the duration of the rebalance timeout. If we stop sending heartbeats,
                                            // however, then the session timeout may expire before we can rejoin.
                                            heartbeat.receiveHeartbeat(time.milliseconds());
                                        } else {
                                            heartbeat.failHeartbeat();

                                            // wake up the thread if it's sleeping to reschedule the heartbeat
                                            AbstractCoordinator.this.notify();
                                        }
                                    }
                                }
                            });
                        }
                    }
                }
            } catch (AuthenticationException e) {
                log.error("An authentication error occurred in the heartbeat thread", e);
                this.failed.set(e);
            } catch (GroupAuthorizationException e) {
                log.error("A group authorization error occurred in the heartbeat thread", e);
                this.failed.set(e);
            } catch (InterruptedException | InterruptException e) {
                Thread.interrupted();
                log.error("Unexpected interrupt received in heartbeat thread", e);
                this.failed.set(new RuntimeException(e));
            } catch (Throwable e) {
                log.error("Heartbeat thread failed due to unexpected error", e);
                if (e instanceof RuntimeException)
                    this.failed.set((RuntimeException) e);
                else
                    this.failed.set(new RuntimeException(e));
            } finally {
                log.debug("Heartbeat thread has closed");
            }
        }
    }
	

    // visible for testing
    synchronized RequestFuture<Void> sendHeartbeatRequest() {
        log.debug("Sending Heartbeat request to coordinator {}", coordinator);
        HeartbeatRequest.Builder requestBuilder =
                new HeartbeatRequest.Builder(this.groupId, this.generation.generationId, this.generation.memberId);
        // compose with a handler that processes the response and triggers the outer callbacks
        return client.send(coordinator, requestBuilder)
                .compose(new HeartbeatResponseHandler());
    }

    private class HeartbeatResponseHandler extends CoordinatorResponseHandler<HeartbeatResponse, Void> {
        @Override
        public void handle(HeartbeatResponse heartbeatResponse, RequestFuture<Void> future) {
            sensors.heartbeatLatency.record(response.requestLatencyMs());
            Errors error = heartbeatResponse.error();
            if (error == Errors.NONE) {
                log.debug("Received successful Heartbeat response");
                future.complete(null);
            } else if (error == Errors.COORDINATOR_NOT_AVAILABLE
                    || error == Errors.NOT_COORDINATOR) {
                log.debug("Attempt to heartbeat since coordinator {} is either not started or not valid.",
                        coordinator());
                markCoordinatorUnknown();
                future.raise(error);
            } else if (error == Errors.REBALANCE_IN_PROGRESS) {
                log.debug("Attempt to heartbeat failed since group is rebalancing");
                requestRejoin();
                future.raise(Errors.REBALANCE_IN_PROGRESS);
            } else if (error == Errors.ILLEGAL_GENERATION) {
                log.debug("Attempt to heartbeat failed since generation {} is not current", generation.generationId);
                resetGeneration();
                future.raise(Errors.ILLEGAL_GENERATION);
            } else if (error == Errors.UNKNOWN_MEMBER_ID) {
                log.debug("Attempt to heartbeat failed for since member id {} is not valid.", generation.memberId);
                resetGeneration();
                future.raise(Errors.UNKNOWN_MEMBER_ID);
            } else if (error == Errors.GROUP_AUTHORIZATION_FAILED) {
                future.raise(new GroupAuthorizationException(groupId));
            } else {
                future.raise(new KafkaException("Unexpected error in heartbeat response: " + error.message()));
            }
        }
    }

Summary

That wraps up the walkthrough of the consumer's core path. It is admittedly more of a guided code reading than a finished write-up, with no condensed notes or diagrams yet, so treat this post as half done; I will fill in the rest when I find the time. If anything here raises questions, feel free to leave a comment ~ ^_^
