ES Write Flow
Writing to an ES index involves two flows: the coordinating-node flow and the flow on the nodes that actually write the index.
The coordinating node mainly handles pre-processing (ingest pipelines), validation, and distribution of the work.
When a node finishes executing its part, it responds to the node holding the primary shard; that node sends the response back to the coordinating node, and the coordinating node returns it to the client.
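For orientation, here is a minimal client-side sketch of the kind of bulk request that enters this flow. It uses the 6.x-era high-level REST client (newer versions take an extra RequestOptions argument); the host, index name, and documents are made up for illustration.

```java
import org.apache.http.HttpHost;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;

public class BulkExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical local cluster; adjust host/port for your environment.
        try (RestHighLevelClient client = new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")))) {
            BulkRequest bulk = new BulkRequest();
            // Each item is an independent index/update/delete operation.
            bulk.add(new IndexRequest("test_index", "doc", "1")
                    .source(XContentType.JSON, "field", "value1"));
            bulk.add(new IndexRequest("test_index", "doc", "2")
                    .source(XContentType.JSON, "field", "value2"));
            // On the server side this enters TransportBulkAction#doExecute.
            BulkResponse response = client.bulk(bulk);
            System.out.println("errors: " + response.hasFailures());
        }
    }
}
```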
The entry point is TransportBulkAction#doExecute:

```java
@Override
protected void doExecute(Task task, BulkRequest bulkRequest, ActionListener<BulkResponse> listener) {
    if (bulkRequest.hasIndexRequestsWithPipelines()) { // a pipeline was specified
        if (clusterService.localNode().isIngestNode()) {
            // the local node is capable of the pre-processing itself
            processBulkIndexIngestRequest(task, bulkRequest, listener);
        } else {
            // otherwise pick one of the ingest-capable nodes (the pick uses Math.floor on a rotating index)
            ingestForwarder.forwardIngestRequest(BulkAction.INSTANCE, bulkRequest, listener);
        }
        return;
    }
    final long startTime = relativeTime();
    final AtomicArray<BulkItemResponse> responses = new AtomicArray<>(bulkRequest.requests.size());
    if (needToCheck()) { // auto-creation of missing indices is enabled
        final Set<String> indices = bulkRequest.requests.stream()
            // delete requests should not attempt to create the index (if the index does not
            // exists), unless an external versioning is used
            .filter(request -> request.opType() != DocWriteRequest.OpType.DELETE
                    || request.versionType() == VersionType.EXTERNAL
                    || request.versionType() == VersionType.EXTERNAL_GTE)
            .map(DocWriteRequest::index)
            .collect(Collectors.toSet()); // keep non-deletes, plus EXTERNAL / EXTERNAL_GTE deletes
        /* Step 2: filter that to indices that don't exist and we can create. At the same time build a map of indices we can't create
         * that we'll use when we try to run the requests. */
        final Map<String, IndexNotFoundException> indicesThatCannotBeCreated = new HashMap<>();
        Set<String> autoCreateIndices = new HashSet<>();
        ClusterState state = clusterService.state();
        for (String index : indices) {
            boolean shouldAutoCreate;
            try {
                shouldAutoCreate = shouldAutoCreate(index, state);
            } catch (IndexNotFoundException e) {
                shouldAutoCreate = false;
                indicesThatCannotBeCreated.put(index, e);
            }
            if (shouldAutoCreate) {
                autoCreateIndices.add(index);
            }
        }
        // Step 3: create all the indices that are missing, if there are any missing. start the bulk after all the creates come back.
        if (autoCreateIndices.isEmpty()) {
            // nothing needs creating: submit the bulk directly
            executeBulk(task, bulkRequest, startTime, listener, responses, indicesThatCannotBeCreated);
        } else {
            // otherwise create the indices first, then submit the bulk
            final AtomicInteger counter = new AtomicInteger(autoCreateIndices.size());
            for (String index : autoCreateIndices) {
                createIndex(index, bulkRequest.timeout(), new ActionListener<CreateIndexResponse>() {
                    @Override
                    public void onResponse(CreateIndexResponse result) {
                        if (counter.decrementAndGet() == 0) { // all creates came back: submit the bulk
                            executeBulk(task, bulkRequest, startTime, listener, responses, indicesThatCannotBeCreated);
                        }
                    }

                    @Override
                    public void onFailure(Exception e) {
                        if (!(ExceptionsHelper.unwrapCause(e) instanceof ResourceAlreadyExistsException)) {
                            // fail all requests involving this index, if create didn't work
                            for (int i = 0; i < bulkRequest.requests.size(); i++) {
                                // record the failure and null out the failed item in the bulk
                                DocWriteRequest request = bulkRequest.requests.get(i);
                                if (request != null && setResponseFailureIfIndexMatches(responses, i, request, index, e)) {
                                    bulkRequest.requests.set(i, null);
                                }
                            }
                        }
                        if (counter.decrementAndGet() == 0) {
                            // the counter reaches zero even when a create fails, because several bulk items
                            // may target the same index; the surviving items are still submitted
                            executeBulk(task, bulkRequest, startTime, ActionListener.wrap(listener::onResponse, inner -> {
                                inner.addSuppressed(e);
                                listener.onFailure(inner);
                            }), responses, indicesThatCannotBeCreated);
                        }
                    }
                });
            }
        }
    } else {
        executeBulk(task, bulkRequest, startTime, listener, responses, emptyMap());
    }
}
```
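The create-then-bulk coordination above is a simple countdown pattern: one continuation shared across N asynchronous calls, with an AtomicInteger deciding who fires it. A stripped-down sketch of just that pattern; CompletableFuture stands in for the ES ActionListener machinery:

```java
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;

public class CountdownExample {
    // Fire `whenAllDone` exactly once, after every async task has completed
    // (successfully or not), mirroring counter.decrementAndGet() == 0 above.
    static void runAll(List<CompletableFuture<Void>> tasks, Runnable whenAllDone) {
        AtomicInteger counter = new AtomicInteger(tasks.size());
        for (CompletableFuture<Void> task : tasks) {
            task.whenComplete((v, e) -> {
                // failures are recorded per item elsewhere; completion still counts down
                if (counter.decrementAndGet() == 0) {
                    whenAllDone.run();
                }
            });
        }
    }
}
```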
Submitting the index bulk (BulkOperation#doRun inside TransportBulkAction):

```java
@Override
protected void doRun() throws Exception {
    final ClusterState clusterState = observer.setAndGetObservedState();
    if (handleBlockExceptions(clusterState)) {
        // e.g. the index is read-only, often because the disk is full; cleared via the
        // "index.blocks.read_only_allow_delete": "false" setting
        return;
    }
    final ConcreteIndices concreteIndices = new ConcreteIndices(clusterState, indexNameExpressionResolver);
    MetaData metaData = clusterState.metaData();
    for (int i = 0; i < bulkRequest.requests.size(); i++) {
        DocWriteRequest docWriteRequest = bulkRequest.requests.get(i);
        // the request can only be null because we set it to null in the previous step, so it gets ignored
        if (docWriteRequest == null) {
            continue;
        }
        if (addFailureIfIndexIsUnavailable(docWriteRequest, i, concreteIndices, metaData)) {
            // the index cannot accept writes, or has been closed
            continue;
        }
        Index concreteIndex = concreteIndices.resolveIfAbsent(docWriteRequest);
        try {
            switch (docWriteRequest.opType()) {
                case CREATE:
                case INDEX:
                    IndexRequest indexRequest = (IndexRequest) docWriteRequest;
                    final IndexMetaData indexMetaData = metaData.index(concreteIndex); // index metadata
                    MappingMetaData mappingMd = indexMetaData.mappingOrDefault(indexRequest.type());
                    Version indexCreated = indexMetaData.getCreationVersion();
                    indexRequest.resolveRouting(metaData);
                    // validates/derives routing and the document id; auto-generated ids use base64UUID()
                    // when the index was created on or after V_6_0_0_beta1, legacyBase64UUID() otherwise
                    indexRequest.process(indexCreated, mappingMd, concreteIndex.getName());
                    break;
                case UPDATE:
                    // resolve the routing from the id (aliases that carry routing are honoured too)
                    TransportUpdateAction.resolveAndValidateRouting(metaData, concreteIndex.getName(),
                            (UpdateRequest) docWriteRequest);
                    break;
                case DELETE:
                    docWriteRequest.routing(metaData.resolveIndexRouting(docWriteRequest.parent(),
                            docWriteRequest.routing(), docWriteRequest.index()));
                    // check if routing is required, if so, throw error if routing wasn't specified
                    if (docWriteRequest.routing() == null
                            && metaData.routingRequired(concreteIndex.getName(), docWriteRequest.type())) {
                        throw new RoutingMissingException(concreteIndex.getName(), docWriteRequest.type(),
                                docWriteRequest.id());
                    }
                    break;
                default:
                    throw new AssertionError("request type not supported: [" + docWriteRequest.opType() + "]");
            }
        } catch (ElasticsearchParseException | IllegalArgumentException | RoutingMissingException e) {
            // record the failure on the item instead of throwing
            BulkItemResponse.Failure failure = new BulkItemResponse.Failure(concreteIndex.getName(),
                    docWriteRequest.type(), docWriteRequest.id(), e);
            BulkItemResponse bulkItemResponse = new BulkItemResponse(i, docWriteRequest.opType(), failure);
            responses.set(i, bulkItemResponse);
            // make sure the request gets never processed again
            bulkRequest.requests.set(i, null);
        }
    }

    // first, go over all the requests and create a ShardId -> Operations mapping
    Map<ShardId, List<BulkItemRequest>> requestsByShard = new HashMap<>();
    for (int i = 0; i < bulkRequest.requests.size(); i++) {
        DocWriteRequest request = bulkRequest.requests.get(i);
        if (request == null) {
            continue;
        }
        String concreteIndex = concreteIndices.getConcreteIndex(request.index()).getName();
        // resolve the target shard from the routing
        ShardId shardId = clusterService.operationRouting().indexShards(clusterState, concreteIndex,
                request.id(), request.routing()).shardId();
        List<BulkItemRequest> shardRequests = requestsByShard.computeIfAbsent(shardId, shard -> new ArrayList<>());
        shardRequests.add(new BulkItemRequest(i, request));
    }

    if (requestsByShard.isEmpty()) {
        listener.onResponse(new BulkResponse(responses.toArray(new BulkItemResponse[responses.length()]),
                buildTookInMillis(startTimeNanos)));
        return;
    }

    final AtomicInteger counter = new AtomicInteger(requestsByShard.size());
    String nodeId = clusterService.localNode().getId();
    for (Map.Entry<ShardId, List<BulkItemRequest>> entry : requestsByShard.entrySet()) {
        final ShardId shardId = entry.getKey();
        final List<BulkItemRequest> requests = entry.getValue();
        // wrap each per-shard group in its own BulkShardRequest
        BulkShardRequest bulkShardRequest = new BulkShardRequest(shardId, bulkRequest.getRefreshPolicy(),
                requests.toArray(new BulkItemRequest[requests.size()]));
        bulkShardRequest.waitForActiveShards(bulkRequest.waitForActiveShards()); // required number of active shard copies
        bulkShardRequest.timeout(bulkRequest.timeout()); // timeout
        if (task != null) {
            bulkShardRequest.setParentTask(nodeId, task.getId());
        }
        shardBulkAction.execute(bulkShardRequest, new ActionListener<BulkShardResponse>() {
            @Override
            public void onResponse(BulkShardResponse bulkShardResponse) {
                for (BulkItemResponse bulkItemResponse : bulkShardResponse.getResponses()) {
                    // we may have no response if item failed
                    if (bulkItemResponse.getResponse() != null) {
                        bulkItemResponse.getResponse().setShardInfo(bulkShardResponse.getShardInfo());
                    }
                    responses.set(bulkItemResponse.getItemId(), bulkItemResponse);
                }
                if (counter.decrementAndGet() == 0) { // every shard-level bulk has come back
                    finishHim();
                }
            }

            @Override
            public void onFailure(Exception e) {
                // create failures for all relevant requests
                for (BulkItemRequest request : requests) {
                    final String indexName = concreteIndices.getConcreteIndex(request.index()).getName();
                    DocWriteRequest docWriteRequest = request.request();
                    responses.set(request.id(), new BulkItemResponse(request.id(), docWriteRequest.opType(),
                            new BulkItemResponse.Failure(indexName, docWriteRequest.type(), docWriteRequest.id(), e)));
                }
                if (counter.decrementAndGet() == 0) {
                    finishHim();
                }
            }

            private void finishHim() {
                listener.onResponse(new BulkResponse(responses.toArray(new BulkItemResponse[responses.length()]),
                        buildTookInMillis(startTimeNanos)));
            }
        });
    }
}
```
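The grouping step above boils down to "compute a shard id from the routing value, then bucket items per shard". A self-contained sketch of that idea, with the caveat that real ES hashes the routing value with Murmur3 against the index's number_of_primary_shards, not String.hashCode:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ShardGroupingExample {
    static int shardId(String routing, int numPrimaryShards) {
        // ES uses Murmur3(routing) % number_of_primary_shards; hashCode is a stand-in here.
        return Math.floorMod(routing.hashCode(), numPrimaryShards);
    }

    public static void main(String[] args) {
        int numPrimaryShards = 5;
        List<String> docIds = List.of("1", "2", "3", "42"); // routing defaults to the doc id
        Map<Integer, List<String>> byShard = new HashMap<>();
        for (String id : docIds) {
            byShard.computeIfAbsent(shardId(id, numPrimaryShards), s -> new ArrayList<>()).add(id);
        }
        // each entry would become one BulkShardRequest
        byShard.forEach((shard, ids) -> System.out.println("shard " + shard + " -> " + ids));
    }
}
```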
After the shard-level bulk message is received, it is handled in TransportReplicationAction#doRun:

```java
@Override
protected void doRun() {
    setPhase(task, "routing");
    final ClusterState state = observer.setAndGetObservedState();
    if (handleBlockExceptions(state)) {
        return;
    }
    // request does not have a shardId yet, we need to pass the concrete index to resolve shardId
    final String concreteIndex = concreteIndex(state);
    final IndexMetaData indexMetaData = state.metaData().index(concreteIndex);
    if (indexMetaData == null) {
        retry(new IndexNotFoundException(concreteIndex));
        return;
    }
    if (indexMetaData.getState() == IndexMetaData.State.CLOSE) {
        throw new IndexClosedException(indexMetaData.getIndex());
    }
    // resolve all derived request fields, so we can route and apply it
    resolveRequest(indexMetaData, request);
    assert request.shardId() != null : "request shardId must be set in resolveRequest";
    assert request.waitForActiveShards() != ActiveShardCount.DEFAULT : "request waitForActiveShards must be set in resolveRequest";

    final ShardRouting primary = primary(state);
    if (retryIfUnavailable(state, primary)) {
        return;
    }
    final DiscoveryNode node = state.nodes().get(primary.currentNodeId());
    if (primary.currentNodeId().equals(state.nodes().getLocalNodeId())) {
        // the primary shard lives on this node: write locally; otherwise forward the request
        performLocalAction(state, primary, node, indexMetaData);
    } else {
        performRemoteAction(state, primary, node);
    }
}
```
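Note that when the routing phase cannot proceed (index not found yet, primary unassigned), it retries on the next cluster-state change via retry(...) instead of failing outright. A toy, self-contained sketch of that wait-for-new-state loop; the tiny map-based "cluster state" here is invented to keep the example runnable and is not the ES ClusterStateObserver API:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class RetryOnStateChangeExample {
    // Toy "cluster state": index name -> node currently holding its primary (absent = unassigned).
    private final Map<String, String> primaryByIndex = new ConcurrentHashMap<>();
    private final Object stateChanged = new Object();

    // Called by a hypothetical master-update thread when routing changes.
    public void publish(String index, String node) {
        primaryByIndex.put(index, node);
        synchronized (stateChanged) { stateChanged.notifyAll(); }
    }

    // Mirrors the routing phase: resolve the primary, and if it is not assigned yet,
    // block until the next state change instead of failing the request immediately.
    public String resolvePrimary(String index, long timeoutMillis) throws InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMillis;
        synchronized (stateChanged) {
            String node;
            while ((node = primaryByIndex.get(index)) == null) {
                long remaining = deadline - System.currentTimeMillis();
                if (remaining <= 0) throw new IllegalStateException("timed out waiting for primary of " + index);
                stateChanged.wait(remaining);
            }
            return node;
        }
    }
}
```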
Everything up to this point is the coordinating-node flow.

```java
private void performLocalAction(ClusterState state, ShardRouting primary, DiscoveryNode node, IndexMetaData indexMetaData) {
    setPhase(task, "waiting_on_primary");
    if (logger.isTraceEnabled()) {
        logger.trace("send action [{}] to local primary [{}] for request [{}] with cluster state version [{}] to [{}] ",
                transportPrimaryAction, request.shardId(), request, state.version(), primary.currentNodeId());
    }
    performAction(node, transportPrimaryAction, true,
            new ConcreteShardRequest<>(request, primary.allocationId().getId(), indexMetaData.primaryTerm(primary.id())));
}
```

On the node that owns the primary, once the shard operation lock is acquired, onResponse(PrimaryShardReference) runs; it either delegates to the relocation target or kicks off the replicated write:

```java
@Override
public void onResponse(PrimaryShardReference primaryShardReference) {
    try {
        if (primaryShardReference.isRelocated()) {
            // the primary has already been handed off: forward to the relocated copy
            primaryShardReference.close(); // release shard operation lock as soon as possible
            setPhase(replicationTask, "primary_delegation");
            // delegate primary phase to relocation target
            // it is safe to execute primary phase on relocation target as there are no more in-flight operations where primary
            // phase is executed on local shard and all subsequent operations are executed on relocation target as primary phase.
            final ShardRouting primary = primaryShardReference.routingEntry();
            assert primary.relocating() : "indexShard is marked as relocated but routing isn't" + primary;
            DiscoveryNode relocatingNode = clusterService.state().nodes().get(primary.relocatingNodeId());
            transportService.sendRequest(relocatingNode, transportPrimaryAction,
                new ConcreteShardRequest<>(request, primary.allocationId().getRelocationId(), primaryTerm),
                transportOptions,
                new TransportChannelResponseHandler<Response>(logger, channel,
                        "rerouting indexing to target primary " + primary,
                        TransportReplicationAction.this::newResponseInstance) {

                    @Override
                    public void handleResponse(Response response) {
                        setPhase(replicationTask, "finished");
                        super.handleResponse(response);
                    }

                    @Override
                    public void handleException(TransportException exp) {
                        setPhase(replicationTask, "finished");
                        super.handleException(exp);
                    }
                });
        } else {
            setPhase(replicationTask, "primary");
            final ActionListener<Response> listener = createResponseListener(primaryShardReference);
            createReplicatedOperation(request,
                    ActionListener.wrap(result -> result.respond(listener), listener::onFailure),
                    primaryShardReference)
                .execute();
        }
    } catch (Exception e) {
        Releasables.closeWhileHandlingException(primaryShardReference); // release shard operation lock before responding to caller
        onFailure(e);
    }
}
```
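The delegation branch above encodes one rule: after a primary relocation hand-off completes, new operations must run on the relocation target, addressed by the relocation allocation id. A toy decision sketch; the Shard class and all ids are invented for illustration:

```java
public class PrimaryDelegationExample {
    static final class Shard {
        final String nodeId, allocationId, targetNodeId, relocationId;
        final boolean handedOff;
        Shard(String nodeId, String allocationId, String targetNodeId, String relocationId, boolean handedOff) {
            this.nodeId = nodeId; this.allocationId = allocationId;
            this.targetNodeId = targetNodeId; this.relocationId = relocationId; this.handedOff = handedOff;
        }
    }

    // Which node, under which allocation id, should execute the primary phase?
    static String targetFor(Shard primary) {
        if (primary.handedOff) {
            // isRelocated(): in-flight ops drained, the relocation target owns the primary now
            return primary.targetNodeId + " as " + primary.relocationId;
        }
        return primary.nodeId + " as " + primary.allocationId;
    }

    public static void main(String[] args) {
        System.out.println(targetFor(new Shard("node-1", "alloc-1", "node-2", "alloc-2", true))); // node-2 as alloc-2
    }
}
```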
The write handling proper lives in ReplicationOperation#execute:

```java
public void execute() throws Exception {
    final String activeShardCountFailure = checkActiveShardCount(); // are enough shard copies active?
    final ShardRouting primaryRouting = primary.routingEntry();
    final ShardId primaryId = primaryRouting.shardId();
    if (activeShardCountFailure != null) {
        finishAsFailed(new UnavailableShardsException(primaryId,
            "{} Timeout: [{}], request: [{}]", activeShardCountFailure, request.timeout(), request));
        return;
    }

    totalShards.incrementAndGet();
    pendingActions.incrementAndGet(); // increase by 1 until we finish all primary coordination
    primaryResult = primary.perform(request); // execute the write on the primary shard
    // update the local checkpoint; every shard copy maintains a local checkpoint,
    // and the primary additionally maintains the global one
    primary.updateLocalCheckpointForShard(primaryRouting.allocationId().getId(), primary.localCheckpoint());
    final ReplicaRequest replicaRequest = primaryResult.replicaRequest();
    if (replicaRequest != null) {
        if (logger.isTraceEnabled()) {
            logger.trace("[{}] op [{}] completed on primary for request [{}]", primaryId, opType, request);
        }
        // we have to get the replication group after successfully indexing into the primary in order to honour recovery semantics.
        // we have to make sure that every operation indexed into the primary after recovery start will also be replicated
        // to the recovery target. If we used an old replication group, we may miss a recovery that has started since then.
        // we also have to make sure to get the global checkpoint before the replication group, to ensure that the global checkpoint
        // is valid for this replication group. If we would sample in the reverse, the global checkpoint might be based on a subset
        // of the sampled replication group, and advanced further than what the given replication group would allow it to.
        // This would entail that some shards could learn about a global checkpoint that would be higher than its local checkpoint.
        final long globalCheckpoint = primary.globalCheckpoint(); // sample the global checkpoint
        final ReplicationGroup replicationGroup = primary.getReplicationGroup(); // the replica copies
        // mark unreachable copies as stale
        markUnavailableShardsAsStale(replicaRequest, replicationGroup.getInSyncAllocationIds(), replicationGroup.getRoutingTable());
        performOnReplicas(replicaRequest, globalCheckpoint, replicationGroup.getRoutingTable()); // replicas execute the write
    }

    successfulShards.incrementAndGet(); // mark primary as successful
    decPendingAndFinishIfNeeded(); // every dispatch bumped pendingActions; when it drains to zero we finish
}
```
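Local and global checkpoints are the sequence-number bookkeeping this method updates. Roughly: each copy tracks the highest sequence number up to which all operations are processed (its local checkpoint), and the primary derives the global checkpoint as the minimum local checkpoint across in-sync copies. A simplified sketch of that derivation, not the real ES sequence-number service:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class CheckpointExample {
    // allocation id -> local checkpoint reported by that in-sync copy
    private final Map<String, Long> localCheckpoints = new ConcurrentHashMap<>();

    public void updateLocalCheckpoint(String allocationId, long checkpoint) {
        // checkpoints only move forward
        localCheckpoints.merge(allocationId, checkpoint, Math::max);
    }

    // The global checkpoint: everything at or below it is present on every in-sync copy,
    // so a returning replica only needs the operations above it (delta recovery).
    public long globalCheckpoint() {
        return localCheckpoints.values().stream().mapToLong(Long::longValue).min().orElse(-1L);
    }

    public static void main(String[] args) {
        CheckpointExample cp = new CheckpointExample();
        cp.updateLocalCheckpoint("primary", 100L);
        cp.updateLocalCheckpoint("replica-a", 98L);
        cp.updateLocalCheckpoint("replica-b", 95L);
        System.out.println(cp.globalCheckpoint()); // 95
    }
}
```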
Writing on the primary shard happens in TransportShardBulkAction:

```java
public static WritePrimaryResult<BulkShardRequest, BulkShardResponse> performOnPrimary(
        BulkShardRequest request,
        IndexShard primary,
        UpdateHelper updateHelper,
        LongSupplier nowInMillisSupplier,
        MappingUpdatePerformer mappingUpdater) throws Exception {
    final IndexMetaData metaData = primary.indexSettings().getIndexMetaData();
    Translog.Location location = null;
    for (int requestIndex = 0; requestIndex < request.items().length; requestIndex++) {
        // only execute items that were not aborted, i.e. whose primary response is still null
        if (isAborted(request.items()[requestIndex].getPrimaryResponse()) == false) {
            location = executeBulkItemRequest(metaData, primary, request, location, requestIndex,
                    updateHelper, nowInMillisSupplier, mappingUpdater);
        }
    }
    BulkItemResponse[] responses = new BulkItemResponse[request.items().length];
    BulkItemRequest[] items = request.items();
    for (int i = 0; i < items.length; i++) {
        responses[i] = items[i].getPrimaryResponse();
    }
    BulkShardResponse response = new BulkShardResponse(request.shardId(), responses);
    return new WritePrimaryResult<>(request, response, location, null, primary, logger); // wrap the result
}

/** Executes bulk item requests and handles request execution exceptions */
static Translog.Location executeBulkItemRequest(IndexMetaData metaData, IndexShard primary,
        BulkShardRequest request, Translog.Location location, int requestIndex, UpdateHelper updateHelper,
        LongSupplier nowInMillisSupplier, final MappingUpdatePerformer mappingUpdater) throws Exception {
    final DocWriteRequest itemRequest = request.items()[requestIndex].request();
    final DocWriteRequest.OpType opType = itemRequest.opType();
    final BulkItemResultHolder responseHolder;
    switch (itemRequest.opType()) {
        case CREATE:
        case INDEX:
            // the primary writes first
            responseHolder = executeIndexRequest((IndexRequest) itemRequest,
                    request.items()[requestIndex], primary, mappingUpdater);
            break;
        case UPDATE:
            responseHolder = executeUpdateRequest((UpdateRequest) itemRequest, primary, metaData, request,
                    requestIndex, updateHelper, nowInMillisSupplier, mappingUpdater);
            break;
        case DELETE:
            responseHolder = executeDeleteRequest((DeleteRequest) itemRequest, request.items()[requestIndex],
                    primary, mappingUpdater);
            break;
        default:
            throw new IllegalStateException("unexpected opType [" + itemRequest.opType() + "] found");
    }

    final BulkItemRequest replicaRequest = responseHolder.replicaRequest;

    // update the bulk item request because update request execution can mutate the bulk item request
    request.items()[requestIndex] = replicaRequest;

    // Retrieve the primary response, and update the replica request with the primary's response
    BulkItemResponse primaryResponse = createPrimaryResponse(responseHolder, opType, request);
    if (primaryResponse != null) {
        replicaRequest.setPrimaryResponse(primaryResponse);
    }

    // Update the translog with the new location, if needed
    return calculateTranslogLocation(location, responseHolder);
}
```
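Each successful item advances a running translog location, and the bulk keeps only the furthest one, so that after the whole batch the shard can sync the translog once up to that point. A minimal sketch of that "keep the max location" reduction, with a plain long offset standing in for the real Translog.Location (which is a generation/offset/size triple):

```java
import java.util.Arrays;

public class TranslogLocationExample {
    // Simplified stand-in for the reduction performed by calculateTranslogLocation.
    static long maxLocation(long current, long candidate) {
        return Math.max(current, candidate);
    }

    public static void main(String[] args) {
        long location = -1L; // no writes yet
        for (long opLocation : Arrays.asList(120L, 340L, 260L)) { // per-item translog positions
            location = maxLocation(location, opLocation);
        }
        // one sync covers every item in the bulk
        System.out.println("sync translog up to offset " + location); // 340
    }
}
```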
From there the call chain is TransportShardBulkAction.executeIndexRequest() -> TransportShardBulkAction.executeIndexRequestOnPrimary() -> IndexShard.applyIndexOperationOnPrimary() -> IndexShard.applyIndexOperation() -> IndexShard.index(). The Lucene write itself lives in InternalEngine:

```java
@Override
public IndexResult index(Index index) throws IOException {
    assert Objects.equals(index.uid().field(), uidField) : index.uid().field();
    final boolean doThrottle = index.origin().isRecovery() == false;
    try (ReleasableLock releasableLock = readLock.acquire()) {
        ensureOpen();
        assert assertIncomingSequenceNumber(index.origin(), index.seqNo());
        assert assertVersionType(index);
        try (Releasable ignored = acquireLock(index.uid()); // per-uid lock
             Releasable indexThrottle = doThrottle ? () -> {} : throttle.acquireThrottle()) {
            lastWriteNanos = index.startTime();
            final IndexingStrategy plan;
            if (index.origin() == Operation.Origin.PRIMARY) {
                plan = planIndexingAsPrimary(index);
            } else {
                // non-primary mode (i.e., replica or recovery)
                plan = planIndexingAsNonPrimary(index);
            }

            final IndexResult indexResult;
            if (plan.earlyResultOnPreFlightError.isPresent()) {
                indexResult = plan.earlyResultOnPreFlightError.get();
                assert indexResult.hasFailure();
            } else if (plan.indexIntoLucene) {
                indexResult = indexIntoLucene(index, plan); // write to Lucene
            } else {
                indexResult = new IndexResult(plan.versionForIndexing, plan.seqNoForIndexing, plan.currentNotFoundOrDeleted);
            }
            // Lucene is written before the translog: if the translog came first, a write that
            // then failed Lucene's checks would force a translog rollback
            if (index.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
                final Translog.Location location;
                if (indexResult.hasFailure() == false) {
                    location = translog.add(new Translog.Index(index, indexResult));
                } else if (indexResult.getSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO) {
                    // if we have document failure, record it as a no-op in the translog with the generated seq_no
                    location = translog.add(new Translog.NoOp(indexResult.getSeqNo(), index.primaryTerm(),
                            indexResult.getFailure().getMessage()));
                } else {
                    location = null;
                }
                indexResult.setTranslogLocation(location);
            }
            if (indexResult.getSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO) {
                seqNoService().markSeqNoAsCompleted(indexResult.getSeqNo()); // mark the seq no done, advancing the local checkpoint
            }
            indexResult.setTook(System.nanoTime() - index.startTime());
            indexResult.freeze();
            return indexResult;
        }
    } catch (RuntimeException | IOException e) {
        try {
            // if the failure came from Lucene itself the engine is failed outright; otherwise
            // the uid lock is released and the earlier Lucene write has to be rolled back
            maybeFailEngine("index", e);
        } catch (Exception inner) {
            e.addSuppressed(inner);
        }
        throw e;
    }
}
```
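The ordering comment deserves emphasis: the document goes into Lucene first, and only a successful (or seq-no-carrying) result is appended to the translog, so a rejected document never has to be rolled back out of the log. A schematic of that ordering with toy Lucene/translog stand-ins:

```java
import java.util.ArrayList;
import java.util.List;

public class WriteOrderingExample {
    static final List<String> lucene = new ArrayList<>();
    static final List<String> translog = new ArrayList<>();

    // Lucene first: its mapping/field checks act as validation. Only after the doc is
    // accepted do we append to the translog, so the log never holds an op we must undo.
    static boolean indexDoc(String doc) {
        if (doc.isEmpty()) {
            return false; // "document failure": nothing was logged, nothing to roll back
        }
        lucene.add(doc);
        translog.add("index " + doc);
        return true;
    }

    public static void main(String[] args) {
        indexDoc("{\"field\":\"value\"}");
        indexDoc(""); // rejected before any translog entry exists
        System.out.println("lucene=" + lucene.size() + " translog=" + translog.size()); // 1 and 1
    }
}
```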
The Lucene write stamps each document with its seq_no, version, and primary_term fields:

```java
private IndexResult indexIntoLucene(Index index, IndexingStrategy plan) throws IOException {
    assert assertSequenceNumberBeforeIndexing(index.origin(), plan.seqNoForIndexing);
    assert plan.versionForIndexing >= 0 : "version must be set. got " + plan.versionForIndexing;
    assert plan.indexIntoLucene;
    /* Update the document's sequence number and primary term; the sequence number here is derived here from either the sequence
     * number service if this is on the primary, or the existing document's sequence number if this is on the replica. The
     * primary term here has already been set, see IndexShard#prepareIndex where the Engine$Index operation is created.
     */
    index.parsedDoc().updateSeqID(plan.seqNoForIndexing, index.primaryTerm());
    index.parsedDoc().version().setLongValue(plan.versionForIndexing);
    try {
        if (plan.useLuceneUpdateDocument) {
            update(index.uid(), index.docs(), indexWriter);
        } else {
            // document does not exists, we can optimize for create, but double check if assertions are running
            assert assertDocDoesNotExist(index, canOptimizeAddDocument(index) == false);
            index(index.docs(), indexWriter);
        }
        versionMap.putUnderLock(index.uid().bytes(),
                new VersionValue(plan.versionForIndexing, plan.seqNoForIndexing, index.primaryTerm()));
        return new IndexResult(plan.versionForIndexing, plan.seqNoForIndexing, plan.currentNotFoundOrDeleted);
    } catch (Exception ex) {
        if (indexWriter.getTragicException() == null) {
            /* There is no tragic event recorded so this must be a document failure.
             *
             * The handling inside IW doesn't guarantee that an tragic / aborting exception
             * will be used as THE tragicEventException since if there are multiple exceptions causing an abort in IW
             * only one wins. Yet, only the one that wins will also close the IW and in turn fail the engine such that
             * we can potentially handle the exception before the engine is failed.
             * Bottom line is that we can only rely on the fact that if it's a document failure then
             * `indexWriter.getTragicException()` will be null otherwise we have to rethrow and treat it as fatal or rather
             * non-document failure
             *
             * we return a `MATCH_ANY` version to indicate no document was index. The value is
             * not used anyway
             */
            return new IndexResult(ex, Versions.MATCH_ANY, plan.seqNoForIndexing);
        } else {
            throw ex;
        }
    }
}
```

Write-failure handling closes the engine and rolls back everything that was not committed:

```java
@Override
protected final void closeNoLock(String reason, CountDownLatch closedLatch) {
    if (isClosed.compareAndSet(false, true)) {
        assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread()
                : "Either the write lock must be held or the engine must be currently be failing itself";
        try {
            this.versionMap.clear();
            try {
                IOUtils.close(searcherManager); // close the SearcherManager
            } catch (Exception e) {
                logger.warn("Failed to close SearcherManager", e);
            }
            try {
                IOUtils.close(translog); // close the translog
            } catch (Exception e) {
                logger.warn("Failed to close translog", e);
            }
            // no need to commit in this case!, we snapshot before we close the shard, so translog and all sync'ed
            logger.trace("rollback indexWriter");
            try {
                indexWriter.rollback(); // roll back the IndexWriter
            } catch (AlreadyClosedException ex) {
                failOnTragicEvent(ex);
                throw ex;
            }
            logger.trace("rollback indexWriter done");
        } catch (Exception e) {
            logger.warn("failed to rollback writer on close", e);
        } finally {
            try {
                store.decRef();
                logger.debug("engine closed [{}]", reason);
            } finally {
                closedLatch.countDown();
            }
        }
    }
}
```
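Before moving on to the replica side, one note on indexIntoLucene above: versionMap.putUnderLock maintains an in-memory live-version map, i.e. the latest version/seq-no per document uid, which the engine consults for realtime GETs and version-conflict checks without touching Lucene. A simplified stand-in, not the real LiveVersionMap:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class VersionMapExample {
    static final class VersionValue {
        final long version, seqNo, term;
        VersionValue(long version, long seqNo, long term) {
            this.version = version; this.seqNo = seqNo; this.term = term;
        }
    }

    // uid -> most recent version info, updated under the same per-uid lock as the Lucene write
    private final Map<String, VersionValue> liveVersions = new ConcurrentHashMap<>();

    void put(String uid, long version, long seqNo, long term) {
        liveVersions.put(uid, new VersionValue(version, seqNo, term));
    }

    // simplified external-versioning conflict check: stale versions are rejected
    boolean wouldConflict(String uid, long externalVersion) {
        VersionValue current = liveVersions.get(uid);
        return current != null && externalVersion <= current.version;
    }
}
```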
The replica-shard flow is largely the same as the primary's:

```java
private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint,
                               final IndexShardRoutingTable indexShardRoutingTable) {
    final String localNodeId = primary.routingEntry().currentNodeId();
    // If the index gets deleted after primary operation, we skip replication
    for (final ShardRouting shard : indexShardRoutingTable) {
        if (shard.unassigned()) {
            assert shard.primary() == false : "primary shard should not be unassigned in a replication group: " + shard;
            totalShards.incrementAndGet();
            continue;
        }

        if (shard.currentNodeId().equals(localNodeId) == false) {
            // the primary on the local node has already executed; don't write it again.
            // each replica write also carries the global checkpoint, which advances as operations
            // complete; a copy that drops off and rejoins only needs the delta between its
            // checkpoint and the primary's to recover
            performOnReplica(shard, replicaRequest, globalCheckpoint);
        }

        if (shard.relocating() && shard.relocatingNodeId().equals(localNodeId) == false) {
            // the shard is relocating: also send to the relocation target
            performOnReplica(shard.getTargetRelocatingShard(), replicaRequest, globalCheckpoint);
        }
    }
}

private void performOnReplica(final ShardRouting shard, final ReplicaRequest replicaRequest, final long globalCheckpoint) {
    if (logger.isTraceEnabled()) {
        logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest);
    }

    totalShards.incrementAndGet();
    pendingActions.incrementAndGet();
    // execution on the replica mirrors the primary
    replicasProxy.performOn(shard, replicaRequest, globalCheckpoint, new ActionListener<ReplicaResponse>() {
        @Override
        public void onResponse(ReplicaResponse response) { // the replica write succeeded
            successfulShards.incrementAndGet();
            try {
                // the primary records this copy's local checkpoint...
                primary.updateLocalCheckpointForShard(shard.allocationId().getId(), response.localCheckpoint());
                // ...and the global checkpoint the copy has learned about
                primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), response.globalCheckpoint());
            } catch (final AlreadyClosedException e) {
                // okay, the index was deleted or this shard was never activated after a relocation; fall through and finish normally
            } catch (final Exception e) {
                // fail the primary but fall through and let the rest of operation processing complete
                final String message = String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard);
                primary.failShard(message, e);
            }
            decPendingAndFinishIfNeeded(); // once nothing is pending, finish the operation
        }

        @Override
        public void onFailure(Exception replicaException) { // report the failure back to the primary
            logger.trace(
                (org.apache.logging.log4j.util.Supplier<?>) () -> new ParameterizedMessage(
                    "[{}] failure while performing [{}] on replica {}, request [{}]",
                    shard.shardId(), opType, shard, replicaRequest), replicaException);
            if (TransportActions.isShardNotAvailableException(replicaException)) {
                decPendingAndFinishIfNeeded();
            } else {
                // otherwise record the per-shard failure and ask for the shard to be failed if needed
                RestStatus restStatus = ExceptionsHelper.status(replicaException);
                shardReplicaFailures.add(new ReplicationResponse.ShardInfo.Failure(
                    shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
                String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
                replicasProxy.failShardIfNeeded(shard, message, replicaException,
                    ReplicationOperation.this::decPendingAndFinishIfNeeded,
                    ReplicationOperation.this::onPrimaryDemoted,
                    throwable -> decPendingAndFinishIfNeeded());
            }
        }
    });
}
```
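To close the loop on the checkpoint comments above: because everything at or below the global checkpoint is known to exist on every in-sync copy, a returning replica can recover by replaying only the operations above its checkpoint instead of copying whole segment files. A toy illustration of that delta selection; the sequence numbers are invented:

```java
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

public class DeltaRecoveryExample {
    public static void main(String[] args) {
        long primaryMaxSeqNo = 120L;       // newest operation on the primary
        long replicaLocalCheckpoint = 95L; // everything <= 95 is already on the replica
        // ops-based recovery: ship only (checkpoint, maxSeqNo] instead of full files
        List<Long> opsToReplay = LongStream.rangeClosed(replicaLocalCheckpoint + 1, primaryMaxSeqNo)
                .boxed().collect(Collectors.toList());
        System.out.println("replay " + opsToReplay.size() + " ops, seq "
                + (replicaLocalCheckpoint + 1) + ".." + primaryMaxSeqNo);
    }
}
```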