Eureka服务端注册、心跳、下线源码分析

一、服务注册

  eureka的服务注册是通过http请求进行的,使用的是jersey框架,也是一种mvc架构,服务注册的控制层是ApplicationResource,访问的接口是addInstance

/**
     * Handles an instance registration request.
     *
     * <p>Validates the mandatory fields of the payload, applies the data-center-id
     * fallback rules, then hands the instance to the registry.
     *
     * @param info          the instance being registered
     * @param isReplication value of the replication header; only the exact string
     *                      {@code "true"} is treated as a replicated request
     * @return a 400 response carrying a message when validation fails, otherwise 204
     */
    @POST
    @Consumes({"application/json", "application/xml"})
    public Response addInstance(InstanceInfo info,
                                @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication) {
        logger.debug("Registering instance {} (replication={})", info.getId(), isReplication);

        // Mandatory-field validation: the first failing check decides the response.
        if (isBlank(info.getId())) {
            return Response.status(400).entity("Missing instanceId").build();
        }
        if (isBlank(info.getHostName())) {
            return Response.status(400).entity("Missing hostname").build();
        }
        if (isBlank(info.getIPAddr())) {
            return Response.status(400).entity("Missing ip address").build();
        }
        if (isBlank(info.getAppName())) {
            return Response.status(400).entity("Missing appName").build();
        }
        if (!appName.equals(info.getAppName())) {
            return Response.status(400).entity("Mismatched appName, expecting " + appName + " but was " + info.getAppName()).build();
        }
        if (info.getDataCenterInfo() == null) {
            return Response.status(400).entity("Missing dataCenterInfo").build();
        }
        if (info.getDataCenterInfo().getName() == null) {
            return Response.status(400).entity("Missing dataCenterInfo Name").build();
        }

        // Data-center infos that expose an id must have a non-blank one; otherwise
        // reject, patch, or merely warn depending on configuration and concrete type.
        DataCenterInfo dataCenterInfo = info.getDataCenterInfo();
        if (dataCenterInfo instanceof UniqueIdentifier
                && isBlank(((UniqueIdentifier) dataCenterInfo).getId())) {
            if ("true".equalsIgnoreCase(serverConfig.getExperimental("registration.validation.dataCenterInfoId"))) {
                String entity = "DataCenterInfo of type " + dataCenterInfo.getClass() + " must contain a valid id";
                return Response.status(400).entity(entity).build();
            }
            if (dataCenterInfo instanceof AmazonInfo) {
                AmazonInfo amazonInfo = (AmazonInfo) dataCenterInfo;
                // Fall back to the instance id when the Amazon metadata carries none.
                if (amazonInfo.get(AmazonInfo.MetaDataKey.instanceId) == null) {
                    amazonInfo.getMetadata().put(AmazonInfo.MetaDataKey.instanceId.getName(), info.getId());
                }
            } else {
                logger.warn("Registering DataCenterInfo of type {} without an appropriate id", dataCenterInfo.getClass());
            }
        }

        // Perform the registration.
        registry.register(info, "true".equals(isReplication));
        // Success: 204 No Content.
        return Response.status(204).build();
    }

调用InstanceRegistry的注册方法

    /**
     * Registers an instance: resolves its lease duration, publishes the registration
     * event via {@code handleRegistration}, then delegates to the superclass.
     *
     * @param info          the instance being registered
     * @param isReplication whether the request is a replication from a peer node
     */
    public void register(final InstanceInfo info, final boolean isReplication) {
        final int leaseDuration = resolveInstanceLeaseDuration(info);
        handleRegistration(info, leaseDuration, isReplication);
        super.register(info, isReplication);
    }

发布注册事件

    /**
     * Logs the registration and publishes an {@code EurekaInstanceRegisteredEvent}.
     *
     * @param info          the instance being registered
     * @param leaseDuration lease duration passed on to the event
     * @param isReplication whether the request is a replication from a peer node
     */
    private void handleRegistration(InstanceInfo info, int leaseDuration,
            boolean isReplication) {
        final String message = "register " + info.getAppName() + ", vip " + info.getVIPAddress()
                + ", leaseDuration " + leaseDuration + ", isReplication "
                + isReplication;
        log(message);

        // Publish the registration event.
        final EurekaInstanceRegisteredEvent registeredEvent =
                new EurekaInstanceRegisteredEvent(this, info, leaseDuration, isReplication);
        publishEvent(registeredEvent);
    }

注册实例

//PeerAwareInstanceRegistryImpl
    /**
     * Registers the instance locally and then replicates the registration to peers.
     *
     * <p>The lease duration comes from the instance's own lease info when that is
     * present and positive; otherwise {@code Lease.DEFAULT_DURATION_IN_SECS} is used.
     *
     * @param info          the instance being registered
     * @param isReplication whether the request is a replication from a peer node
     */
    public void register(final InstanceInfo info, final boolean isReplication) {
        final boolean hasCustomDuration =
                info.getLeaseInfo() != null && info.getLeaseInfo().getDurationInSecs() > 0;
        final int leaseDuration = hasCustomDuration
                ? info.getLeaseInfo().getDurationInSecs()
                : Lease.DEFAULT_DURATION_IN_SECS;

        super.register(info, leaseDuration, isReplication);
        // Synchronize the registration across the cluster.
        replicateToPeers(Action.Register, info.getAppName(), info.getId(), info, null, isReplication);
    }
    //AbstractInstanceRegistry
    /**
     * Stores a lease for the given instance in the local registry.
     *
     * <p>Runs under the read lock. If a lease with a holder already exists for the
     * same instance id and its dirty timestamp is newer than the incoming one, the
     * existing instance info is kept as the registrant. Otherwise, for a brand-new
     * registration, the expected-clients counter is incremented (only when it is
     * already positive) and the renew threshold is recomputed. The method then
     * reconciles the overridden status, records the change in the recent queues and
     * invalidates the response cache for the application.
     *
     * @param registrant    the instance being registered
     * @param leaseDuration lease duration handed to the new {@code Lease}
     * @param isReplication whether the request is a replication from a peer node
     */
    public void register(InstanceInfo registrant, int leaseDuration, boolean isReplication) {
        // Acquire the lock before entering the try block, so that the finally clause
        // never attempts to unlock a lock that was not successfully acquired.
        read.lock();
        try {
            // Look up the lease map of this application; it is null on first registration.
            Map<String, Lease<InstanceInfo>> gMap = registry.get(registrant.getAppName());
            REGISTER.increment(isReplication);
            if (gMap == null) {
                // Create a new map and publish it; keep whichever map won the race.
                final ConcurrentHashMap<String, Lease<InstanceInfo>> gNewMap = new ConcurrentHashMap<String, Lease<InstanceInfo>>();
                gMap = registry.putIfAbsent(registrant.getAppName(), gNewMap);
                if (gMap == null) {
                    gMap = gNewMap;
                }
            }
            // Look up an existing lease by instance id.
            Lease<InstanceInfo> existingLease = gMap.get(registrant.getId());
            // A lease already exists for this id: resolve the conflict by dirty timestamp.
            if (existingLease != null && (existingLease.getHolder() != null)) {
                Long existingLastDirtyTimestamp = existingLease.getHolder().getLastDirtyTimestamp();
                Long registrationLastDirtyTimestamp = registrant.getLastDirtyTimestamp();
                logger.debug("Existing lease found (existing={}, provided={}", existingLastDirtyTimestamp, registrationLastDirtyTimestamp);

                // Keep the instance info that carries the newer dirty timestamp.
                if (existingLastDirtyTimestamp > registrationLastDirtyTimestamp) {
                    logger.warn("There is an existing lease and the existing lease's dirty timestamp {} is greater" +
                            " than the one that is being registered {}", existingLastDirtyTimestamp, registrationLastDirtyTimestamp);
                    logger.warn("Using the existing instanceInfo instead of the new instanceInfo as the registrant");
                    registrant = existingLease.getHolder();
                }
            } else {
                // No previous lease (or one without a holder): this is a new registration.
                synchronized (lock) {
                    if (this.expectedNumberOfClientsSendingRenews > 0) {
                        // One more client is expected to send renews.
                        this.expectedNumberOfClientsSendingRenews = this.expectedNumberOfClientsSendingRenews + 1;
                        updateRenewsPerMinThreshold();
                    }
                }
                logger.debug("No previous lease information found; it is new registration");
            }
            // Create the lease for the registrant with the requested duration.
            Lease<InstanceInfo> lease = new Lease<InstanceInfo>(registrant, leaseDuration);
            if (existingLease != null) {
                // Carry over the service-up timestamp of the lease being replaced.
                lease.setServiceUpTimestamp(existingLease.getServiceUpTimestamp());
            }
            // Store the lease in the local registry.
            gMap.put(registrant.getId(), lease);

            recentRegisteredQueue.add(new Pair<Long, String>(
                    System.currentTimeMillis(),
                    registrant.getAppName() + "(" + registrant.getId() + ")"));
            // Remember an overridden status supplied by the registrant, unless one is
            // already recorded for this instance id.
            if (!InstanceStatus.UNKNOWN.equals(registrant.getOverriddenStatus())) {
                logger.debug("Found overridden status {} for instance {}. Checking to see if needs to be add to the "
                                + "overrides", registrant.getOverriddenStatus(), registrant.getId());
                if (!overriddenInstanceStatusMap.containsKey(registrant.getId())) {
                    logger.info("Not found overridden id {} and hence adding it", registrant.getId());
                    overriddenInstanceStatusMap.put(registrant.getId(), registrant.getOverriddenStatus());
                }
            }
            InstanceStatus overriddenStatusFromMap = overriddenInstanceStatusMap.get(registrant.getId());
            if (overriddenStatusFromMap != null) {
                logger.info("Storing overridden status {} from map", overriddenStatusFromMap);
                registrant.setOverriddenStatus(overriddenStatusFromMap);
            }

            // Resolve the effective status and apply it to the registrant.
            InstanceStatus overriddenInstanceStatus = getOverriddenInstanceStatus(registrant, existingLease, isReplication);
            registrant.setStatusWithoutDirty(overriddenInstanceStatus);

            // Mark the lease as "service up" when the instance is UP.
            if (InstanceStatus.UP.equals(registrant.getStatus())) {
                lease.serviceUp();
            }
            registrant.setActionType(ActionType.ADDED);
            recentlyChangedQueue.add(new RecentlyChangedItem(lease));
            registrant.setLastUpdatedTimestamp();
            // Invalidate the cached responses for this application and its VIP addresses.
            invalidateCache(registrant.getAppName(), registrant.getVIPAddress(), registrant.getSecureVipAddress());
            logger.info("Registered instance {}/{} with status {} (replication={})",
                    registrant.getAppName(), registrant.getId(), registrant.getStatus(), isReplication);
        } finally {
            read.unlock();
        }
    }

二、心跳续约

   server端的心跳续约,是通过InstanceResource接收客户端的请求。

/**
     * InstanceResource endpoint that receives heartbeat (lease renewal) requests from
     * Eureka clients; the same endpoint also receives renewals replicated by other
     * Eureka server nodes.
     *
     * @param isReplication      replication header; the exact string {@code "true"}
     *                           marks a request replicated from a peer node
     * @param overriddenStatus   overridden status of the instance, may be null
     * @param status             status of the instance
     * @param lastDirtyTimestamp time the instance info was last modified on the
     *                           client side, as a decimal string; may be null
     * @return 404 when the renewal fails, otherwise either the result of the
     *         dirty-timestamp validation or a plain 200
     */
    public Response renewLease(
            @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication,
            @QueryParam("overriddenstatus") String overriddenStatus,
            @QueryParam("status") String status,
            @QueryParam("lastDirtyTimestamp") String lastDirtyTimestamp) {
        final boolean isFromReplicaNode = "true".equals(isReplication);

        // Renew the lease. On failure answer 404; the article notes that the client
        // re-registers after receiving a 404.
        if (!registry.renew(app.getName(), id, isFromReplicaNode)) {
            logger.warn("Not Found (Renew): {} - {}", app.getName(), id);
            return Response.status(Status.NOT_FOUND).build();
        }

        final Response response;
        if (lastDirtyTimestamp == null || !serverConfig.shouldSyncWhenTimestampDiffers()) {
            // Renewal succeeded and no timestamp check is required.
            response = Response.ok().build();
        } else {
            // Check the supplied lastDirtyTimestamp against the server-side value.
            response = this.validateDirtyTimestamp(Long.valueOf(lastDirtyTimestamp), isFromReplicaNode);
            if (response.getStatus() == Response.Status.NOT_FOUND.getStatusCode()) {
                final boolean hasRealOverride = (overriddenStatus != null)
                        && !(InstanceStatus.UNKNOWN.name().equals(overriddenStatus));
                if (hasRealOverride && isFromReplicaNode) {
                    // Store the overridden status carried by the replicated request.
                    registry.storeOverriddenStatusIfRequired(app.getAppName(), id, InstanceStatus.valueOf(overriddenStatus));
                }
            }
        }
        logger.debug("Found (Renew): {} - {}; reply status={}", app.getName(), id, response.getStatus());
        return response;
    }
    //InstanceRegistry
    /**
     * Publishes an {@code EurekaInstanceRenewedEvent} for the renewing instance and
     * then delegates the renewal to the superclass.
     *
     * <p>The event is published only when an application named {@code appName} is
     * found; its instance is {@code null} when no instance with {@code serverId}
     * exists in that application.
     *
     * @param appName       name of the application the instance belongs to
     * @param serverId      id of the instance being renewed
     * @param isReplication whether the request is a replication from a peer node
     * @return the result of the superclass renewal
     */
    public boolean renew(final String appName, final String serverId,
            boolean isReplication) {
        // The message is built by concatenation, so it must not contain an SLF4J-style
        // "{}" placeholder (it would be printed literally, e.g. "isReplication {}true").
        log("renew " + appName + " serverId " + serverId + ", isReplication "
                + isReplication);
        List<Application> applications = getSortedApplications();
        for (Application input : applications) {
            if (input.getName().equals(appName)) {
                InstanceInfo instance = null;
                for (InstanceInfo info : input.getInstances()) {
                    if (info.getId().equals(serverId)) {
                        instance = info;
                        break;
                    }
                }
                // Publish the heartbeat-renewal event.
                publishEvent(new EurekaInstanceRenewedEvent(this, appName, serverId,
                        instance, isReplication));
                break;
            }
        }
        // Delegate the actual renewal to the superclass.
        return super.renew(appName, serverId, isReplication);
    }
    //PeerAwareInstanceRegistryImpl
    /**
     * Renews the lease locally and, on success, replicates the heartbeat to peers.
     *
     * @param appName       name of the application the instance belongs to
     * @param id            id of the instance being renewed
     * @param isReplication whether the request is a replication from a peer node
     * @return {@code true} when the superclass renewal succeeded, {@code false} otherwise
     */
    public boolean renew(final String appName, final String id, final boolean isReplication) {
        // The superclass call does the actual renewal.
        if (!super.renew(appName, id, isReplication)) {
            return false;
        }
        // replicateToPeers receives isReplication so that it can tell a client
        // renewal apart from one that was itself replicated from a peer.
        replicateToPeers(Action.Heartbeat, appName, id, null, null, isReplication);
        return true;
    }

调用父类的续约方法

/**
     * Renews the lease of the given instance in the local registry.
     *
     * @param appName       name of the application the instance belongs to
     * @param id            id of the instance being renewed
     * @param isReplication whether the request is a replication from a peer node
     * @return {@code false} when no lease exists or when the resolved overridden
     *         status is UNKNOWN; {@code true} after the lease has been renewed
     */
public boolean renew(String appName, String id, boolean isReplication) {
        RENEW.increment(isReplication);

        // Look up the lease of this instance within its application's lease map.
        final Map<String, Lease<InstanceInfo>> leasesOfApp = registry.get(appName);
        final Lease<InstanceInfo> leaseToRenew = (leasesOfApp == null) ? null : leasesOfApp.get(id);
        if (leaseToRenew == null) {
            RENEW_NOT_FOUND.increment(isReplication);
            logger.warn("DS: Registry: lease doesn't exist, registering resource: {} - {}", appName, id);
            return false;
        }

        final InstanceInfo holder = leaseToRenew.getHolder();
        if (holder != null) {
            // Resolve the overridden status of the instance.
            final InstanceStatus resolvedStatus = this.getOverriddenInstanceStatus(
                    holder, leaseToRenew, isReplication);
            // An UNKNOWN result fails the renewal.
            if (resolvedStatus == InstanceStatus.UNKNOWN) {
                logger.info("Instance status UNKNOWN possibly due to deleted override for instance {}"
                        + "; re-register required", holder.getId());
                RENEW_NOT_FOUND.increment(isReplication);
                return false;
            }
            // When the stored status disagrees with the resolved one, the resolved one wins.
            if (!holder.getStatus().equals(resolvedStatus)) {
                logger.info(
                        "The instance status {} is different from overridden instance status {} for instance {}. "
                                + "Hence setting the status to overridden status", holder.getStatus().name(),
                                holder.getOverriddenStatus().name(),
                                holder.getId());
                holder.setStatusWithoutDirty(resolvedStatus);
            }
        }

        renewsLastMin.increment();
        // Renew the lease (updates its lastUpdateTimestamp).
        leaseToRenew.renew();
        return true;
    }
    /**
     * Renews the lease by setting {@code lastUpdateTimestamp} to the current time
     * plus {@code duration}.
     */
    public void renew() {
        final long now = System.currentTimeMillis();
        lastUpdateTimestamp = now + duration;
    }

lastUpdateTimestamp本来指的是最后更新时间,而这里直接加上了一个续约时间duration,这样的话lastUpdateTimestamp实际表示的就是过期时间,所以这地方是有点小bug的,源码在判断是否过期的isExpired方法的注释中也指出了这一点

 /**
     * Checks if the lease of a given {@link com.netflix.appinfo.InstanceInfo} has expired or not.
     *
     * Note that due to renew() doing the "wrong" thing and setting lastUpdateTimestamp to +duration more than
     * what it should be, the expiry will actually be 2 * duration. This is a minor bug and should only affect
     * instances that ungracefully shutdown. Due to possible wide ranging impact to existing usage, this will
     * not be fixed.
     *
     * @param additionalLeaseMs any additional lease time to add to the lease evaluation in ms.
     * @return {@code true} when an eviction timestamp has been set, or when the current time is past
     *         {@code lastUpdateTimestamp + duration + additionalLeaseMs}
     */
    public boolean isExpired(long additionalLeaseMs) {
        if (evictionTimestamp > 0) {
            return true;
        }
        return System.currentTimeMillis() > (lastUpdateTimestamp + duration + additionalLeaseMs);
    }

翻译过来:请注意,由于renew()做了“错误”的事情,将lastUpdateTimestamp加上了duration,过期时间实际上是加了2次的duration。这是一个小错误,只会影响不正常关闭的实例。由于可能对现有使用产生广泛影响,因此不会修复此问题。

三、服务下线

一种是优雅下线,是client端发送下线请求,由server端处理,剔除服务。

一种是客户端异常,网络异常等原因,非正常下线。

优雅下线,服务端InstanceResource接收由cancelLease方法处理。

    /**
     * Handles a lease cancellation (graceful shutdown) request for an instance.
     *
     * @param isReplication replication header; the exact string {@code "true"} marks
     *                      a cancellation replicated from a peer node
     * @return 200 when the lease was cancelled, 404 when the registry reports failure,
     *         or a server error when anything is thrown
     */
    public Response cancelLease(
            @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication) {
        try {
            // Cancel the lease in the registry.
            final boolean cancelled = registry.cancel(app.getName(), id,
                "true".equals(isReplication));
            if (!cancelled) {
                logger.info("Not Found (Cancel): {} - {}", app.getName(), id);
                return Response.status(Status.NOT_FOUND).build();
            }
            logger.debug("Found (Cancel): {} - {}", app.getName(), id);
            return Response.ok().build();
        } catch (Throwable e) {
            logger.error("Error (cancel): {} - {}", app.getName(), id, e);
            return Response.serverError().build();
        }
    }
    //InstanceRegistry
    /**
     * Runs {@code handleCancelation} (described by the article as publishing the
     * cancellation event) and then delegates the cancellation to the superclass.
     *
     * @param appName       name of the application the instance belongs to
     * @param serverId      id of the instance being cancelled
     * @param isReplication whether the request is a replication from a peer node
     * @return the result of the superclass cancellation
     */
    public boolean cancel(String appName, String serverId, boolean isReplication) {
        handleCancelation(appName, serverId, isReplication);
        final boolean cancelled = super.cancel(appName, serverId, isReplication);
        return cancelled;
    }
    //PeerAwareInstanceRegistryImpl
    /**
     * Cancels the lease locally and, on success, replicates the cancellation to peers.
     *
     * @param appName       name of the application the instance belongs to
     * @param id            id of the instance being cancelled
     * @param isReplication whether the request is a replication from a peer node
     * @return {@code true} when the superclass cancellation succeeded, {@code false} otherwise
     */
    public boolean cancel(final String appName, final String id,
                          final boolean isReplication) {
        // The superclass call does the actual cancellation.
        if (!super.cancel(appName, id, isReplication)) {
            return false;
        }
        // Synchronize the cancellation across the cluster.
        replicateToPeers(Action.Cancel, appName, id, null, null, isReplication);
        return true;
    }

下线

    protected boolean internalCancel(String appName, String id, boolean isReplication) {
        try {
            read.lock();
       //服务取消数增加 CANCEL.increment(isReplication);
// 根据实例集合名称取出实例信息集合 Map<String, Lease<InstanceInfo>> gMap = registry.get(appName); Lease<InstanceInfo> leaseToCancel = null; if (gMap != null) { // 删除租约信息,remove后返回id对应的具体实例信息 leaseToCancel = gMap.remove(id); } // 添加到最近取消租约队列 recentCanceledQueue.add(new Pair<Long, String>(System.currentTimeMillis(), appName + "(" + id + ")")); // 覆盖状态map中移除当前服务实例 InstanceStatus instanceStatus = overriddenInstanceStatusMap.remove(id); if (instanceStatus != null) { logger.debug("Removed instance id {} from the overridden map which has value {}", id, instanceStatus.name()); } if (leaseToCancel == null) { CANCEL_NOT_FOUND.increment(isReplication); logger.warn("DS: Registry: cancel failed because Lease is not registered for: {}/{}", appName, id); return false; } else { // 设置取消租约的时间戳 leaseToCancel.cancel();
          // 获取实例信息 InstanceInfo instanceInfo
= leaseToCancel.getHolder(); String vip = null; String svip = null; if (instanceInfo != null) { // 设置实例信息中的ActionType为Delete instanceInfo.setActionType(ActionType.DELETED); recentlyChangedQueue.add(new RecentlyChangedItem(leaseToCancel));
            //修改最后操作时间 instanceInfo.setLastUpdatedTimestamp(); vip
= instanceInfo.getVIPAddress(); svip = instanceInfo.getSecureVipAddress(); } // 使缓存无效,调用responseCache.invalidate让服务在缓存中失效 invalidateCache(appName, vip, svip); logger.info("Cancelled instance {}/{} (replication={})", appName, id, isReplication); } } finally { read.unlock(); } synchronized (lock) { if (this.expectedNumberOfClientsSendingRenews > 0) { this.expectedNumberOfClientsSendingRenews = this.expectedNumberOfClientsSendingRenews - 1; // updateRenewsPerMinThreshold(); } } return true; }

四、集群同步

  在客户端的注册、心跳、下线等都会触发集群同步操作,都会走到下面的这个方法中,通过action的值判断,到底是哪种同步操作

    /**
     * Replicates an instance action to every peer node except this node itself.
     *
     * <p>Requests that are themselves replications are only counted and never
     * forwarded, which prevents replication loops.
     *
     * @param action        the action to replicate
     * @param appName       name of the application the instance belongs to
     * @param id            id of the instance
     * @param info          instance info, may be null depending on the action
     * @param newStatus     new status, may be null depending on the action
     * @param isReplication {@code true} when the current request came from a peer node
     */
    private void replicateToPeers(Action action, String appName, String id,
                                  InstanceInfo info,
                                  InstanceStatus newStatus,
                                  boolean isReplication) {
        final Stopwatch tracer = action.getTimer().start();
        try {
            if (isReplication) {
                // Count the replicated request and stop here to avoid a replication loop.
                numberOfReplicationsLastMin.increment();
                return;
            }
            // Nothing to do when the peer collection is the shared empty list.
            if (peerEurekaNodes == Collections.EMPTY_LIST) {
                return;
            }

            for (final PeerEurekaNode node : peerEurekaNodes.getPeerEurekaNodes()) {
                // Replicate to every peer whose URL is not this node's own.
                if (!peerEurekaNodes.isThisMyUrl(node.getServiceUrl())) {
                    replicateInstanceActionsToPeers(action, appName, id, info, newStatus, node);
                }
            }
        } finally {
            tracer.stop();
        }
    }

集群同步

 /**
     * Sends a single instance action to one peer node.
     *
     * <p>Any failure is logged and swallowed, so one failure does not propagate
     * to the caller. The request version is set to V2 for the duration of the
     * call and removed afterwards.
     *
     * @param action    the action to replicate
     * @param appName   name of the application the instance belongs to
     * @param id        id of the instance
     * @param info      instance info, used for {@code Register}
     * @param newStatus new status, used for {@code StatusUpdate}
     * @param node      the peer node to send the action to
     */
    private void replicateInstanceActionsToPeers(Action action, String appName,
                                                 String id, InstanceInfo info, InstanceStatus newStatus,
                                                 PeerEurekaNode node) {
        try {
            CurrentRequestVersion.set(Version.V2);
            switch (action) {
                // Lease cancellation.
                case Cancel: {
                    node.cancel(appName, id);
                    break;
                }
                // Heartbeat renewal.
                case Heartbeat: {
                    final InstanceStatus overriddenStatus = overriddenInstanceStatusMap.get(id);
                    final InstanceInfo registryInfo = getInstanceByAppAndId(appName, id, false);
                    node.heartbeat(appName, id, registryInfo, overriddenStatus, false);
                    break;
                }
                // Registration.
                case Register: {
                    node.register(info);
                    break;
                }
                // Status update (UP/DOWN/STARTING/OUT_OF_SERVICE/UNKNOWN).
                case StatusUpdate: {
                    final InstanceInfo registryInfo = getInstanceByAppAndId(appName, id, false);
                    node.statusUpdate(appName, id, newStatus, registryInfo);
                    break;
                }
                // Removal of the status override.
                case DeleteStatusOverride: {
                    final InstanceInfo registryInfo = getInstanceByAppAndId(appName, id, false);
                    node.deleteStatusOverride(appName, id, registryInfo);
                    break;
                }
            }
        } catch (Throwable t) {
            logger.error("Cannot replicate information to {} for action {}", node.getServiceUrl(), action.name(), t);
        } finally {
            CurrentRequestVersion.remove();
        }
    }

状态枚举

starting:实例初始化状态,此状态主要给实例预留初始化时间

down:当健康检查失败时,实例的状态转变到down

up:正常服务状态

out_of_service:不参与接收服务,但是服务正常

unknown:未知状态

为什么一定需要一个覆盖状态呢?

假如只有一个状态字段,当客户端调用修改状态的接口,把实例状态由up修改为out_of_service(即status = out_of_service)时,会出现下面的问题:

1、client端调用updateStatus更新实例状态到out_of_service,即status = out_of_service,此时client的状态是up状态。

2、server端此实例的状态现在是out_of_service,并且标记responseCache无效。

3、客户端等待定时器定时更新实例的状态,但是由于时间间隔没有更新,所以client的状态还是up状态。

4、client端发送续租renew,由于此时客户端的状态是up状态,server端的状态是out_of_service,因此此前客户端调用的实例下线状态又被改回到原来的状态up。

引入覆盖状态后

1、客户端调用updateStatus方法时,同时更新server端实例的status和overriddenStatus状态。

2、客户端调用renew方法时,也要更新server端实例的status和overriddenstatus状态,但是有以下规则:

(1)如果客户端上传的实例状态是down或者starting,表明客户端是重启或者healthCheck失败。此时这个实例不能作为服务提供服务。因此即使客户端调用updateStatus把实例状态更新为up,也是没用的。此时客户端实例的准确状态就是down或者starting。

(2)如果客户端的实例是up或者out_of_service,此时是不可信的。有可能client端的实例状态已被改变,此时要使用overriddenstatus状态作为当前实例的状态,避免被覆盖。

(3)情况2中的overriddenstatus有可能不存在,缓存失效,此时要使用server端已经存在的实例的状态。

每种操作进行集群同步的时候,就是往其他集群节点发送一个一模一样的操作请求,但是isReplication参数是true,防止循环同步,我们就挑一个注册的操作看一下

    /**
     * Queues a registration of the given instance for replication to this peer node.
     *
     * <p>The request is not sent directly: it is wrapped in an
     * {@code InstanceReplicationTask} and handed to {@code batchingDispatcher},
     * together with an expiry time of "now" plus the instance's lease renewal interval.
     *
     * @param info the instance to register on the peer
     * @throws Exception declared by the signature; the code shown throws nothing explicitly
     */
    public void register(final InstanceInfo info) throws Exception {
        final long now = System.currentTimeMillis();
        final long expiresAtMs = now + getLeaseRenewalOf(info);
        // Submit the replication task to the dispatcher.
        batchingDispatcher.process(
                taskId("register", info),
                new InstanceReplicationTask(targetHost, Action.Register, info, null, true) {
                    public EurekaHttpResponse<Void> execute() {
                        // Send the registration request to the peer.
                        return replicationClient.register(info);
                    }
                },
                expiresAtMs
        );
    }

五、服务剔除

  Eureka Server的服务剔除是通过定时任务完成的,在EurekaBootStrap启动引导的initEurekaServerContext上下文初始化方法中,调用了这么一行代码registry.openForTraffic(applicationInfoManager, registryCount),调用的是InstanceRegistry的openForTraffic方法,最终调用了AbstractInstanceRegistry的postInit方法来初始化服务剔除的定时任务。

    /**
     * Starts the renew counter and (re)schedules the eviction task.
     *
     * <p>A previously scheduled eviction task, if any, is cancelled first. The new
     * task runs with both its initial delay and its period taken from
     * {@code serverConfig.getEvictionIntervalTimerInMs()}.
     */
    protected void postInit() {
        renewsLastMin.start();

        // Cancel the previous eviction task, if there is one.
        final TimerTask previousTask = evictionTaskRef.get();
        if (previousTask != null) {
            previousTask.cancel();
        }

        // Install a fresh eviction task and hand it to the timer.
        final EvictionTask freshTask = new EvictionTask();
        evictionTaskRef.set(freshTask);
        evictionTimer.schedule(freshTask,
                serverConfig.getEvictionIntervalTimerInMs(),
                serverConfig.getEvictionIntervalTimerInMs());
    }
    /**
     * Timer task that periodically evicts expired leases.
     */
    class EvictionTask extends TimerTask {

        // Not used by the code shown here; presumably read by getCompensationTimeMs() - TODO confirm.
        private final AtomicLong lastExecutionNanosRef = new AtomicLong(0L);

        /**
         * Computes the compensation time and runs one eviction pass. Anything thrown
         * is logged and swallowed so that the error does not escape the task.
         */
        @Override
        public void run() {
            try {
                // Compensation time for deviations in the task's execution time.
                final long compensationTimeMs = getCompensationTimeMs();
                logger.info("Running the evict task with compensationTime {}ms", compensationTimeMs);
                // Run the eviction.
                evict(compensationTimeMs);
            } catch (Throwable e) {
                logger.error("Could not run the evict task", e);
            }
        }
    }

执行剔除

public void evict(long additionalLeaseMs) {
        logger.debug("Running the evict task");
        
        //自我保护机制触发
        if (!isLeaseExpirationEnabled()) {
            logger.debug("DS: lease expiration is currently disabled.");
            return;
        }

        //首先收集所有过期的服务,以随机顺序将其剔除
        List<Lease<InstanceInfo>> expiredLeases = new ArrayList<>();
        //循环注册表中的所有的服务
        for (Entry<String, Map<String, Lease<InstanceInfo>>> groupEntry : registry.entrySet()) {
            Map<String, Lease<InstanceInfo>> leaseMap = groupEntry.getValue();
            if (leaseMap != null) {
                //获取到租约
                for (Entry<String, Lease<InstanceInfo>> leaseEntry : leaseMap.entrySet()) {
                    Lease<InstanceInfo> lease = leaseEntry.getValue();
                    //如果服务过期,就把服务添加到expiredLeases map中
                    if (lease.isExpired(additionalLeaseMs) && lease.getHolder() != null) {
                        expiredLeases.add(lease);
                    }
                }
            }
        }

        //为了补偿GC暂停或本地时间差异导致的剔除任务执行时间差异,使用当前注册表大小作为触发自我保护的机制
        //否则,可能清除完整的注册表。
        //注册表大小
        int registrySize = (int) getLocalRegistrySize();
        //注册表中服务的续约阈值 = 注册大小 * 0.85
        int registrySizeThreshold = (int) (registrySize * serverConfig.getRenewalPercentThreshold());
        //剔除极限 = 注册表大小 - 注册表续约阈值 
        int evictionLimit = registrySize - registrySizeThreshold;
        //过期的服务数 和 evictionLimit 取最小,如果大于 0,说明需要有服务要剔除
        int toEvict = Math.min(expiredLeases.size(), evictionLimit);
        if (toEvict > 0) {
            //剔除 toEvict 个
            logger.info("Evicting {} items (expired={}, evictionLimit={})", toEvict, expiredLeases.size(), evictionLimit);
            //取随机值
            Random random = new Random(System.currentTimeMillis());
            for (int i = 0; i < toEvict; i++) {
                //选择一个随机实例剔除,如果顺序剔除可能会剔除同一个服务的所有实例节点
                int next = i + random.nextInt(expiredLeases.size() - i);
                Collections.swap(expiredLeases, i, next);
                 //获取剔除服务的实例
                Lease<InstanceInfo> lease = expiredLeases.get(i);
                //应用名
                String appName = lease.getHolder().getAppName();
                //实例ID
                String id = lease.getHolder().getId();
                //expired Counter 过期计数增加
                EXPIRED.increment();
                logger.warn("DS: Registry: expired lease for {}/{}", appName, id);
          //核心方法,服务剔除 internalCancel(appName, id,
false); } } }

判断是否过期,这里的bug在之前也已经提到过了,不过影响不大

    /**
     * Tells whether this lease has expired.
     *
     * @param additionalLeaseMs extra lease time in milliseconds added to the evaluation
     * @return {@code true} when an eviction timestamp has been set, or when the current
     *         time is past {@code lastUpdateTimestamp + duration + additionalLeaseMs}
     */
    public boolean isExpired(long additionalLeaseMs) {
        if (evictionTimestamp > 0) {
            return true;
        }
        return System.currentTimeMillis() > (lastUpdateTimestamp + duration + additionalLeaseMs);
    }

核心剔除方法internalCancel走的和服务下线是同一个方法,这里就不再详述了。

六、自我保护机制

  默认情况下,当EurekaServer在一定时间内(默认90秒)没有接收到某个客户端实例的心跳,EurekaServer将会注销该实例。但是当网络分区故障发生时,客户端与EurekaServer之间无法正常通信,此时不应该注销客户端。Eureka通过“自我保护机制”来解决这个问题:当EurekaServer短时间内丢失过多客户端时,这个节点就会进入自我保护模式。在自我保护模式下,EurekaServer不会剔除任何客户端。当网络故障恢复后,该节点会自动退出自我保护模式

自我保护机制的实现是基于维护服务注册表的类AbstractInstanceRegistry中的2个变量来维护的。

// Minimum number of renews expected per minute. Recomputed by updateRenewsPerMinThreshold()
// and compared with last minute's renew count in isLeaseExpirationEnabled().
protected volatile int numberOfRenewsPerMinThreshold;
// Number of clients expected to send renews (a client count, not a renew count).
// Incremented on new registrations, decremented on cancellations, and used as the
// base of the threshold formula in updateRenewsPerMinThreshold().
protected volatile int expectedNumberOfClientsSendingRenews;

在eureka启动时,集群同步之后执行了一个方法registry.openForTraffic(applicationInfoManager, registryCount),跟服务剔除进入的是同一个地方。

在方法内会调用updateRenewsPerMinThreshold()方法修改值,

    /**
     * Recomputes {@code numberOfRenewsPerMinThreshold}.
     *
     * <p>threshold = expected clients * (60 / expected client renewal interval in
     * seconds) * renewal percent threshold, truncated to an int.
     */
    protected void updateRenewsPerMinThreshold() {
        final int expectedClients = this.expectedNumberOfClientsSendingRenews;
        // Renews a single client is expected to send per minute.
        final double renewsPerClientPerMin = 60.0 / serverConfig.getExpectedClientRenewalIntervalSeconds();
        final double expectedRenewsPerMin = expectedClients * renewsPerClientPerMin;
        this.numberOfRenewsPerMinThreshold =
                (int) (expectedRenewsPerMin * serverConfig.getRenewalPercentThreshold());
    }

其次在服务剔除,下线,注册以及初始化的时候(开启了一个定时线程)去修改这个值。然后在服务剔除的时候会判断是否开启了自我保护机制,如果开启了自我保护机制,就不会剔除服务。

···       
 if (!isLeaseExpirationEnabled()) {
      logger.debug("DS: lease expiration is currently disabled.");
      return;
}
···
    /**
     * Tells whether expired leases may currently be evicted.
     *
     * @return {@code true} when self-preservation mode is turned off; otherwise
     *         {@code true} only when the threshold is positive and last minute's
     *         renew count exceeds it
     */
    public boolean isLeaseExpirationEnabled() {
        if (isSelfPreservationModeEnabled()) {
            // Self-preservation is configured: eviction is allowed only while last
            // minute's renew count stays above the expected minimum. Returning false
            // here is what makes evict() skip its pass.
            if (numberOfRenewsPerMinThreshold <= 0) {
                return false;
            }
            return getNumOfRenewsInLastMin() > numberOfRenewsPerMinThreshold;
        }
        // Self-preservation is turned off: expiration is always enabled.
        return true;
    }

七、服务发现

全量拉取  

  服务全量发现是通过调用ApplicationsResource中的getContainers方法来实现的

/**
 * Serves the full list of registered applications.
 *
 * @param version        API version taken from the request path
 * @param acceptHeader   Accept header; JSON is returned only when it contains HEADER_JSON_VALUE,
 *                       otherwise XML
 * @param acceptEncoding Accept-Encoding header; a gzip payload is returned when it contains
 *                       HEADER_GZIP_VALUE
 * @param eurekaAccept   value of the Eureka accept header, forwarded into the cache key
 * @param uriInfo        request URI information (not used by this method)
 * @param regionsStr     optional comma-separated list of remote regions; null or empty means
 *                       no remote region is requested
 * @return 403 FORBIDDEN when the registry does not allow access, otherwise 200 with the payload
 *         obtained from the response cache
 */
public Response getContainers(@PathParam("version") String version,
                                  @HeaderParam(HEADER_ACCEPT) String acceptHeader,
                                  @HeaderParam(HEADER_ACCEPT_ENCODING) String acceptEncoding,
                                  @HeaderParam(EurekaAccept.HTTP_X_EUREKA_ACCEPT) String eurekaAccept,
                                  @Context UriInfo uriInfo,
                                  @Nullable @QueryParam("regions") String regionsStr) {

        boolean isRemoteRegionRequested = null != regionsStr && !regionsStr.isEmpty();
        String[] regions = null;
        if (!isRemoteRegionRequested) {
            EurekaMonitors.GET_ALL.increment();
        } else {
            regions = regionsStr.toLowerCase().split(",");
            Arrays.sort(regions); // So we don't have different caches for same regions queried in different order.
            EurekaMonitors.GET_ALL_WITH_REMOTE_REGIONS.increment();
        }

        // Reject the request when the registry does not currently allow access.
        if (!registry.shouldAllowAccess(isRemoteRegionRequested)) {
            return Response.status(Status.FORBIDDEN).build();
        }
        CurrentRequestVersion.set(Version.toEnum(version));
        try {
            KeyType keyType = Key.KeyType.JSON;
            String returnMediaType = MediaType.APPLICATION_JSON;
            if (acceptHeader == null || !acceptHeader.contains(HEADER_JSON_VALUE)) {
                keyType = Key.KeyType.XML;
                returnMediaType = MediaType.APPLICATION_XML;
            }

            Key cacheKey = new Key(Key.EntityType.Application,
                    ResponseCacheImpl.ALL_APPS,
                    keyType, CurrentRequestVersion.get(), EurekaAccept.fromString(eurekaAccept), regions
            );

            if (acceptEncoding != null && acceptEncoding.contains(HEADER_GZIP_VALUE)) {
                return Response.ok(responseCache.getGZIP(cacheKey))
                        .header(HEADER_CONTENT_ENCODING, HEADER_GZIP_VALUE)
                        .header(HEADER_CONTENT_TYPE, returnMediaType)
                        .build();
            }
            // Fetch the application list payload from the response cache.
            return Response.ok(responseCache.get(cacheKey))
                    .build();
        } finally {
            // Always clear the version set above, even when building the key or response throws,
            // so it cannot linger for later work on the same thread.
            CurrentRequestVersion.remove();
        }
    }
    //ResponseCacheImpl
    /**
     * Returns the cached payload for the given key.
     *
     * @param key              cache key identifying the requested payload
     * @param useReadOnlyCache forwarded to {@code getValue}
     * @return the payload, or {@code null} when no value is available or it equals EMPTY_PAYLOAD
     */
    String get(final Key key, boolean useReadOnlyCache) {
        // The actual lookup lives in getValue.
        final Value cached = getValue(key, useReadOnlyCache);
        if (cached != null && !cached.getPayload().equals(EMPTY_PAYLOAD)) {
            return cached.getPayload();
        }
        return null;
    }
    /**
     * Looks the key up in the read-only cache (when requested) or the read-write cache.
     *
     * @param key              cache key to resolve
     * @param useReadOnlyCache whether the read-only cache should be consulted first
     * @return the resolved value, or {@code null} if the lookup failed; failures are logged
     *         rather than propagated
     */
    Value getValue(final Key key, boolean useReadOnlyCache) {
        Value result = null;
        try {
            if (!useReadOnlyCache) {
                // Read-only cache not in use: go straight to the read-write cache.
                result = readWriteCacheMap.get(key);
            } else {
                final Value readOnlyHit = readOnlyCacheMap.get(key);
                if (readOnlyHit == null) {
                    // Miss in the read-only cache: load from the read-write cache and
                    // remember the value in the read-only cache.
                    result = readWriteCacheMap.get(key);
                    readOnlyCacheMap.put(key, result);
                } else {
                    result = readOnlyHit;
                }
            }
        } catch (Throwable t) {
            logger.error("Cannot get value for key : {}", key, t);
        }
        return result;
    }

eureka存在两个缓存,一个只读缓存(ConcurrentMap<Key, Value>),一个读写缓存(LoadingCache<Key, Value>,guava自己封装的一种缓存结构)。

/**
 * Builds the response cache: stores the collaborators, creates the read-write cache and,
 * when the read-only cache is enabled, schedules its periodic update task.
 *
 * @param serverConfig server configuration supplying cache capacity, expiration and update interval
 * @param serverCodecs codecs stored for later use
 * @param registry     instance registry the payloads are generated from
 */
ResponseCacheImpl(EurekaServerConfig serverConfig, ServerCodecs serverCodecs, AbstractInstanceRegistry registry) {
        this.serverConfig = serverConfig;
        this.serverCodecs = serverCodecs;
        this.shouldUseReadOnlyResponseCache = serverConfig.shouldUseReadOnlyResponseCache();
        this.registry = registry;

        long responseCacheUpdateIntervalMs = serverConfig.getResponseCacheUpdateIntervalMs();
        // Initialise the read-write cache.
        this.readWriteCacheMap =
                CacheBuilder.newBuilder()
                        // Initial capacity of the cache (the original note gives a default of 1000 — confirm in EurekaServerConfig).
                        .initialCapacity(serverConfig.getInitialCapacityOfResponseCache())
                        // Entries expire this many seconds after being written (the original note gives 180s — confirm).
                        .expireAfterWrite(serverConfig.getResponseCacheAutoExpirationInSeconds(), TimeUnit.SECONDS)
                        // Listener invoked when an entry is removed from the cache.
                        .removalListener(new RemovalListener<Key, Value>() {
                            @Override
                            public void onRemoval(RemovalNotification<Key, Value> notification) {
                                Key removedKey = notification.getKey();
                                // Region-specific keys are tracked in regionSpecificKeys; drop the mapping with the entry.
                                if (removedKey.hasRegions()) {
                                    Key cloneWithNoRegions = removedKey.cloneWithoutRegions();
                                    regionSpecificKeys.remove(cloneWithNoRegions, removedKey);
                                }
                            }
                        })
                        // CacheLoader supplying values for the cache: load() builds the payload via generatePayload.
                        .build(new CacheLoader<Key, Value>() {
                            @Override
                            public Value load(Key key) throws Exception {
                                // Record region-specific keys under their region-less clone.
                                if (key.hasRegions()) {
                                    Key cloneWithNoRegions = key.cloneWithoutRegions();
                                    regionSpecificKeys.put(cloneWithNoRegions, key);
                                }
                                Value value = generatePayload(key);
                                return value;
                            }
                        });
        // When the read-only cache is enabled, schedule the task from getCacheUpdateTask(); per the
        // original note it syncs the read-only cache from the read-write cache (task body not shown here).
        // The first run is aligned to the next multiple of the update interval.
        if (shouldUseReadOnlyResponseCache) {
            timer.schedule(getCacheUpdateTask(),
                    new Date(((System.currentTimeMillis() / responseCacheUpdateIntervalMs) * responseCacheUpdateIntervalMs)
                            + responseCacheUpdateIntervalMs),
                    responseCacheUpdateIntervalMs);
        }

        try {
            Monitors.registerObject(this);
        } catch (Throwable e) {
            // Monitoring registration is best-effort: log and continue.
            logger.warn("Cannot register the JMX monitor for the InstanceRegistry", e);
        }
    }

只读缓存比较简单,就是一个map数据结构。下面我们看看读写缓存是如何通过generatePayload方法加载数据的

/**
 * Builds the cache value for a key by dispatching on its entity type and name.
 *
 * @param key cache key describing what is requested (entity type, name, optional regions)
 * @return a new Value wrapping the generated payload; an empty payload for an unknown entity type
 */
private Value generatePayload(Key key) {
        Stopwatch tracer = null;
        try {
            String payload;
            switch (key.getEntityType()) {
                case Application:
                    boolean isRemoteRegionRequested = key.hasRegions();
                    // Full fetch of all applications.
                    if (ALL_APPS.equals(key.getName())) {
                        if (isRemoteRegionRequested) {
                            tracer = serializeAllAppsWithRemoteRegionTimer.start();
                            payload = getPayLoad(key, registry.getApplicationsFromMultipleRegions(key.getRegions()));
                        } else {
                            tracer = serializeAllAppsTimer.start();
                            // registry.getApplications() returns the applications; getPayLoad turns them
                            // into the payload string (presumably encoded per the key — confirm in getPayLoad).
                            payload = getPayLoad(key, registry.getApplications());
                        }
                    // Delta (incremental) fetch.
                    } else if (ALL_APPS_DELTA.equals(key.getName())) {
                        if (isRemoteRegionRequested) {
                            tracer = serializeDeltaAppsWithRemoteRegionTimer.start();
                            // Bump the delta version counters before generating the payload.
                            versionDeltaWithRegions.incrementAndGet();
                            versionDeltaWithRegionsLegacy.incrementAndGet();
                            payload = getPayLoad(key,
                                    registry.getApplicationDeltasFromMultipleRegions(key.getRegions()));
                        } else {
                            tracer = serializeDeltaAppsTimer.start();
                            versionDelta.incrementAndGet();
                            versionDeltaLegacy.incrementAndGet();
                            payload = getPayLoad(key, registry.getApplicationDeltas());
                        }
                    } else {
                        // Any other name is treated as a single application name.
                        tracer = serializeOneApptimer.start();
                        payload = getPayLoad(key, registry.getApplication(key.getName()));
                    }
                    break;
                case VIP:
                case SVIP:
                    tracer = serializeViptimer.start();
                    payload = getPayLoad(key, getApplicationsForVip(key, registry));
                    break;
                default:
                    logger.error("Unidentified entity type: {} found in the cache key.", key.getEntityType());
                    payload = "";
                    break;
            }
            return new Value(payload);
        } finally {
            // Stop whichever timer was started, if any.
            if (tracer != null) {
                tracer.stop();
            }
        }
    }

从读写缓存加载数据时主要有两个分支:全量获取调用registry.getApplicationsFromMultipleRegions(key.getRegions()),逻辑比较简单,就是从我们的注册表ConcurrentHashMap<String, Map<String, Lease<InstanceInfo>>> registry 中获取全部已注册的服务实例信息;增量获取则调用registry.getApplicationDeltasFromMultipleRegions(key.getRegions())方法,我们具体来分析一下这个方法。

/**
 * Builds the delta (recently changed instances) for the local registry and, optionally,
 * the given remote regions.
 *
 * @param remoteRegions remote regions to include; {@code null} means all known remote regions,
 *                      an empty array means none
 * @return the delta, versioned from the response cache and carrying the reconcile hash code
 *         of the full registry
 */
public Applications getApplicationDeltasFromMultipleRegions(String[] remoteRegions) {
        if (null == remoteRegions) {
            remoteRegions = allKnownRemoteRegions; // null means all remote regions.
        }

        boolean includeRemoteRegion = remoteRegions.length != 0;

        if (includeRemoteRegion) {
            GET_ALL_WITH_REMOTE_REGIONS_CACHE_MISS_DELTA.increment();
        } else {
            GET_ALL_CACHE_MISS_DELTA.increment();
        }
        // The delta being assembled.
        Applications apps = new Applications();
        // Stamp the delta with the current delta version.
        apps.setVersion(responseCache.getVersionDeltaWithRegions().get());
        Map<String, Application> applicationInstancesMap = new HashMap<String, Application>();
        // Acquire the lock BEFORE entering try: if acquisition itself failed, the finally block
        // must not call unlock() on a lock this thread does not hold.
        write.lock();
        try {
            // Queue of recently changed instances (per the original note: newly registered,
            // recently modified, recently cancelled).
            // NOTE(review): the original note also lists heartbeat renewals as a source of
            // entries — verify against the code that enqueues into recentlyChangedQueue.
            Iterator<RecentlyChangedItem> iter = this.recentlyChangedQueue.iterator();
            logger.debug("The number of elements in the delta queue is :{}", this.recentlyChangedQueue.size());
            // Walk the delta queue and group the instances by application into apps.
            while (iter.hasNext()) {
                Lease<InstanceInfo> lease = iter.next().getLeaseInfo();
                InstanceInfo instanceInfo = lease.getHolder();
                logger.debug("The instance id {} is found with status {} and actiontype {}",
                        instanceInfo.getId(), instanceInfo.getStatus().name(), instanceInfo.getActionType().name());
                Application app = applicationInstancesMap.get(instanceInfo.getAppName());
                if (app == null) {
                    app = new Application(instanceInfo.getAppName());
                    applicationInstancesMap.put(instanceInfo.getAppName(), app);
                    apps.addApplication(app);
                }
                // Add a copy of the decorated instance.
                app.addInstance(new InstanceInfo(decorateInstanceInfo(lease)));
            }

            if (includeRemoteRegion) {
                // Merge in the deltas of every requested remote region that has a registry.
                for (String remoteRegion : remoteRegions) {
                    RemoteRegionRegistry remoteRegistry = regionNameVSRemoteRegistry.get(remoteRegion);
                    if (null != remoteRegistry) {
                        Applications remoteAppsDelta = remoteRegistry.getApplicationDeltas();
                        if (null != remoteAppsDelta) {
                            for (Application application : remoteAppsDelta.getRegisteredApplications()) {
                                if (shouldFetchFromRemoteRegistry(application.getName(), remoteRegion)) {
                                    Application appInstanceTillNow =
                                            apps.getRegisteredApplications(application.getName());
                                    if (appInstanceTillNow == null) {
                                        appInstanceTillNow = new Application(application.getName());
                                        apps.addApplication(appInstanceTillNow);
                                    }
                                    for (InstanceInfo instanceInfo : application.getInstances()) {
                                        appInstanceTillNow.addInstance(new InstanceInfo(instanceInfo));
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // Full registry for the same regions.
            Applications allApps = getApplicationsFromMultipleRegions(remoteRegions);
            // Copy the reconcile hash code of the full registry onto the delta. Per the original
            // note, the client compares it with the hash of its local registry after applying the
            // delta and fetches again on a mismatch (client side not shown here).
            apps.setAppsHashCode(allApps.getReconcileHashCode());
            return apps;
        } finally {
            write.unlock();
        }
    }

服务端在启动的时候还会初始化一个定时任务,定时清除recentlyChangedQueue队列中的过期数据

    /**
     * Creates the task that prunes {@code recentlyChangedQueue}.
     *
     * @return a TimerTask that removes entries from the head of the queue while their last update
     *         time is older than the configured retention window, stopping at the first entry
     *         that is still within it
     */
    private TimerTask getDeltaRetentionTask() {
        return new TimerTask() {
            @Override
            public void run() {
                for (Iterator<RecentlyChangedItem> cursor = recentlyChangedQueue.iterator(); cursor.hasNext(); ) {
                    final long updatedAt = cursor.next().getLastUpdateTime();
                    // Entries updated before this instant have outlived the retention window.
                    final long oldestAllowed =
                            System.currentTimeMillis() - serverConfig.getRetentionTimeInMSInDeltaQueue();
                    if (updatedAt >= oldestAllowed) {
                        // First entry still inside the window: stop scanning.
                        break;
                    }
                    cursor.remove();
                }
            }

        };
    }

增量拉取

  服务增量发现是通过调用ApplicationsResource中的getContainerDifferential方法来实现的

 /**
  * Serves the delta (recently changed instances) of the registry.
  *
  * @param version        API version taken from the request path
  * @param acceptHeader   Accept header; JSON is returned only when it contains HEADER_JSON_VALUE,
  *                       otherwise XML
  * @param acceptEncoding Accept-Encoding header; a gzip payload is returned when it contains
  *                       HEADER_GZIP_VALUE
  * @param eurekaAccept   value of the Eureka accept header, forwarded into the cache key
  * @param uriInfo        request URI information (not used by this method)
  * @param regionsStr     optional comma-separated list of remote regions; null or empty means
  *                       no remote region is requested
  * @return 403 FORBIDDEN when delta is disabled or the registry does not allow access,
  *         otherwise 200 with the delta payload obtained from the response cache
  */
 public Response getContainerDifferential(
            @PathParam("version") String version,
            @HeaderParam(HEADER_ACCEPT) String acceptHeader,
            @HeaderParam(HEADER_ACCEPT_ENCODING) String acceptEncoding,
            @HeaderParam(EurekaAccept.HTTP_X_EUREKA_ACCEPT) String eurekaAccept,
            @Context UriInfo uriInfo, @Nullable @QueryParam("regions") String regionsStr) {

        boolean isRemoteRegionRequested = null != regionsStr && !regionsStr.isEmpty();

        // If the delta flag is disabled in discovery or the registry does not allow access,
        // answer 403 FORBIDDEN (no redirect is issued here).
        if ((serverConfig.shouldDisableDelta()) || (!registry.shouldAllowAccess(isRemoteRegionRequested))) {
            return Response.status(Status.FORBIDDEN).build();
        }

        String[] regions = null;
        if (!isRemoteRegionRequested) {
            EurekaMonitors.GET_ALL_DELTA.increment();
        } else {
            regions = regionsStr.toLowerCase().split(",");
            Arrays.sort(regions); // So we don't have different caches for same regions queried in different order.
            EurekaMonitors.GET_ALL_DELTA_WITH_REMOTE_REGIONS.increment();
        }

        CurrentRequestVersion.set(Version.toEnum(version));
        try {
            KeyType keyType = Key.KeyType.JSON;
            String returnMediaType = MediaType.APPLICATION_JSON;
            if (acceptHeader == null || !acceptHeader.contains(HEADER_JSON_VALUE)) {
                keyType = Key.KeyType.XML;
                returnMediaType = MediaType.APPLICATION_XML;
            }

            Key cacheKey = new Key(Key.EntityType.Application,
                    ResponseCacheImpl.ALL_APPS_DELTA,
                    keyType, CurrentRequestVersion.get(), EurekaAccept.fromString(eurekaAccept), regions
            );

            if (acceptEncoding != null && acceptEncoding.contains(HEADER_GZIP_VALUE)) {
                return Response.ok(responseCache.getGZIP(cacheKey))
                        .header(HEADER_CONTENT_ENCODING, HEADER_GZIP_VALUE)
                        .header(HEADER_CONTENT_TYPE, returnMediaType)
                        .build();
            }
            // Same cache lookup as the full fetch; only the key name (ALL_APPS_DELTA) differs.
            return Response.ok(responseCache.get(cacheKey)).build();
        } finally {
            // Always clear the version set above, even when building the key or response throws,
            // so it cannot linger for later work on the same thread.
            CurrentRequestVersion.remove();
        }
    }
}

直接通过全量的逻辑走到generatePayload方法,里面有一个增量拉取的逻辑,上面已经分析过了。

posted @ 2022-01-03 12:16  上官兰夏  阅读(582)  评论(0编辑  收藏  举报