Eureka服务端注册、心跳、下线源码分析
一、服务注册
Eureka的服务注册是通过HTTP请求进行的,使用的是Jersey框架,也是一种MVC架构。服务注册的控制层是ApplicationResource,访问的接口是addInstance。
@POST @Consumes({"application/json", "application/xml"}) public Response addInstance(InstanceInfo info, @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication) { logger.debug("Registering instance {} (replication={})", info.getId(), isReplication); // 入参校验 if (isBlank(info.getId())) { return Response.status(400).entity("Missing instanceId").build(); } else if (isBlank(info.getHostName())) { return Response.status(400).entity("Missing hostname").build(); } else if (isBlank(info.getIPAddr())) { return Response.status(400).entity("Missing ip address").build(); } else if (isBlank(info.getAppName())) { return Response.status(400).entity("Missing appName").build(); } else if (!appName.equals(info.getAppName())) { return Response.status(400).entity("Mismatched appName, expecting " + appName + " but was " + info.getAppName()).build(); } else if (info.getDataCenterInfo() == null) { return Response.status(400).entity("Missing dataCenterInfo").build(); } else if (info.getDataCenterInfo().getName() == null) { return Response.status(400).entity("Missing dataCenterInfo Name").build(); } // 获取注册中心 DataCenterInfo dataCenterInfo = info.getDataCenterInfo(); if (dataCenterInfo instanceof UniqueIdentifier) { String dataCenterInfoId = ((UniqueIdentifier) dataCenterInfo).getId(); if (isBlank(dataCenterInfoId)) { boolean experimental = "true".equalsIgnoreCase(serverConfig.getExperimental("registration.validation.dataCenterInfoId")); if (experimental) { String entity = "DataCenterInfo of type " + dataCenterInfo.getClass() + " must contain a valid id"; return Response.status(400).entity(entity).build(); } else if (dataCenterInfo instanceof AmazonInfo) { AmazonInfo amazonInfo = (AmazonInfo) dataCenterInfo; String effectiveId = amazonInfo.get(AmazonInfo.MetaDataKey.instanceId); if (effectiveId == null) { amazonInfo.getMetadata().put(AmazonInfo.MetaDataKey.instanceId.getName(), info.getId()); } } else { logger.warn("Registering DataCenterInfo of type {} without an appropriate 
id", dataCenterInfo.getClass()); } } } //服务注册 registry.register(info, "true".equals(isReplication)); //响应成功 return Response.status(204).build(); }
调用InstanceRegistry的注册方法
public void register(final InstanceInfo info, final boolean isReplication) { //获取心跳续约时间 发布EurekaInstanceRegisteredEvent注册事件 handleRegistration(info, resolveInstanceLeaseDuration(info), isReplication); super.register(info, isReplication); }
发布注册事件
private void handleRegistration(InstanceInfo info, int leaseDuration, boolean isReplication) { log("register " + info.getAppName() + ", vip " + info.getVIPAddress() + ", leaseDuration " + leaseDuration + ", isReplication " + isReplication); //发布注册事件 publishEvent(new EurekaInstanceRegisteredEvent(this, info, leaseDuration, isReplication)); }
注册实例
//PeerAwareInstanceRegistryImpl public void register(final InstanceInfo info, final boolean isReplication) { int leaseDuration = Lease.DEFAULT_DURATION_IN_SECS; if (info.getLeaseInfo() != null && info.getLeaseInfo().getDurationInSecs() > 0) { leaseDuration = info.getLeaseInfo().getDurationInSecs(); } super.register(info, leaseDuration, isReplication); //集群注册同步 replicateToPeers(Action.Register, info.getAppName(), info.getId(), info, null, isReplication); } //AbstractInstanceRegistry public void register(InstanceInfo registrant, int leaseDuration, boolean isReplication) { try { read.lock(); //获取集群,第一次为空 Map<String, Lease<InstanceInfo>> gMap = registry.get(registrant.getAppName()); REGISTER.increment(isReplication); if (gMap == null) { //创建一个新的,并存放 final ConcurrentHashMap<String, Lease<InstanceInfo>> gNewMap = new ConcurrentHashMap<String, Lease<InstanceInfo>>(); gMap = registry.putIfAbsent(registrant.getAppName(), gNewMap); if (gMap == null) { gMap = gNewMap; } } //通过实例id获取实例 Lease<InstanceInfo> existingLease = gMap.get(registrant.getId()); //注册存在冲突,正常情况下,不会进入 if (existingLease != null && (existingLease.getHolder() != null)) { Long existingLastDirtyTimestamp = existingLease.getHolder().getLastDirtyTimestamp(); Long registrationLastDirtyTimestamp = registrant.getLastDirtyTimestamp(); logger.debug("Existing lease found (existing={}, provided={}", existingLastDirtyTimestamp, registrationLastDirtyTimestamp); // 进行时间戳的比较,保留最后一次注册信息 if (existingLastDirtyTimestamp > registrationLastDirtyTimestamp) { logger.warn("There is an existing lease and the existing lease's dirty timestamp {} is greater" + " than the one that is being registered {}", existingLastDirtyTimestamp, registrationLastDirtyTimestamp); logger.warn("Using the existing instanceInfo instead of the new instanceInfo as the registrant"); registrant = existingLease.getHolder(); } } else { // 只存在一个注册 synchronized (lock) { if (this.expectedNumberOfClientsSendingRenews > 0) { // 注册数量+1 
this.expectedNumberOfClientsSendingRenews = this.expectedNumberOfClientsSendingRenews + 1; updateRenewsPerMinThreshold(); } } logger.debug("No previous lease information found; it is new registration"); } //创建注册器对象 包括心跳续约时间 注册时间戳 Lease<InstanceInfo> lease = new Lease<InstanceInfo>(registrant, leaseDuration); if (existingLease != null) { lease.setServiceUpTimestamp(existingLease.getServiceUpTimestamp()); } //保存实例信息到本地注册表 gMap.put(registrant.getId(), lease); recentRegisteredQueue.add(new Pair<Long, String>( System.currentTimeMillis(), registrant.getAppName() + "(" + registrant.getId() + ")")); // if (!InstanceStatus.UNKNOWN.equals(registrant.getOverriddenStatus())) { logger.debug("Found overridden status {} for instance {}. Checking to see if needs to be add to the " + "overrides", registrant.getOverriddenStatus(), registrant.getId()); if (!overriddenInstanceStatusMap.containsKey(registrant.getId())) { logger.info("Not found overridden id {} and hence adding it", registrant.getId()); overriddenInstanceStatusMap.put(registrant.getId(), registrant.getOverriddenStatus()); } } InstanceStatus overriddenStatusFromMap = overriddenInstanceStatusMap.get(registrant.getId()); if (overriddenStatusFromMap != null) { logger.info("Storing overridden status {} from map", overriddenStatusFromMap); registrant.setOverriddenStatus(overriddenStatusFromMap); } // 获取实例状态 InstanceStatus overriddenInstanceStatus = getOverriddenInstanceStatus(registrant, existingLease, isReplication); registrant.setStatusWithoutDirty(overriddenInstanceStatus); // 设置实例上线时间戳 if (InstanceStatus.UP.equals(registrant.getStatus())) { lease.serviceUp(); } registrant.setActionType(ActionType.ADDED); recentlyChangedQueue.add(new RecentlyChangedItem(lease)); registrant.setLastUpdatedTimestamp(); //失效缓存 invalidateCache(registrant.getAppName(), registrant.getVIPAddress(), registrant.getSecureVipAddress()); logger.info("Registered instance {}/{} with status {} (replication={})", registrant.getAppName(), 
registrant.getId(), registrant.getStatus(), isReplication); } finally { read.unlock(); } }
二、心跳续约
server端的心跳续约,是通过InstanceResource接收客户端的请求。
// InstanceResource接收EurekaClient端发送的心跳续约请求 // 也有可能是接收其他EurekaServer端同步数据的请求 public Response renewLease( //判断是心跳续约还是同步集群数据 @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication, // 实例的覆盖状态 @QueryParam("overriddenstatus") String overriddenStatus, // 实例状态 @QueryParam("status") String status, // 实例信息在EurekClient端上次被修改的时间 @QueryParam("lastDirtyTimestamp") String lastDirtyTimestamp) { boolean isFromReplicaNode = "true".equals(isReplication); //续约 boolean isSuccess = registry.renew(app.getName(), id, isFromReplicaNode); // 续约失败,返回404,EurekaClient端收到404后会发起注册请求 if (!isSuccess) { logger.warn("Not Found (Renew): {} - {}", app.getName(), id); return Response.status(Status.NOT_FOUND).build(); } Response response; if (lastDirtyTimestamp != null && serverConfig.shouldSyncWhenTimestampDiffers()) { // 验证传入的lastDirtyTimestamp和EurekaServer端保存的lastDirtyTimestamp是否相同 response = this.validateDirtyTimestamp(Long.valueOf(lastDirtyTimestamp), isFromReplicaNode); if (response.getStatus() == Response.Status.NOT_FOUND.getStatusCode() && (overriddenStatus != null) && !(InstanceStatus.UNKNOWN.name().equals(overriddenStatus)) && isFromReplicaNode) { //状态覆盖 registry.storeOverriddenStatusIfRequired(app.getAppName(), id, InstanceStatus.valueOf(overriddenStatus)); } } else { //续约成功返回 response = Response.ok().build(); } logger.debug("Found (Renew): {} - {}; reply status={}", app.getName(), id, response.getStatus()); return response; }
//InstanceRegistry public boolean renew(final String appName, final String serverId, boolean isReplication) { log("renew " + appName + " serverId " + serverId + ", isReplication {}" + isReplication); List<Application> applications = getSortedApplications(); for (Application input : applications) { if (input.getName().equals(appName)) { InstanceInfo instance = null; for (InstanceInfo info : input.getInstances()) { if (info.getId().equals(serverId)) { instance = info; break; } } //发布心跳续约事件 publishEvent(new EurekaInstanceRenewedEvent(this, appName, serverId, instance, isReplication)); break; } } //调用父类的续约方法 return super.renew(appName, serverId, isReplication); } //PeerAwareInstanceRegistryImpl public boolean renew(final String appName, final String id, final boolean isReplication) { //父类AbstractInstanceRegistry if (super.renew(appName, id, isReplication)) { // 如果是续约请求则向其他EurekaServer节点同步续约信息 // 如果是同步信息请求则直接返回 replicateToPeers(Action.Heartbeat, appName, id, null, null, isReplication); return true; } return false; }
调用父类的续约方法
public boolean renew(String appName, String id, boolean isReplication) { RENEW.increment(isReplication); //注册表信息 Map<String, Lease<InstanceInfo>> gMap = registry.get(appName); Lease<InstanceInfo> leaseToRenew = null; if (gMap != null) { //根据实例id取出实例信息 leaseToRenew = gMap.get(id); } if (leaseToRenew == null) { RENEW_NOT_FOUND.increment(isReplication); logger.warn("DS: Registry: lease doesn't exist, registering resource: {} - {}", appName, id); return false; } else { InstanceInfo instanceInfo = leaseToRenew.getHolder(); if (instanceInfo != null) { // 获得实例的覆盖状态 InstanceStatus overriddenInstanceStatus = this.getOverriddenInstanceStatus( instanceInfo, leaseToRenew, isReplication); // 实例覆盖状态为UNKNOWN,续租失败 if (overriddenInstanceStatus == InstanceStatus.UNKNOWN) { logger.info("Instance status UNKNOWN possibly due to deleted override for instance {}" + "; re-register required", instanceInfo.getId()); RENEW_NOT_FOUND.increment(isReplication); return false; } // 实例状态与覆盖状态不一致 if (!instanceInfo.getStatus().equals(overriddenInstanceStatus)) { logger.info( "The instance status {} is different from overridden instance status {} for instance {}. " + "Hence setting the status to overridden status", instanceInfo.getStatus().name(), instanceInfo.getOverriddenStatus().name(), instanceInfo.getId()); // 强行把实例的覆盖状态设为实例状态 instanceInfo.setStatusWithoutDirty(overriddenInstanceStatus); } } renewsLastMin.increment(); // 续租(设置lastUpdateTimestamp(租约最后更新时间)) leaseToRenew.renew(); return true; } }
/**
 * Renews the lease by setting {@code lastUpdateTimestamp} to the current time plus
 * {@code duration}.
 */
public void renew() {
    final long now = System.currentTimeMillis();
    lastUpdateTimestamp = now + duration;
}
lastUpdateTimestamp本来指的是最后更新时间,而这里直接加上了续约时长duration,这样lastUpdateTimestamp实际表示的就成了过期时间,所以这个地方是有点小bug的。源码在判断是否过期的isExpired方法的注释中也指出了这一点:
/**
 * Checks if the lease of a given {@link com.netflix.appinfo.InstanceInfo} has expired or not.
 *
 * Note that due to renew() doing the "wrong" thing and setting lastUpdateTimestamp to +duration more than
 * what it should be, the expiry will actually be 2 * duration. This is a minor bug and should only affect
 * instances that ungracefully shutdown. Due to possible wide ranging impact to existing usage, this will
 * not be fixed.
 *
 * @param additionalLeaseMs any additional lease time to add to the lease evaluation in ms.
 * @return {@code true} when an eviction timestamp is set or the current time is past
 *         {@code lastUpdateTimestamp + duration + additionalLeaseMs}
 */
public boolean isExpired(long additionalLeaseMs) {
    // A positive eviction timestamp means the lease is expired regardless of the clock.
    if (evictionTimestamp > 0) {
        return true;
    }
    final long now = System.currentTimeMillis();
    return now > (lastUpdateTimestamp + duration + additionalLeaseMs);
}
翻译过来:请注意,由于renew()做了“错误”的事情,把lastUpdateTimestamp多加了一个duration,实际的过期时间会变成2 * duration。这是一个小bug,只会影响非正常关闭的实例。由于修复它可能对现有的使用方式产生广泛影响,因此不会修复。
三、服务下线
一种是优雅下线,是client端发送下线请求,由server端处理,剔除服务。
一种是客户端异常,网络异常等原因,非正常下线。
优雅下线时,服务端由InstanceResource的cancelLease方法接收并处理下线请求。
public Response cancelLease( @HeaderParam(PeerEurekaNode.HEADER_REPLICATION) String isReplication) { try { //服务下线,isReplication为true表示集群信息同步 boolean isSuccess = registry.cancel(app.getName(), id, "true".equals(isReplication)); if (isSuccess) { logger.debug("Found (Cancel): {} - {}", app.getName(), id); return Response.ok().build(); } else { logger.info("Not Found (Cancel): {} - {}", app.getName(), id); return Response.status(Status.NOT_FOUND).build(); } } catch (Throwable e) { logger.error("Error (cancel): {} - {}", app.getName(), id, e); return Response.serverError().build(); } } //InstanceRegistry public boolean cancel(String appName, String serverId, boolean isReplication) { //发布下线事件 handleCancelation(appName, serverId, isReplication); //调用父类下线 return super.cancel(appName, serverId, isReplication); } //PeerAwareInstanceRegistryImpl public boolean cancel(final String appName, final String id, final boolean isReplication) { //调用父类下线 if (super.cancel(appName, id, isReplication)) { //集群同步下线信息 replicateToPeers(Action.Cancel, appName, id, null, null, isReplication); return true; } return false; }
下线
protected boolean internalCancel(String appName, String id, boolean isReplication) { try { read.lock();
//服务取消数增加 CANCEL.increment(isReplication); // 根据实例集合名称取出实例信息集合 Map<String, Lease<InstanceInfo>> gMap = registry.get(appName); Lease<InstanceInfo> leaseToCancel = null; if (gMap != null) { // 删除租约信息,remove后返回id对应的具体实例信息 leaseToCancel = gMap.remove(id); } // 添加到最近取消租约队列 recentCanceledQueue.add(new Pair<Long, String>(System.currentTimeMillis(), appName + "(" + id + ")")); // 覆盖状态map中移除当前服务实例 InstanceStatus instanceStatus = overriddenInstanceStatusMap.remove(id); if (instanceStatus != null) { logger.debug("Removed instance id {} from the overridden map which has value {}", id, instanceStatus.name()); } if (leaseToCancel == null) { CANCEL_NOT_FOUND.increment(isReplication); logger.warn("DS: Registry: cancel failed because Lease is not registered for: {}/{}", appName, id); return false; } else { // 设置取消租约的时间戳 leaseToCancel.cancel();
// 获取实例信息 InstanceInfo instanceInfo = leaseToCancel.getHolder(); String vip = null; String svip = null; if (instanceInfo != null) { // 设置实例信息中的ActionType为Delete instanceInfo.setActionType(ActionType.DELETED); recentlyChangedQueue.add(new RecentlyChangedItem(leaseToCancel));
//修改最后操作时间 instanceInfo.setLastUpdatedTimestamp(); vip = instanceInfo.getVIPAddress(); svip = instanceInfo.getSecureVipAddress(); } // 使缓存无效,调用responseCache.invalidate让服务在缓存中失效 invalidateCache(appName, vip, svip); logger.info("Cancelled instance {}/{} (replication={})", appName, id, isReplication); } } finally { read.unlock(); } synchronized (lock) { if (this.expectedNumberOfClientsSendingRenews > 0) { this.expectedNumberOfClientsSendingRenews = this.expectedNumberOfClientsSendingRenews - 1; // updateRenewsPerMinThreshold(); } } return true; }
四、集群同步
在客户端的注册、心跳、下线等都会触发集群同步操作,都会走到下面的这个方法中,通过action的值判断,到底是哪种同步操作
private void replicateToPeers(Action action, String appName, String id, InstanceInfo info, InstanceStatus newStatus, //当是集群同步的请求时,值为true boolean isReplication) { Stopwatch tracer = action.getTimer().start(); try { if (isReplication) { //操作次数+1 numberOfReplicationsLastMin.increment(); } // 集群节点为null,或者是集群同步请求时,直接返回,防止循环同步 if (peerEurekaNodes == Collections.EMPTY_LIST || isReplication) { return; } for (final PeerEurekaNode node : peerEurekaNodes.getPeerEurekaNodes()) { // 判断是否是自己,是自己则直接跳过 if (peerEurekaNodes.isThisMyUrl(node.getServiceUrl())) { continue; } //集群同步 replicateInstanceActionsToPeers(action, appName, id, info, newStatus, node); } } finally { tracer.stop(); } }
集群同步
/**
 * Replicates a single instance action to one peer node by calling the matching method of
 * {@code PeerEurekaNode}. Any failure is logged and not propagated.
 */
private void replicateInstanceActionsToPeers(Action action, String appName,
                                             String id, InstanceInfo info, InstanceStatus newStatus,
                                             PeerEurekaNode node) {
    try {
        InstanceInfo infoFromRegistry;
        // Set for the duration of the call; removed again in the finally clause.
        CurrentRequestVersion.set(Version.V2);
        switch (action) {
            // Instance cancellation (take-down)
            case Cancel:
                node.cancel(appName, id);
                break;
            // Heartbeat (lease renewal)
            case Heartbeat:
                InstanceStatus overriddenStatus = overriddenInstanceStatusMap.get(id);
                infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                node.heartbeat(appName, id, infoFromRegistry, overriddenStatus, false);
                break;
            // Instance registration
            case Register:
                node.register(info);
                break;
            // Status update: UP/DOWN/STARTING/OUT_OF_SERVICE/UNKNOWN
            case StatusUpdate:
                infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                node.statusUpdate(appName, id, newStatus, infoFromRegistry);
                break;
            // Removal of the status override (presumably resets InstanceInfo.overriddenStatus
            // to InstanceStatus.UNKNOWN on the peer — confirm against PeerEurekaNode)
            case DeleteStatusOverride:
                infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                node.deleteStatusOverride(appName, id, infoFromRegistry);
                break;
        }
    } catch (Throwable t) {
        logger.error("Cannot replicate information to {} for action {}", node.getServiceUrl(), action.name(), t);
    } finally {
        CurrentRequestVersion.remove();
    }
}
状态枚举
starting:实例初始化状态,此状态主要给实例预留初始化时间
down:当健康检查失败时,实例的状态转变到down
up:正常服务状态
out_of_service:不参与接收服务,但是服务正常
unknown:未知状态
为什么一定需要一个覆盖状态呢?
假如只有一个状态字段,当客户端调用修改状态的接口,把状态由up改为out_of_service(即status = out_of_service)时,会出现下面的问题:
1、client端调用updateStatus更新实例状态到out_of_service,即status = out_of_service,此时client的状态是up状态。
2、server端此实例的状态现在是out_of_service,并且标记responseCache无效。
3、客户端等待定时器定时更新实例的状态,但是由于时间间隔没有更新,所以client的状态还是up状态。
4、client端发送续租请求renew,由于此时客户端上报的状态仍是up,而server端的状态是out_of_service,server端的状态会被客户端上报的状态覆盖,此前设置的out_of_service又被改回了原来的up。
引入覆盖状态后
1、客户端调用updateStatus方法时,同时更新server端实例的status和overriddenStatus状态。
2、客户端调用renew方法时,也要更新server端实例的status和overriddenStatus状态,但是有以下规则:
(1)如果客户端上传的实例状态是down或者starting,表明客户端是重启或者healthCheck失败。此时这个实例不能作为服务提供服务。因此即使客户端调用updateStatus把实例状态更新为up,也是没用的。此时客户端实例的准确状态就是down或者starting。
(2)如果客户端的实例是up或者out_of_service,此时是不可信的。有可能client端的实例状态已被改变,此时要使用overriddenstatus状态作为当前实例的状态,避免被覆盖。
(3)情况2中的overriddenstatus有可能不存在,缓存失效,此时要使用server端已经存在的实例的状态。
每种操作进行集群同步的时候,就是往其他集群节点发送一个一模一样的操作请求,但是isReplication参数是true,防止循环同步。我们就挑注册操作看一下:
/**
 * Submits a replication task to the batching dispatcher that sends the register request
 * for the given instance through the replication client.
 *
 * @param info the instance whose registration is replicated
 * @throws Exception declared by the signature; the visible body does not throw explicitly
 */
public void register(final InstanceInfo info) throws Exception {
    // Expiry time handed to the dispatcher: current time plus the lease renewal value of the instance.
    long expiryTime = System.currentTimeMillis() + getLeaseRenewalOf(info);
    // Hand the task over to the batching dispatcher.
    batchingDispatcher.process(
            taskId("register", info),
            // The replication task; execute() performs the actual request.
            new InstanceReplicationTask(targetHost, Action.Register, info, null, true) {
                public EurekaHttpResponse<Void> execute() {
                    // Send the register request.
                    return replicationClient.register(info);
                }
            },
            expiryTime
    );
}
五、服务剔除
Eureka Server的服务剔除是通过定时任务完成的,在EurekaBootStrap启动引导的initEurekaServerContext上下文初始化方法中,调用了这么一行代码registry.openForTraffic(applicationInfoManager, registryCount),调用的是InstanceRegistry的openForTraffic方法,最终调用了AbstractInstanceRegistry的postInit方法来初始化服务剔除的定时任务。
protected void postInit() { renewsLastMin.start(); if (evictionTaskRef.get() != null) { //如果服务剔除任务不为空,就执行cancel方法 //该方法把任务的状态修改为了cancel任务取消 evictionTaskRef.get().cancel(); } //创建新的服务剔除任务 evictionTaskRef.set(new EvictionTask()); //交给调度器去执行,延迟60s,每60s执行一次驱逐任务 evictionTimer.schedule(evictionTaskRef.get(), serverConfig.getEvictionIntervalTimerInMs(), serverConfig.getEvictionIntervalTimerInMs()); } class EvictionTask extends TimerTask { private final AtomicLong lastExecutionNanosRef = new AtomicLong(0l); @Override public void run() { try { //计算任务执行的时间偏差:补偿时间 long compensationTimeMs = getCompensationTimeMs(); logger.info("Running the evict task with compensationTime {}ms", compensationTimeMs); //执行剔除 evict(compensationTimeMs); } catch (Throwable e) { logger.error("Could not run the evict task", e); } } }
执行剔除
/**
 * Evicts expired leases, limited to a share of the registry size and chosen in random order.
 *
 * @param additionalLeaseMs additional lease time in ms granted when checking for expiry
 */
public void evict(long additionalLeaseMs) {
    logger.debug("Running the evict task");
    // Lease expiration is disabled (self-preservation): evict nothing.
    if (!isLeaseExpirationEnabled()) {
        logger.debug("DS: lease expiration is currently disabled.");
        return;
    }
    // First collect all expired leases; they are evicted in random order below.
    List<Lease<InstanceInfo>> expiredLeases = new ArrayList<>();
    // Walk over every application in the registry.
    for (Entry<String, Map<String, Lease<InstanceInfo>>> groupEntry : registry.entrySet()) {
        Map<String, Lease<InstanceInfo>> leaseMap = groupEntry.getValue();
        if (leaseMap != null) {
            // Walk over every lease of the application.
            for (Entry<String, Lease<InstanceInfo>> leaseEntry : leaseMap.entrySet()) {
                Lease<InstanceInfo> lease = leaseEntry.getValue();
                // Collect the lease when it is expired and still has a holder.
                if (lease.isExpired(additionalLeaseMs) && lease.getHolder() != null) {
                    expiredLeases.add(lease);
                }
            }
        }
    }
    // To compensate for GC pauses or drifting local time, the current registry size is used
    // to cap the eviction; without the cap the whole registry could be wiped out.
    // Registry size
    int registrySize = (int) getLocalRegistrySize();
    // Number of instances that must remain = registry size * configured renewal percent threshold
    int registrySizeThreshold = (int) (registrySize * serverConfig.getRenewalPercentThreshold());
    // Eviction limit = registry size - threshold
    int evictionLimit = registrySize - registrySizeThreshold;
    // Evict the smaller of the expired count and the limit; nothing happens when it is 0.
    int toEvict = Math.min(expiredLeases.size(), evictionLimit);
    if (toEvict > 0) {
        logger.info("Evicting {} items (expired={}, evictionLimit={})", toEvict, expiredLeases.size(), evictionLimit);
        Random random = new Random(System.currentTimeMillis());
        for (int i = 0; i < toEvict; i++) {
            // Swap a randomly chosen remaining lease into position i, so evictions are spread
            // randomly instead of possibly removing all instances of one application.
            int next = i + random.nextInt(expiredLeases.size() - i);
            Collections.swap(expiredLeases, i, next);
            // The lease to evict
            Lease<InstanceInfo> lease = expiredLeases.get(i);
            // Application name
            String appName = lease.getHolder().getAppName();
            // Instance id
            String id = lease.getHolder().getId();
            // Count the expiry.
            EXPIRED.increment();
            logger.warn("DS: Registry: expired lease for {}/{}", appName, id);
            // The actual eviction.
            internalCancel(appName, id, false);
        }
    }
}
判断是否过期,这里的bug在之前也已经提到过了,不过影响不大
public boolean isExpired(long additionalLeaseMs) { //evictionTimestamp (剔除时间戳) > 0 || 最后更新时间戳 + 租期(90s) + 补偿时间 return (evictionTimestamp > 0 || System.currentTimeMillis() > (lastUpdateTimestamp + duration + additionalLeaseMs)); }
核心剔除方法internalCancel走的和服务下线是同一个方法,这里就不再详述了。
六、自我保护机制
默认情况下,当EurekaServer在一定时间内(默认90秒)没有接收到某个客户端实例的心跳,EurekaServer将会注销该实例。但是当网络分区故障发生时,客户端与EurekaServer之间无法正常通信,此时不应该注销客户端。Eureka通过“自我保护机制”来解决这个问题:当EurekaServer短时间内丢失过多客户端时,这个节点就会进入自我保护模式。在自我保护模式下,EurekaServer不会剔除任何客户端。当网络故障恢复后,该节点会自动退出自我保护模式
自我保护机制的实现是基于维护服务注册表的类AbstractInstanceRegistry中的2个变量来维护的。
//期望每分钟最小续租次数 protected volatile int numberOfRenewsPerMinThreshold; //期望每分钟最大续租次数 protected volatile int expectedNumberOfClientsSendingRenews;
在eureka启动时,集群同步之后执行了一个方法registry.openForTraffic(applicationInfoManager, registryCount),跟服务剔除进入的是同一个地方。
在方法内会调用updateRenewsPerMinThreshold()方法修改值,
protected void updateRenewsPerMinThreshold() { //每分钟最小续租次数=预估心跳续租次数*(60 / 配置的心跳间隔时间,默认30s)*自我保护机制触发的百分比,默认85% this.numberOfRenewsPerMinThreshold = (int) (this.expectedNumberOfClientsSendingRenews * (60.0 / serverConfig.getExpectedClientRenewalIntervalSeconds()) * serverConfig.getRenewalPercentThreshold()); }
其次在服务剔除,下线,注册以及初始化的时候(开启了一个定时线程)去修改这个值。然后在服务剔除的时候会判断是否开启了自我保护机制,如果开启了自我保护机制,就不会剔除服务。
··· if (!isLeaseExpirationEnabled()) { logger.debug("DS: lease expiration is currently disabled."); return; } ···
public boolean isLeaseExpirationEnabled() { //enableSelfPreservation默认是true,开启了自我保护机制 if (!isSelfPreservationModeEnabled()) { // 如果没有开启自我保护机制,直接返回 return true; } //getNumOfRenewsInLastMin()方法计算出了上一分钟服务实例一共发送过来多少次心跳 //numberOfRenewsPerMinThreshold则是期望的一分钟内最少要有多少次心跳,通过上面的公式计算出来的 //如果上一分钟发送过来的心跳次数小于了期望值numberOfRenewsPerMinThreshold,则返回false,触发自我保护机制 return numberOfRenewsPerMinThreshold > 0 && getNumOfRenewsInLastMin() > numberOfRenewsPerMinThreshold; }
七、服务发现
全量拉取
服务全量发现是通过调用ApplicationsResource中的getContainers方法来实现的
/**
 * Returns the information about all registered applications (full fetch), served from the
 * response cache in JSON or XML and optionally gzip-encoded.
 */
public Response getContainers(@PathParam("version") String version,
                              @HeaderParam(HEADER_ACCEPT) String acceptHeader,
                              @HeaderParam(HEADER_ACCEPT_ENCODING) String acceptEncoding,
                              @HeaderParam(EurekaAccept.HTTP_X_EUREKA_ACCEPT) String eurekaAccept,
                              @Context UriInfo uriInfo,
                              @Nullable @QueryParam("regions") String regionsStr) {
    boolean isRemoteRegionRequested = null != regionsStr && !regionsStr.isEmpty();
    String[] regions = null;
    if (!isRemoteRegionRequested) {
        EurekaMonitors.GET_ALL.increment();
    } else {
        regions = regionsStr.toLowerCase().split(",");
        Arrays.sort(regions); // So we don't have different caches for same regions queried in different order.
        EurekaMonitors.GET_ALL_WITH_REMOTE_REGIONS.increment();
    }
    // Check whether access is currently allowed; answer 403 otherwise.
    if (!registry.shouldAllowAccess(isRemoteRegionRequested)) {
        return Response.status(Status.FORBIDDEN).build();
    }
    CurrentRequestVersion.set(Version.toEnum(version));
    // JSON is used only when the Accept header asks for it; XML is the fallback.
    KeyType keyType = Key.KeyType.JSON;
    String returnMediaType = MediaType.APPLICATION_JSON;
    if (acceptHeader == null || !acceptHeader.contains(HEADER_JSON_VALUE)) {
        keyType = Key.KeyType.XML;
        returnMediaType = MediaType.APPLICATION_XML;
    }
    Key cacheKey = new Key(Key.EntityType.Application,
            ResponseCacheImpl.ALL_APPS,
            keyType, CurrentRequestVersion.get(), EurekaAccept.fromString(eurekaAccept), regions
    );
    Response response;
    if (acceptEncoding != null && acceptEncoding.contains(HEADER_GZIP_VALUE)) {
        response = Response.ok(responseCache.getGZIP(cacheKey))
                .header(HEADER_CONTENT_ENCODING, HEADER_GZIP_VALUE)
                .header(HEADER_CONTENT_TYPE, returnMediaType)
                .build();
    } else {
        // Fetch the application list from the response cache.
        response = Response.ok(responseCache.get(cacheKey))
                .build();
    }
    CurrentRequestVersion.remove();
    return response;
}

// ResponseCacheImpl
/**
 * Returns the cached payload for the key, or {@code null} when there is no value or the
 * payload equals {@code EMPTY_PAYLOAD}.
 */
String get(final Key key, boolean useReadOnlyCache) {
    // Core lookup
    Value payload = getValue(key, useReadOnlyCache);
    if (payload == null || payload.getPayload().equals(EMPTY_PAYLOAD)) {
        return null;
    } else {
        return payload.getPayload();
    }
}

/**
 * Looks the key up in the read-only cache first (when enabled) and falls back to the
 * read-write cache. Any failure is logged and {@code null} is returned.
 */
Value getValue(final Key key, boolean useReadOnlyCache) {
    Value payload = null;
    try {
        // Is the read-only cache to be used?
        if (useReadOnlyCache) {
            // Try the read-only cache first.
            final Value currentPayload = readOnlyCacheMap.get(key);
            if (currentPayload != null) {
                payload = currentPayload;
            } else {
                // Miss in the read-only cache: read from the read-write cache ...
                payload = readWriteCacheMap.get(key);
                // ... and copy the value into the read-only cache.
                readOnlyCacheMap.put(key, payload);
            }
        } else {
            // Read straight from the read-write cache.
            payload = readWriteCacheMap.get(key);
        }
    } catch (Throwable t) {
        logger.error("Cannot get value for key : {}", key, t);
    }
    return payload;
}
eureka存在两个缓存,一个只读缓存(ConcurrentMap<Key, Value>),一个读写缓存(LoadingCache<Key, Value>,guava自己封装的一种缓存结构)。
/**
 * Builds the response cache: creates the read-write cache and, when the read-only cache is
 * enabled, schedules the task returned by {@code getCacheUpdateTask()} at the configured
 * update interval.
 */
ResponseCacheImpl(EurekaServerConfig serverConfig, ServerCodecs serverCodecs, AbstractInstanceRegistry registry) {
    this.serverConfig = serverConfig;
    this.serverCodecs = serverCodecs;
    this.shouldUseReadOnlyResponseCache = serverConfig.shouldUseReadOnlyResponseCache();
    this.registry = registry;
    long responseCacheUpdateIntervalMs = serverConfig.getResponseCacheUpdateIntervalMs();
    // Build the read-write cache.
    this.readWriteCacheMap =
            CacheBuilder.newBuilder()
                    // Initial capacity taken from the server config
                    .initialCapacity(serverConfig.getInitialCapacityOfResponseCache())
                    // Entries expire a configured number of seconds after being written
                    .expireAfterWrite(serverConfig.getResponseCacheAutoExpirationInSeconds(), TimeUnit.SECONDS)
                    // When a key with regions is removed, drop its mapping from regionSpecificKeys
                    .removalListener(new RemovalListener<Key, Value>() {
                        @Override
                        public void onRemoval(RemovalNotification<Key, Value> notification) {
                            Key removedKey = notification.getKey();
                            if (removedKey.hasRegions()) {
                                Key cloneWithNoRegions = removedKey.cloneWithoutRegions();
                                regionSpecificKeys.remove(cloneWithNoRegions, removedKey);
                            }
                        }
                    })
                    // Loader invoked by the cache to produce the value of a key that is not cached
                    .build(new CacheLoader<Key, Value>() {
                        @Override
                        public Value load(Key key) throws Exception {
                            if (key.hasRegions()) {
                                // Remember the region-specific key under its region-less clone.
                                Key cloneWithNoRegions = key.cloneWithoutRegions();
                                regionSpecificKeys.put(cloneWithNoRegions, key);
                            }
                            Value value = generatePayload(key);
                            return value;
                        }
                    });
    // Read-only cache: schedule the periodic update task; the first run is aligned to the
    // next multiple of the update interval.
    if (shouldUseReadOnlyResponseCache) {
        timer.schedule(getCacheUpdateTask(),
                new Date(((System.currentTimeMillis() / responseCacheUpdateIntervalMs) * responseCacheUpdateIntervalMs)
                        + responseCacheUpdateIntervalMs),
                responseCacheUpdateIntervalMs);
    }
    try {
        Monitors.registerObject(this);
    } catch (Throwable e) {
        logger.warn("Cannot register the JMX monitor for the InstanceRegistry", e);
    }
}
只读缓存比较简单,就是一个map数据结构,我们看看从读写缓存中读取
/**
 * Generates the payload for a cache key by reading the registry: all applications, the delta,
 * a single application, or the applications of a VIP / secure VIP, depending on the key.
 */
private Value generatePayload(Key key) {
    Stopwatch tracer = null;
    try {
        String payload;
        switch (key.getEntityType()) {
            case Application:
                boolean isRemoteRegionRequested = key.hasRegions();
                // Full fetch
                if (ALL_APPS.equals(key.getName())) {
                    if (isRemoteRegionRequested) {
                        tracer = serializeAllAppsWithRemoteRegionTimer.start();
                        payload = getPayLoad(key, registry.getApplicationsFromMultipleRegions(key.getRegions()));
                    } else {
                        tracer = serializeAllAppsTimer.start();
                        // registry.getApplications() supplies all applications; getPayLoad encodes them.
                        payload = getPayLoad(key, registry.getApplications());
                    }
                // Delta fetch
                } else if (ALL_APPS_DELTA.equals(key.getName())) {
                    if (isRemoteRegionRequested) {
                        tracer = serializeDeltaAppsWithRemoteRegionTimer.start();
                        versionDeltaWithRegions.incrementAndGet();
                        versionDeltaWithRegionsLegacy.incrementAndGet();
                        payload = getPayLoad(key,
                                registry.getApplicationDeltasFromMultipleRegions(key.getRegions()));
                    } else {
                        tracer = serializeDeltaAppsTimer.start();
                        versionDelta.incrementAndGet();
                        versionDeltaLegacy.incrementAndGet();
                        payload = getPayLoad(key, registry.getApplicationDeltas());
                    }
                } else {
                    // Any other name is treated as the name of a single application.
                    tracer = serializeOneApptimer.start();
                    payload = getPayLoad(key, registry.getApplication(key.getName()));
                }
                break;
            case VIP:
            case SVIP:
                tracer = serializeViptimer.start();
                payload = getPayLoad(key, getApplicationsForVip(key, registry));
                break;
            default:
                logger.error("Unidentified entity type: {} found in the cache key.", key.getEntityType());
                payload = "";
                break;
        }
        return new Value(payload);
    } finally {
        // Stop the timer that was started for the chosen branch, if any.
        if (tracer != null) {
            tracer.stop();
        }
    }
}
读写缓存在加载数据时会调用generatePayload方法,这里主要有两个分支:全量获取和增量获取。全量获取调用的是registry.getApplicationsFromMultipleRegions(key.getRegions())(未指定region时调用registry.getApplications()),逻辑比较简单,就是从我们的注册表ConcurrentHashMap<String, Map<String, Lease<InstanceInfo>>> registry 中获取全部已注册的实例信息。增量获取调用的则是registry.getApplicationDeltasFromMultipleRegions(key.getRegions())方法(未指定region时调用registry.getApplicationDeltas()),我们具体来分析一下这个方法。
public Applications getApplicationDeltasFromMultipleRegions(String[] remoteRegions) { if (null == remoteRegions) { remoteRegions = allKnownRemoteRegions; // null means all remote regions. } boolean includeRemoteRegion = remoteRegions.length != 0; if (includeRemoteRegion) { GET_ALL_WITH_REMOTE_REGIONS_CACHE_MISS_DELTA.increment(); } else { GET_ALL_CACHE_MISS_DELTA.increment(); } //增量数据 Applications apps = new Applications(); //设置版本号 apps.setVersion(responseCache.getVersionDeltaWithRegions().get()); Map<String, Application> applicationInstancesMap = new HashMap<String, Application>(); try { write.lock(); //最近修改过的实例队列,包括,最新注册,最近修改(心跳续约),最近下线 Iterator<RecentlyChangedItem> iter = this.recentlyChangedQueue.iterator(); logger.debug("The number of elements in the delta queue is :{}", this.recentlyChangedQueue.size()); //遍历增量队列,保存至apps while (iter.hasNext()) { Lease<InstanceInfo> lease = iter.next().getLeaseInfo(); InstanceInfo instanceInfo = lease.getHolder(); logger.debug("The instance id {} is found with status {} and actiontype {}", instanceInfo.getId(), instanceInfo.getStatus().name(), instanceInfo.getActionType().name()); Application app = applicationInstancesMap.get(instanceInfo.getAppName()); if (app == null) { app = new Application(instanceInfo.getAppName()); applicationInstancesMap.put(instanceInfo.getAppName(), app); apps.addApplication(app); } app.addInstance(new InstanceInfo(decorateInstanceInfo(lease))); } if (includeRemoteRegion) { for (String remoteRegion : remoteRegions) { RemoteRegionRegistry remoteRegistry = regionNameVSRemoteRegistry.get(remoteRegion); if (null != remoteRegistry) { Applications remoteAppsDelta = remoteRegistry.getApplicationDeltas(); if (null != remoteAppsDelta) { for (Application application : remoteAppsDelta.getRegisteredApplications()) { if (shouldFetchFromRemoteRegistry(application.getName(), remoteRegion)) { Application appInstanceTillNow = apps.getRegisteredApplications(application.getName()); if (appInstanceTillNow == null) { 
appInstanceTillNow = new Application(application.getName()); apps.addApplication(appInstanceTillNow); } for (InstanceInfo instanceInfo : application.getInstances()) { appInstanceTillNow.addInstance(new InstanceInfo(instanceInfo)); } } } } } } } //全量数据 Applications allApps = getApplicationsFromMultipleRegions(remoteRegions); //设置全量数据的hashcode 设置到 增量数据的hashcode //客户端拿到这个hashcode 会跟 自己本地的全量数据+增量数据的 hashcode 进行比较,如果不匹配还会再次拉取 apps.setAppsHashCode(allApps.getReconcileHashCode()); return apps; } finally { write.unlock(); } }
服务端在启动的时候还会初始化一个定时任务,定时清除recentlyChangedQueue队列中过期的数据
private TimerTask getDeltaRetentionTask() { return new TimerTask() { @Override public void run() { Iterator<RecentlyChangedItem> it = recentlyChangedQueue.iterator(); //判断队列中最近的更新时间 是否 小于 当前系统时间 - 可保存在队列中的时间(可配置的,默认3分钟) //就是一个定时清除的功能 while (it.hasNext()) { if (it.next().getLastUpdateTime() < System.currentTimeMillis() - serverConfig.getRetentionTimeInMSInDeltaQueue()) { it.remove(); } else { break; } } } }; }
增量拉取
服务增量发现是通过调用ApplicationsResource中的getContainerDifferential方法来实现的
/**
 * Returns the delta of recently changed applications (delta fetch), served from the response
 * cache in JSON or XML and optionally gzip-encoded. Answers 403 when the delta is disabled
 * or access is not allowed.
 */
public Response getContainerDifferential(
        @PathParam("version") String version,
        @HeaderParam(HEADER_ACCEPT) String acceptHeader,
        @HeaderParam(HEADER_ACCEPT_ENCODING) String acceptEncoding,
        @HeaderParam(EurekaAccept.HTTP_X_EUREKA_ACCEPT) String eurekaAccept,
        @Context UriInfo uriInfo, @Nullable @QueryParam("regions") String regionsStr) {
    boolean isRemoteRegionRequested = null != regionsStr && !regionsStr.isEmpty();
    // If the delta flag is disabled in discovery or if the lease expiration
    // has been disabled, redirect clients to get all instances
    if ((serverConfig.shouldDisableDelta()) || (!registry.shouldAllowAccess(isRemoteRegionRequested))) {
        return Response.status(Status.FORBIDDEN).build();
    }
    String[] regions = null;
    if (!isRemoteRegionRequested) {
        EurekaMonitors.GET_ALL_DELTA.increment();
    } else {
        regions = regionsStr.toLowerCase().split(",");
        Arrays.sort(regions); // So we don't have different caches for same regions queried in different order.
        EurekaMonitors.GET_ALL_DELTA_WITH_REMOTE_REGIONS.increment();
    }
    CurrentRequestVersion.set(Version.toEnum(version));
    // JSON is used only when the Accept header asks for it; XML is the fallback.
    KeyType keyType = Key.KeyType.JSON;
    String returnMediaType = MediaType.APPLICATION_JSON;
    if (acceptHeader == null || !acceptHeader.contains(HEADER_JSON_VALUE)) {
        keyType = Key.KeyType.XML;
        returnMediaType = MediaType.APPLICATION_XML;
    }
    Key cacheKey = new Key(Key.EntityType.Application,
            ResponseCacheImpl.ALL_APPS_DELTA,
            keyType, CurrentRequestVersion.get(), EurekaAccept.fromString(eurekaAccept), regions
    );
    final Response response;
    if (acceptEncoding != null && acceptEncoding.contains(HEADER_GZIP_VALUE)) {
        response = Response.ok(responseCache.getGZIP(cacheKey))
                .header(HEADER_CONTENT_ENCODING, HEADER_GZIP_VALUE)
                .header(HEADER_CONTENT_TYPE, returnMediaType)
                .build();
    } else {
        // Core fetch: the same response-cache lookup as the full fetch, with the delta key.
        response = Response.ok(responseCache.get(cacheKey)).build();
    }
    CurrentRequestVersion.remove();
    return response;
}
// NOTE(review): this brace presumably closes the enclosing resource class, whose opening is
// outside this excerpt.
}
直接通过全量的逻辑走到generatePayload方法,里面有一个增量拉取的逻辑,上面已经分析过了。