服务健康状态指标收集——Prometheus&Druid
这一篇主要是介绍一种使用Prometheus收集服务健康状态指标的案例,以常用的Druid数据源为例。
先看几个公共类的定义:
HealthStats:其中定义了MeterRegistry和namespace(用来区分不同的统计对象)
/**
 * Base description of one monitored object: the meter registry its metrics are published to
 * and the namespace that tells it apart from other monitored objects.
 */
public class HealthStats {

    /** Registry that receives the metrics of this object. */
    private MeterRegistry registry;

    /** Distinguishes this statistics object from the others. */
    private String namespace;

    /** Creates an instance with neither registry nor namespace set. */
    public HealthStats() {
        // Both fields intentionally keep their default value.
    }

    /**
     * @param registry  registry that receives the metrics
     * @param namespace name that identifies the monitored object
     */
    public HealthStats(MeterRegistry registry, String namespace) {
        this.namespace = namespace;
        this.registry = registry;
    }

    /** @return the registry given at construction, or {@code null} if none was given */
    public MeterRegistry getRegistry() {
        return this.registry;
    }

    /** @return the namespace given at construction, or {@code null} if none was given */
    public String getNamespace() {
        return this.namespace;
    }
}
HealthStatsDefault:继承了HealthStats,指标收集操作的核心类。
public abstract class HealthStatsDefault extends HealthStats { //指标收集器集合 private final Map<String, AtomicDouble> gaugeCollector = new ConcurrentHashMap<>(); //存储指标标签的集合 private final Map<String, String> constantsCollector = new ConcurrentHashMap<>(); public Map<String, AtomicDouble> getGaugeCollector() { return gaugeCollector; } public Map<String, String> getConstantsCollector() { return constantsCollector; } public HealthStatsDefault(MeterRegistry registry, String namespace) { super(registry, namespace); } public void constantsCollect(String gaugeName, String value) { constantsCollector.put(gaugeName, value); } public void infoCollect() { //对于Druid数据源,metricsName:pepper.gauge.druid.default.info String metricsName = MetricsNameBuilder.builder() .setMetricsType(MetricsType.GAUGE) .setType(getType()) .setSubType(getSubType()) .setName("Info") .build(); List<Tag> tags = new ArrayList<>(); tags.add(Tag.of("namespace", getNamespace())); //这里的namespace就是在caf框架中使用@EnableDataSource所定义的,如shua-kxct for (Map.Entry<String, String> entry : constantsCollector.entrySet()) { tags.add(Tag.of(entry.getKey(), entry.getValue())); } getRegistry().gauge(metricsName, tags, 1); } public void gaugeCollect(String gaugeName, double value) { getOrInitGauge(gaugeName, () -> new String[]{"GaugeName", gaugeName, "namespace", getNamespace()}).set(value); } public void gaugeCollect(String gaugeName, double value, String... 
additionalTags) { if (ArrayUtils.isEmpty(additionalTags)) { gaugeCollect(gaugeName, value); } String[] defaultTags = new String[]{"GaugeName", gaugeName, "namespace", getNamespace()}; String[] tags = Arrays.copyOf(defaultTags, defaultTags.length + additionalTags.length); System.arraycopy(additionalTags, 0, tags, defaultTags.length, additionalTags.length); getOrInitGauge(gaugeName, () -> tags).set(value); } private AtomicDouble getOrInitGauge(String gaugeName, Tags tagsFuc) { final AtomicDouble gauge = gaugeCollector.get(gaugeName); if (gauge != null) return gauge; synchronized (gaugeCollector) { if (gaugeCollector.get(gaugeName) == null) { final AtomicDouble obj = new AtomicDouble(); String metricsName = MetricsNameBuilder.builder() .setMetricsType(MetricsType.GAUGE) .setType(getType()) .setSubType(getSubType()) .setName(gaugeName) .build(); Gauge.builder(metricsName, obj, AtomicDouble::get).tags(tagsFuc.tags()).register(getRegistry()); gaugeCollector.putIfAbsent(gaugeName, obj); } } return gaugeCollector.get(gaugeName); } public abstract String getType(); public abstract String getSubType();
HealthTracker:健康状态追踪器,核心任务调度类,负责存储要收集的统计对象,通过加载扩展的调度器,去定时执行指标收集任务。
public class HealthTracker { //存储要收集的健康指标集合 private static final Set<HealthStats> HEALTH_STAT_SET = Sets.newConcurrentHashSet(); //指标收集任务调度器 private static final ScheduledExecutorService scheduledExecutor; static { scheduledExecutor = Executors.newSingleThreadScheduledExecutor(new ThreadFactory()); //定义任务调度器,延迟30秒初始化,每隔60秒执行一次 scheduledExecutor.scheduleAtFixedRate(() -> { try { //获取扩展的任务调度器,遍历执行 final List<HealthScheduledRun> extensions = ExtensionLoader.getExtensionLoader(HealthScheduledRun.class).getExtensions(); for (HealthScheduledRun extension : extensions) { extension.run(HEALTH_STAT_SET); } } catch (Exception e) { e.printStackTrace(); } }, 30, 60, TimeUnit.SECONDS); } public static void addStats(HealthStats stats) { HEALTH_STAT_SET.add(stats); } }
接下来以Druid为例,看下监控指标收集的过程。
任务调度执行器:DruidHealthStatsScheduled,通过SPI机制实现了调度器接口,并使用Druid对外暴露的监控数据接口来获取健康状态数据。
@SpiMeta(name = "druidHealthStatsScheduled") @ExtensionOrder(value = 1) public class DruidHealthStatsScheduled implements HealthScheduledRun { @Override public void run(Set<HealthStats> healthStats) { Map<String, DruidHealthStats> statsMap = transferStats(healthStats); //使用Druid对外暴露监控数据的接口获取Druid运行状态数据 List<Map<String, Object>> statDataList = DruidStatManagerFacade.getInstance().getDataSourceStatDataList(); for (Map<String, Object> map : statDataList) { JSONObject data = JSONObject.parseObject(JSONObject.toJSONString(map)); String name = data.getString("Name"); Map<String, Object> innerMap = data.getInnerMap(); DruidHealthStats druidHealthStats; if (StringUtils.isNotEmpty(name) && (druidHealthStats = statsMap.get(name)) != null) { //将Druid对外暴露的一些指标放入标签集合中 druidHealthStats.constantsCollect(DruidHealthQuota.NAME, data.getOrDefault(DruidHealthQuota.NAME, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.DB_TYPE, data.getOrDefault(DruidHealthQuota.DB_TYPE, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.URL, truncateUrl(data.getOrDefault(DruidHealthQuota.URL, "null").toString())); druidHealthStats.constantsCollect(DruidHealthQuota.TEST_ON_BORROW, data.getOrDefault(DruidHealthQuota.TEST_ON_BORROW, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.TEST_ON_RETURN, data.getOrDefault(DruidHealthQuota.TEST_ON_RETURN, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.TEST_ON_IDLE, data.getOrDefault(DruidHealthQuota.TEST_ON_IDLE, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.DEFAULT_AUTO_COMMIT, data.getOrDefault(DruidHealthQuota.DEFAULT_AUTO_COMMIT, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.DEFAULT_READ_ONLY, data.getOrDefault(DruidHealthQuota.DEFAULT_READ_ONLY, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.DEFAULT_TRANSACTION_ISOLATION, data.getOrDefault(DruidHealthQuota.DEFAULT_TRANSACTION_ISOLATION, 
"null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.REMOVE_ABANDONED, data.getOrDefault(DruidHealthQuota.REMOVE_ABANDONED, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.KEEP_ALIVE, data.getOrDefault(DruidHealthQuota.KEEP_ALIVE, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.FAIL_FAST, data.getOrDefault(DruidHealthQuota.FAIL_FAST, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.MAX_WAIT, data.getOrDefault(DruidHealthQuota.MAX_WAIT, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.MAX_WAIT_THREAD_COUNT, data.getOrDefault(DruidHealthQuota.MAX_WAIT_THREAD_COUNT, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.POOL_PREPARED_STATEMENTS, data.getOrDefault(DruidHealthQuota.POOL_PREPARED_STATEMENTS, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.LOG_DIFFERENT_THREAD, data.getOrDefault(DruidHealthQuota.LOG_DIFFERENT_THREAD, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.USE_UNFAIR_LOCK, data.getOrDefault(DruidHealthQuota.USE_UNFAIR_LOCK, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.INIT_GLOBAL_VARIANTS, data.getOrDefault(DruidHealthQuota.INIT_GLOBAL_VARIANTS, "null").toString()); druidHealthStats.constantsCollect(DruidHealthQuota.INIT_VARIANTS, data.getOrDefault(DruidHealthQuota.INIT_VARIANTS, "null").toString()); //这里是将Druid监控数据赋值到Prometheus中 if (innerMap.containsKey(DruidHealthQuota.WAIT_THREAD_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.WAIT_THREAD_COUNT, data.getLong(DruidHealthQuota.WAIT_THREAD_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.NOT_EMPTY_WAIT_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.NOT_EMPTY_WAIT_COUNT, data.getLong(DruidHealthQuota.NOT_EMPTY_WAIT_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.NOT_EMPTY_WAIT_MILLIS)) { druidHealthStats.gaugeCollect(DruidHealthQuota.NOT_EMPTY_WAIT_MILLIS, 
data.getLong(DruidHealthQuota.NOT_EMPTY_WAIT_MILLIS)); } if (innerMap.containsKey(DruidHealthQuota.POOLING_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.POOLING_COUNT, data.getLong(DruidHealthQuota.POOLING_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.POOLING_PEAK)) { druidHealthStats.gaugeCollect(DruidHealthQuota.POOLING_PEAK, data.getLong(DruidHealthQuota.POOLING_PEAK)); } if (innerMap.containsKey(DruidHealthQuota.POOLING_PEAK_TIME)) { druidHealthStats.gaugeCollect(DruidHealthQuota.POOLING_PEAK_TIME, data.getLong(DruidHealthQuota.POOLING_PEAK_TIME)); } if (innerMap.containsKey(DruidHealthQuota.ACTIVE_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.ACTIVE_COUNT, data.getLong(DruidHealthQuota.ACTIVE_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.ACTIVE_PEAK)) { druidHealthStats.gaugeCollect(DruidHealthQuota.ACTIVE_PEAK, data.getLong(DruidHealthQuota.ACTIVE_PEAK)); } if (innerMap.containsKey(DruidHealthQuota.ACTIVE_PEAK_TIME)) { druidHealthStats.gaugeCollect(DruidHealthQuota.ACTIVE_PEAK_TIME, data.getLong(DruidHealthQuota.ACTIVE_PEAK_TIME)); } if (innerMap.containsKey(DruidHealthQuota.INITIAL_SIZE)) { druidHealthStats.gaugeCollect(DruidHealthQuota.INITIAL_SIZE, data.getLong(DruidHealthQuota.INITIAL_SIZE)); } if (innerMap.containsKey(DruidHealthQuota.MIN_IDLE)) { druidHealthStats.gaugeCollect(DruidHealthQuota.MIN_IDLE, data.getLong(DruidHealthQuota.MIN_IDLE)); } if (innerMap.containsKey(DruidHealthQuota.MAX_ACTIVE)) { druidHealthStats.gaugeCollect(DruidHealthQuota.MAX_ACTIVE, data.getLong(DruidHealthQuota.MAX_ACTIVE)); } if (innerMap.containsKey(DruidHealthQuota.QUERY_TIMEOUT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.QUERY_TIMEOUT, data.getLong(DruidHealthQuota.QUERY_TIMEOUT)); } if (innerMap.containsKey(DruidHealthQuota.TRANSACTION_QUERY_TIMEOUT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.TRANSACTION_QUERY_TIMEOUT, data.getLong(DruidHealthQuota.TRANSACTION_QUERY_TIMEOUT)); } if 
(innerMap.containsKey(DruidHealthQuota.LOGIN_TIMEOUT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.LOGIN_TIMEOUT, data.getLong(DruidHealthQuota.LOGIN_TIMEOUT)); } if (innerMap.containsKey(DruidHealthQuota.LOGIC_CONNECT_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.LOGIC_CONNECT_COUNT, data.getLong(DruidHealthQuota.LOGIC_CONNECT_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.LOGIC_CLOSE_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.LOGIC_CLOSE_COUNT, data.getLong(DruidHealthQuota.LOGIC_CLOSE_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.LOGIC_CONNECT_ERROR_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.LOGIC_CONNECT_ERROR_COUNT, data.getLong(DruidHealthQuota.LOGIC_CONNECT_ERROR_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PHYSICAL_CONNECT_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PHYSICAL_CONNECT_COUNT, data.getLong(DruidHealthQuota.PHYSICAL_CONNECT_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PHYSICAL_CLOSE_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PHYSICAL_CLOSE_COUNT, data.getLong(DruidHealthQuota.PHYSICAL_CLOSE_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PHYSICAL_CONNECT_ERROR_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PHYSICAL_CONNECT_ERROR_COUNT, data.getLong(DruidHealthQuota.PHYSICAL_CONNECT_ERROR_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.EXECUTE_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.EXECUTE_COUNT, data.getLong(DruidHealthQuota.EXECUTE_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.ERROR_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.ERROR_COUNT, data.getLong(DruidHealthQuota.ERROR_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.COMMIT_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.COMMIT_COUNT, data.getLong(DruidHealthQuota.COMMIT_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.ROLLBACK_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.ROLLBACK_COUNT, 
data.getLong(DruidHealthQuota.ROLLBACK_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PSCACHE_ACCESS_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PSCACHE_ACCESS_COUNT, data.getLong(DruidHealthQuota.PSCACHE_ACCESS_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PSCACHE_HIT_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PSCACHE_HIT_COUNT, data.getLong(DruidHealthQuota.PSCACHE_HIT_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PSCACHE_MISS_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PSCACHE_MISS_COUNT, data.getLong(DruidHealthQuota.PSCACHE_MISS_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.START_TRANSACTION_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.START_TRANSACTION_COUNT, data.getLong(DruidHealthQuota.START_TRANSACTION_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.CLOB_OPEN_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.CLOB_OPEN_COUNT, data.getLong(DruidHealthQuota.CLOB_OPEN_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.BLOB_OPEN_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.BLOB_OPEN_COUNT, data.getLong(DruidHealthQuota.BLOB_OPEN_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.KEEP_ALIVE_CHECK_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.KEEP_ALIVE_CHECK_COUNT, data.getLong(DruidHealthQuota.KEEP_ALIVE_CHECK_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.MAX_POOL_PREPARED_STATEMENT_PRE_CONNECTION_SIZE)) { druidHealthStats.gaugeCollect(DruidHealthQuota.MAX_POOL_PREPARED_STATEMENT_PRE_CONNECTION_SIZE, data.getLong(DruidHealthQuota.MAX_POOL_PREPARED_STATEMENT_PRE_CONNECTION_SIZE)); } if (innerMap.containsKey(DruidHealthQuota.MIN_EVICTABLE_IDLE_TIME_MILLIS)) { druidHealthStats.gaugeCollect(DruidHealthQuota.MIN_EVICTABLE_IDLE_TIME_MILLIS, data.getLong(DruidHealthQuota.MIN_EVICTABLE_IDLE_TIME_MILLIS)); } if (innerMap.containsKey(DruidHealthQuota.MAX_EVICTABLE_IDLE_TIME_MILLIS)) { 
druidHealthStats.gaugeCollect(DruidHealthQuota.MAX_EVICTABLE_IDLE_TIME_MILLIS, data.getLong(DruidHealthQuota.MAX_EVICTABLE_IDLE_TIME_MILLIS)); } if (innerMap.containsKey(DruidHealthQuota.RECYCLE_ERROR_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.RECYCLE_ERROR_COUNT, data.getLong(DruidHealthQuota.RECYCLE_ERROR_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PREPARED_STATEMENT_OPEN_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PREPARED_STATEMENT_OPEN_COUNT, data.getLong(DruidHealthQuota.PREPARED_STATEMENT_OPEN_COUNT)); } if (innerMap.containsKey(DruidHealthQuota.PREPARED_STATEMENT_CLOSE_COUNT)) { druidHealthStats.gaugeCollect(DruidHealthQuota.PREPARED_STATEMENT_CLOSE_COUNT, data.getLong(DruidHealthQuota.PREPARED_STATEMENT_CLOSE_COUNT)); } //将以上放入标签集合中的Druid属性信息进行一次收集 druidHealthStats.infoCollect(); } } }
Druid监控追踪器:DruidHealthTracker,这是将Druid健康监控接入Prometheus的入口,其中创建了统计对象DruidHealthStats,并将其放入核心调度类的集合中,由任务调度器定时执行统计任务。
public class DruidHealthTracker { private static Set<String> UNIQUE_NAME = new ConcurrentSkipListSet<>(); /** * 添加要监控的Druid数据源 * @param namespace 区别数据源的命名空间 * @param druidDataSource Druid数据源实例(需要在用户应用中创建) */ public static void addDataSource(String namespace, DruidDataSource druidDataSource) { Assert.assertNotNull(namespace); Assert.assertFalse("Duplicate datasource name error.", UNIQUE_NAME.contains(namespace)); UNIQUE_NAME.add(namespace); druidDataSource.setName(namespace); //创建要统计的健康指标对象 DruidHealthStats stats = new DruidHealthStats(MetricsRegistry.getREGISTRY(), namespace, druidDataSource); //添加到核心调度类的统计对象集合中 HealthTracker.addStats(stats); } }
接下来看下在项目框架中的使用:
public class DataSourceValueBindingBeanPostProcessor extends BaseDataSourceConfiguration implements BeanPostProcessor, Ordered, EnvironmentAware, BeanFactoryAware { @Autowired protected CustomizedConfigurationPropertiesBinder binder; private Environment environment; private BeanFactory beanFactory; public Object postProcessBeforeInitialization(Object bean, String beanName) throws BeansException { if (bean instanceof DruidDataSource) { DruidDataSource druidDataSource = (DruidDataSource) bean; String namespace = StringUtils.substringBefore(beanName, DataSource.class.getSimpleName()); initDataSource(druidDataSource); if (environment.containsProperty(PREFIX_APP_DATASOURCE + "." + namespace + ".data-source" + ".filters")) { druidDataSource.clearFilters(); } Bindable<?> target = Bindable.of(DruidDataSource.class).withExistingValue(druidDataSource); binder.bind(PREFIX_APP_DATASOURCE + "." + namespace + ".data-source", target); DruidHealthTracker.addDataSource(namespace, druidDataSource); } else if (bean instanceof SqlSessionFactoryBean) { SqlSessionFactoryBean sqlSessionFactoryBean = (SqlSessionFactoryBean) bean; String namespace = StringUtils.substringBefore(beanName, SqlSessionFactory.class.getSimpleName()); DataSource dataSource = beanFactory.getBean(namespace + DataSource.class.getSimpleName(), DataSource.class); String typeAliasesPackageKey = PREFIX_APP_DATASOURCE + "." + namespace + ".type-aliases-package"; String typeAliasesPackage = environment.getProperty(typeAliasesPackageKey); Assert.isTrue(StringUtils.isNotEmpty(typeAliasesPackage), String.format("%s=%s must be not null! 
", typeAliasesPackageKey, typeAliasesPackage)); initSqlSessionFactoryBean(dataSource, typeAliasesPackage, sqlSessionFactoryBean); } else if (bean instanceof DataSourceTransactionManager) { DataSourceTransactionManager dataSourceTransactionManager = (DataSourceTransactionManager) bean; String namespace = StringUtils.substringBefore(beanName, DataSourceTransactionManager.class.getSimpleName()); DataSource dataSource = beanFactory.getBean(namespace + DataSource.class.getSimpleName(), DataSource.class); dataSourceTransactionManager.setDataSource(dataSource); dataSourceTransactionManager.afterPropertiesSet(); } else if (bean instanceof StatFilter) { String namespace = StringUtils.substringBefore(beanName, StatFilter.class.getSimpleName()); String enabled = environment.getProperty(PREFIX_APP_DATASOURCE + "." + namespace + ".data-source.stat.enabled"); if (!"true".equalsIgnoreCase(enabled)) { return bean; } StatFilter statFilter = (StatFilter) bean; initStatFilter(statFilter); Bindable<?> target = Bindable.of(StatFilter.class).withExistingValue(statFilter); binder.bind(PREFIX_APP_DATASOURCE + "." + namespace + ".data-source.stat", target); } return bean; }
其实就是通过实现BeanPostProcessor,在数据源对象初始化过程的前置处理器中将Druid监控接入Prometheus中。