SkyWalking(五)实现钉钉告警

1. alert告警参数简介

1.1 告警指标

cat /apps/apache-skywalking-apm-bin/config/oal/core.oal

  • service_resp_time

    服务的响应时间

  • service_sla

    服务的http请求成功率SLA,比如99%等

  • service_cpm

    表示每分钟的吞吐量

  • service_apdex

    应用性能指数(Apdex),例如0.8、0.x等

  • service_percentile

    指定最近多少数据范围内的响应时间百分比,即p99, p95, p90, p75, p50在内的数据统计结果

  • endpoint_relation_cpm

    端点的每分钟的吞吐量

  • endpoint_relation_resp_time

    端点的响应时间

  • endpoint_relation_sla

    端点的http请求成功率SLA,比如99%等。

  • endpoint_relation_percentile

    端点的最近多少数据范围内的响应时间百分比,即p99、 p95、 p90、 p75、p50在内的数据统计结果

1.2 告警规则

cat /apps/apache-skywalking-apm-bin/config/alarm-settings.yml
rules: # 定义rule规则
service_cpm_rule: # 唯一的规则名称,必须以_rule结尾
metrics-name: service_cpm # 指标名称,对应core.oal文件中的采集指标名称
include-names: # skywalking服务名称,不填写默认为所有服务
- dubbox-provider
- dubbox-consumer
op: ">" # 操作符,>, >=, <, <=, ==
threshold: 1 # 指标阈值
period: 2 # 评估指标的间隔周期
count: 1 # 匹配成功多少次就会触发告警
silence-period: 2 # 触发告警后的静默时间,min
message: dubbo-provider service_cpm 大于1了 # 告警信息
rules:
# 告警规则 名称唯一 必须以_rule 结尾
service_resp_time_rule:
# 度量名称,只支持int long double
metrics-name: service_resp_time
# 操作符
op: ">"
# 阈值 ms
threshold: 1000
# 评估度量的时间长度,单位:分钟
period: 10
# 度量有多少次符合告警条件后,才会触发告警
count: 2
# 静默时间 默认情况下,它和周期一样,在同一个周期内只会触发一次。
silence-period: 10
message: 服务【{name}】的平均响应时间在最近10分钟内有2分钟超过1秒
service_sla_rule:
metrics-name: service_sla
op: "<"
threshold: 8000
period: 10
count: 2
silence-period: 10
message: 服务【{name}】的成功率在最近10分钟内有2分钟低于80%
composite-rules:
# 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
comp_rule:
# 指定如何组成规则,支持&&, ||, ()操作符
expression: service_resp_time_rule && service_sla_rule
message: 服务【{name}】在最近10分钟内有2分钟平均响应时间超过1秒并且成功率低于80%
dingtalkHooks:
textTemplate: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- url: https://oapi.dingtalk.com/robot/send?access_token=a374b5e60f6d408e46d17ba5340245b522314298422d3279d71ee019fc2c6b64
# secret: SEC8a7e5fe2bb03d383963c144a9cf754c4f0b43b6b3e04a2e892 # 钉钉机器人加签,secret认证

详细指标、规则等见官方文档:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md#list-of-all-potential-metrics-name

2. 钉钉配置

创建机器人方法参考文末附注"创建群组机器人"中的步骤。

2.1 自定义关键字

将机器人安全设置中的自定义关键词设置为 SkyWalking(发送的消息必须包含该关键词才能发送成功)

安全设置文档:https://open.dingtalk.com/document/robots/customize-robot-security-settings

2.2 脚本验证

测试发送消息,包含关键字SkyWalking

[root@dubbo-server1 opt]#bash dingding-keywords.sh "SkyWalking-test"
{"errcode":0,"errmsg":"ok"}

钉钉收到消息

3. 告警指标配置

采用默认指标,也可以自定义

[root@skywalking-server config]#cat oal/core.oal
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".
// Service scope metrics
service_resp_time = from(Service.latency).longAvg(); // 服务的响应时间
service_sla = from(Service.*).percent(status == true); // 服务的http请求成功率SLA,比如99%等
service_cpm = from(Service.*).cpm(); // 表示每分钟的吞吐量
service_percentile = from(Service.latency).percentile(10); // 指定最近多少数据范围内的响应时间百分比,即p50, p75, p90, p95, p99在内的数据统计结果
service_apdex = from(Service.latency).apdex(name, status); // 应用性能指数
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count(); //
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();
// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);

4. 自定义告警规则

4.1 定义告警配置

dingtalkHooks部分配置参考:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md#dingtalk

cat /apps/apache-skywalking-apm-bin/config/alarm-settings.yml

rules:
service_cpm_rule:
metrics-name: service_cpm
# include-names: # skywalking服务名称,不填写默认所有服务
# - dubbox-provider
# - dubbox-consumer
# - aa579e648a7c4677b048659fe6aaf385@10.0.0.92
op: ">"
threshold: 1
period: 1
count: 1
silence-period: 1
message: dubbo-provider的当前指标service_cpm,请求值大于1000了!!!!
dingtalkHooks:
textTemplate: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- url: https://oapi.dingtalk.com/robot/send?access_token=a374b5e60f6d408e46d17ba5340245b522314298422d3279d71ee019fc2c6b64
# secret: SEC8a7e5fe2bb03d383963c144a9cf754c4f0b43b6b3e04a2e892 # 钉钉机器人加签,secret认证

即测试当请求量(service_cpm)大于1时,自动发送钉钉告警。注意:上面示例的message文本写的是"大于1000",而实际生效的阈值是threshold: 1;另外include-names已被注释,规则对所有服务生效。

4.2 重启服务

重启skywalking server服务

4.3 访问测试页面

不断刷新页面,使访问量大于1

4.4 验证钉钉

4.5 验证告警历史


  1. 创建群组机器人

    1. 添加机器人

    2. 创建加签或关键词

    安全设置文档:https://open.dingtalk.com/document/robots/customize-robot-security-settings

    3. 查看机器人

    复制Webhook地址

    ↩︎

posted @   areke  阅读(1215)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek “源神”启动!「GitHub 热点速览」
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 我与微信审核的“相爱相杀”看个人小程序副业
· C# 集成 DeepSeek 模型实现 AI 私有化(本地部署与 API 调用教程)
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
点击右上角即可分享
微信分享提示