Prometheus（二）

一、kubernetes 二进制部署的prometheus实现服务发现

1.1 kubernetes集群外部署prometheus

主机：10.0.0.61

1.1.1 下载二进制程序

mkdir /apps
cd /apps
wget https://github.com/prometheus/prometheus/releases/download/v2.40.7/prometheus-2.40.7.linux-amd64.tar.gz
tar -xvf prometheus-2.40.7.linux-amd64.tar.gz
ln -s /apps/prometheus-2.40.7.linux-amd64 /apps/prometheus

1.1.2 启动prometheus服务

# Create the systemd unit for the Prometheus server.
# '>' (not '>>') is used so that re-running this step replaces the file instead
# of appending a second copy of every section; the quoted 'EOF' keeps the unit
# text literal (no shell expansion inside the here-document).
cat > /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus   --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
EOF

启动服务

systemctl daemon-reload
systemctl enable --now prometheus.service

验证状态

# 查看服务状态
[root@prometheus-server apps]#systemctl is-active prometheus.service 
active

# 查看端口
[root@prometheus-server apps]#ss -ntl|grep 9090
LISTEN  0        4096                   *:9090                 *:*

‍

1.1.3 创建RBAC授权

允许从外访问k8s集群

apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring

---
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: monitoring-token
  namespace: monitoring
  annotations:
    kubernetes.io/service-account.name: "prometheus"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
# Read access to Ingress objects.
# Current clusters serve Ingress from the networking.k8s.io group; the legacy
# "extensions" group is kept only so the role still works on very old API servers.
- apiGroups:
  - "extensions"
  - "networking.k8s.io"
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring

1.1.4 准备文件

# 准备token文件
## k8s集群中生成token
kubectl describe secrets monitoring-token -n monitoring|grep "token:"|awk '{print $2}' > k8s.token
## 复制文件至prometheus server服务器上，需提前在prometheus server上创建目录mkdir -p /apps/certs
ssh 10.0.0.61 'mkdir -p /apps/certs'
scp k8s.token 10.0.0.61:/apps/certs/

# 准备tls证书
## 复制k8s上ca.pem（或ca.crt）文件至prometheus server服务器上
scp /etc/kubernetes/ssl/ca.pem 10.0.0.61:/apps/certs/

查看prometheus server文件

[root@prometheus-server certs]#ls /apps/certs/
ca.pem  k8s.token

1.2 实现node服务发现

1.2.1 配置node发现规则

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # Discover every Kubernetes node through the API server and scrape port 9100 on it.
  - job_name: 'kubernetes-node'
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.0.0.10:6443 # k8s master VIP
      # CA file and token used only for the discovery requests to the API server.
      tls_config:
        ca_file: /apps/certs/ca.pem
      bearer_token_file: /apps/certs/k8s.token
    # Targets are scraped over plain HTTP, so no job-level tls_config /
    # bearer_token_file is configured: a job-level token would be sent
    # unencrypted in the Authorization header to every :9100 target.
    scheme: http
    relabel_configs:
    # Node discovery returns <node>:10250; rewrite the port to 9100.
    - source_labels: [__address__]
      target_label: __address__
      regex: '(.*):10250'
      replacement: '${1}:9100'
      action: replace

    # Copy every Kubernetes node label onto the scraped series.
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)

1.2.2 验证node发现

标签替换、引用

‍

1.2.3 grafana展示

创建新数据源

展示监控

模板：11704

‍

1.2.4 node常见监控指标

node_cpu_		CPU相关指标
node_boot_time		系统启动时间（Unix时间戳），运行时长可用 time() - node_boot_time 计算
node_disk*		磁盘IO
node_filesystem*	系统文件使用量
node_load1		系统CPU负载
node_memory*		内存使用量
node_network*		网络带宽指标
go_*			node exporter中go相关指标
process_*		node exporter自身进程相关运行指标

‍

1.3 实现cadvisor服务发现

1.3.1 配置cadvisor发现规则

......
  - job_name: 'kubernetes-nodes-cadvisor'
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.0.0.10:6443 # k8s master VIP
      tls_config:
        ca_file: /apps/certs/ca.pem
      bearer_token_file: /apps/certs/k8s.token
    scheme: https
    tls_config:
      ca_file: /apps/certs/ca.pem
    bearer_token_file: /apps/certs/k8s.token
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)

    - target_label: __address__
      replacement: '10.0.0.10:6443'		# k8s VIP
      #replacement: kubernetes.default.svc:443	# 以pod方式部署k8s集群内，可用service访问时

    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    metric_relabel_configs:
      - action: replace
        source_labels: [id]
        regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
        target_label: rkt_container_name
        replacement: '${2}-${1}'
      - action: replace
        source_labels: [id]
        regex: '^/system\.slice/(.+)\.service$'
        target_label: systemd_service_name
        replacement: '${1}'

查看cadvisor数据

# Load the ServiceAccount token, then fetch one node's cAdvisor metrics
# through the API server node proxy.
TOKEN="$(< /apps/certs/k8s.token)"
curl --cacert /apps/certs/ca.pem \
  -H "Authorization: Bearer ${TOKEN}" \
  https://10.0.0.10:6443/api/v1/nodes/10.0.0.12/proxy/metrics/cadvisor

1.3.2 验证cadvisor发现

‍

标签替换

1.3.3 grafana展示

导入模板：14282

pod名称显示异常，可按如下方法修改，其中cadvisor版本为0.45.0：

‍

查看prometheus label

name="f342b6b20fb6b87f546db34808b8b16d09c4809cd2e42c027ca8ff99780696bb"
pod="cadvisor-4ds2w"

修改模板

将name更改为pod，示例如下

## 原命令
sum(rate(container_cpu_usage_seconds_total{instance=~"$host",name=~"$container",name=~".+"}[5m])) by (name) *100

## 更改后命令
sum(rate(container_cpu_usage_seconds_total{instance=~"$host",name=~"$container",name=~".+"}[5m])) by (pod) *100

‍

展示

pod名称显示正常

二、prometheus 基于consul、file实现服务发现

2.1 consul服务发现

https://www.consul.io/

consul是分布式k/v数据存储集群，目前常用于服务的服务注册和发现。

2.1.1 部署consul集群

下载地址：https://releases.hashicorp.com/consul/

主机清单

node1 10.0.0.71
node2 10.0.0.72
node3 10.0.0.73

2.1.1.1 下载consul二进制程序

# 下载
wget https://releases.hashicorp.com/consul/1.14.0/consul_1.14.0_linux_amd64.zip
unzip consul_1.14.0_linux_amd64.zip
scp consul 10.0.0.71:/usr/local/bin/
scp consul 10.0.0.72:/usr/local/bin/
scp consul 10.0.0.73:/usr/local/bin/

# 创建数据目录
ssh 10.0.0.71 "mkdir -p /data/consul"
ssh 10.0.0.72 "mkdir -p /data/consul"
ssh 10.0.0.73 "mkdir -p /data/consul"

2.1.1.2 启动服务

#node1
nohup consul agent -server -bootstrap -bind=10.0.0.71 -client=10.0.0.71 -data-dir=/data/consul -ui -node=10.0.0.71 &

#node2
nohup consul agent -bind=10.0.0.72 -client=10.0.0.72 -data-dir=/data/consul -node=10.0.0.72 -join=10.0.0.71 &

#node3
nohup consul agent -bind=10.0.0.73 -client=10.0.0.73 -data-dir=/data/consul -node=10.0.0.73 -join=10.0.0.71 &

参数说明

consul agent -server	使用server模式运行consul服务
-bootstrap		首次部署使用初始化模式
-bind			设置集群通信的监听地址
-client			设置客户端访问的监听地址
-data-dir		数据目录
-ui			启动内置静态web UI服务器
-node			此节点名称，集群中必须唯一
-datacenter=dc1		集群名称，默认为dc1
-join			加入到已有consul环境

2.1.1.3 验证集群

查看日志

[root@consul-server1 ~]#tail -f nohup.out 
2023-03-03T23:47:21.201+0800 [INFO]  agent.server: federation state anti-entropy synced
2023-03-03T23:47:21.202+0800 [INFO]  agent.leader: stopping routine: routine="virtual IP version check"
2023-03-03T23:47:21.202+0800 [INFO]  agent.leader: stopped routine: routine="virtual IP version check"
2023-03-03T23:47:22.549+0800 [INFO]  agent: Synced node info
2023-03-03T23:47:22.687+0800 [ERROR] agent.server.autopilot: Failed to reconcile current state with the desired state
2023-03-03T23:47:26.947+0800 [INFO]  agent.server.serf.lan: serf: EventMemberJoin: 10.0.0.72 10.0.0.72
2023-03-03T23:47:26.947+0800 [INFO]  agent.server: member joined, marking health alive: member=10.0.0.72 partition=default
2023-03-03T23:47:34.219+0800 [INFO]  agent.server.serf.lan: serf: EventMemberJoin: 10.0.0.73 10.0.0.73
2023-03-03T23:47:34.219+0800 [INFO]  agent.server: member joined, marking health alive: member=10.0.0.73 partition=default
2023-03-03T23:47:40.891+0800 [INFO]  agent: Newer Consul version available: new_version=1.15.0 current_version=1.14.0

查看监听端口

[root@consul-server1 ~]#netstat -ntlp|grep consul
tcp        0      0 10.0.0.71:8503          0.0.0.0:*               LISTEN      3546/consul       
tcp        0      0 10.0.0.71:8600          0.0.0.0:*               LISTEN      3546/consul       
tcp        0      0 10.0.0.71:8300          0.0.0.0:*               LISTEN      3546/consul       
tcp        0      0 10.0.0.71:8301          0.0.0.0:*               LISTEN      3546/consul       
tcp        0      0 10.0.0.71:8302          0.0.0.0:*               LISTEN      3546/consul       
tcp        0      0 10.0.0.71:8500          0.0.0.0:*               LISTEN      3546/consul

查看web，访问IP:8500

‍

2.1.1.4 测试数据

通过consul API写入数据

curl -X PUT -d '{"id":"node-exporter1","name":"node-exporter1","address":"10.0.0.11","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.11:9100","interval":"5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register

curl -X PUT -d '{"id":"node-exporter2","name":"node-exporter2","address":"10.0.0.12","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.12:9100","interval": "5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register

curl -X PUT -d '{"id":"node-exporter3","name":"node-exporter3","address":"10.0.0.13","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.13:9100","interval": "5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register

‍

consul验证数据

说明，consul删除服务命令：

curl --request PUT http://10.0.0.71:8500/v1/agent/service/deregister/node-exporter1

2.1.2 consul服务发现

2.1.2.1 二进制prometheus配置发现规则

  - job_name: 'consul'
    honor_labels: true
    metrics_path: "/metrics"
    scheme: http
    consul_sd_configs:
    - server: 10.0.0.71:8500
      services: []		# 发现的目标服务名称，空为所有服务，可以写servicea,serviceb,servicec
    - server: 10.0.0.72:8500
      services: []
    - server: 10.0.0.73:8500
      services: []
    relabel_configs:
    - source_labels: [__meta_consul_tags]
      target_label: 'product'
    - source_labels: [__meta_consul_dc]
      target_label: 'idc'
    - source_labels: [__meta_consul_service]
      regex: 'consul'
      action: drop		# 删除consul本机监控

‍

2.1.2.2 prometheus验证数据

‍

未匹配删除__meta_consul_service='consul'时

‍

2.2 file服务发现

2.2.1 准备文件

yaml文件

- targets: ['10.0.0.11:9100','10.0.0.12:9100','10.0.0.13:9100']

json文件

[
  {
    "targets": ["10.0.0.41:9100","10.0.0.42:9100","10.0.0.43:9100"]
  }
]

‍

2.2.2 prometheus配置规则

  - job_name: 'file_sd'
    file_sd_configs:
    - files:                    # 支持yaml和json格式文件
      - /apps/prometheus/file_sd/file_sd.yaml		# 支持*模糊匹配
      - /apps/prometheus/file_sd/file_sd.json		# 支持*模糊匹配
      refresh_interval: 10s     # 重新读取文件的刷新时间，若file文件内容改动，会自动发现变更内容

‍

2.2.3 验证数据

json和yaml格式文件都可以正常发现

‍

三、prometheus 监控案例-kube-state-metrics

https://github.com/kubernetes/kube-state-metrics

kube-state-metrics：通过监听API server生成有关资源对象的状态指标，比如Deployment、Node、Pod，需要注意的是kube-state-metrics使用场景不是用于监控对方是否存活，而是周期性获取目标对象metrics指标数据并在web界面进行显示或被prometheus抓取（如pod状态是running还是terminating、pod的创建时间等），目前kube-state-metrics收集的指标数据可参见官方文档

https://github.com/kubernetes/kube-state-metrics/tree/main/docs，kube-state-metrics并不会存储这些指标数据，所以可以使用prometheus来抓取这些数据然后存储，主要关注的是业务相关的一些元数据，如deployment、pod、副本状态等，调度了多少个replicas，现在可用的有几个，多少个pod是running/stopped/terminated状态，pod重启了多少次，目前有多少job在运行中

镜像地址：

https://hub.docker.com/r/bitnami/kube-state-metrics

https://quay.io/repository/coreos/kube-state-metrics?tag=latest&tab=tags

指标：

https://xie.infoq.cn/article/9e1fff6306649e65480a96bb1

3.1 部署kube-state-metrics

下载镜像

docker pull bitnami/kube-state-metrics:2.8.0
docker tag bitnami/kube-state-metrics:2.8.0 harbor.chu.net/baseimages/kube-state-metrics:2.8.0
docker push harbor.chu.net/baseimages/kube-state-metrics:2.8.0

编写yaml文件

apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        # image: bitnami/kube-state-metrics:2.8.0			# 网络镜像
        image: harbor.chu.net/baseimages/kube-state-metrics:2.8.0	# 本地镜像
        ports:
        - containerPort: 8080

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
# Core API group objects.
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
# Workload controllers are served from the "apps" group (apps/v1). Granting them
# under the legacy "extensions" group leaves kube-state-metrics unable to
# list/watch daemonsets, deployments and replicasets.
- apiGroups: ["apps"]
  resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system

---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app: kube-state-metrics
spec:
  type: NodePort
  ports:
  - name: kube-state-metrics
    port: 8080
    targetPort: 8080
    nodePort: 31666
    protocol: TCP
  selector:
    app: kube-state-metrics

查看状态

# 查看pod
[root@k8s-deploy ~]#kubectl get pod  -n kube-system
NAME                                      READY   STATUS    RESTARTS      AGE
...
kube-state-metrics-6bc4545d76-xbdb2       1/1     Running   0             52s

# 查看service
[root@k8s-deploy case]#kubectl get svc -n kube-system
NAME                 TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)                        AGE
kube-dns             ClusterIP   10.100.0.2      <none>        53/UDP,53/TCP,9153/TCP         43d
kube-state-metrics   NodePort    10.100.81.152   <none>        8080:31666/TCP                 2m47s
kubelet              ClusterIP   None            <none>        10250/TCP,10255/TCP,4194/TCP   11d

‍

3.2 验证数据

‍

metrics

‍

healthz

‍

3.3 配置prometheus采集数据

  - job_name: 'kube-state-metrics'
    static_configs:
      - targets: ["10.0.0.10:31666"]   # k8s VIP

‍

3.4 验证prometheus状态

‍

3.5 grafana展示

模板13824

‍

修改查询条件，显示masters/nodes数据

count(node_load1{kubernetes_io_role='master'})
count(node_load1{kubernetes_io_role='node'})

‍

模板14518

‍

四、prometheus 监控案例-Tomcat、Redis、Mysql、Haproxy、Nginx

https://github.com/prometheus

4.1 监控Tomcat

https://github.com/nlighten/tomcat_exporter

监控tomcat活跃连接数、堆栈内存等信息

# tomcat活跃连接数
tomcat_connections_active_total{name='http-nio-8080',}

# jvm内存
jvm_memory_bytes_used{area='heap',}

4.1.1 构建镜像

下载jar/war包

地址：https://repo1.maven.org/maven2/io/prometheus/

TOMCAT_SIMPLECLIENT_VERSION=0.8.0
TOMCAT_EXPORTER_VERSION=0.0.12
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_hotspot/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_hotspot-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_client/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_client-${TOMCAT_EXPORTER_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_servlet/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_servlet-${TOMCAT_EXPORTER_VERSION}.war

编写Dockerfile

FROM tomcat:8.5.73

ADD server.xml /usr/local/tomcat/conf/server.xml 

RUN mkdir /data/tomcat/webapps -p
ADD myapp /data/tomcat/webapps/myapp
ADD metrics.war /data/tomcat/webapps 
ADD simpleclient-0.8.0.jar  /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/

EXPOSE 8080 8443 8009

构建镜像

docker build -t harbor.chu.net/web/tomcat-app1:v1 .
docker push harbor.chu.net/web/tomcat-app1:v1

‍

4.1.2 测试镜像

docker run -it --rm -p 8080:8080 harbor.chu.net/web/tomcat-app1:v1

验证/metrics页面

‍

4.1.3 部署tomcat

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tomcat-deployment
  namespace: default
spec:
  selector: 
    matchLabels: 
     app: tomcat
  replicas: 1
  template:
    metadata:
      labels:
        app: tomcat
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      containers:
      - name: tomcat
        image: harbor.chu.net/web/tomcat-app1:v1 
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        securityContext: 
          privileged: true

---
kind: Service
apiVersion: v1
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: tomcat-service
spec:
  selector:
    app: tomcat
  ports:
  - nodePort: 31080
    port: 80
    protocol: TCP
    targetPort: 8080
  type: NodePort

创建pod

[root@k8s-deploy yaml]#kubectl apply -f tomcat-deploy.yaml

[root@k8s-deploy yaml]#kubectl get svc
NAME                     TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)        AGE
kubernetes               ClusterIP   10.100.0.1      <none>        443/TCP        45d
nodeport-nginx-service   NodePort    10.100.225.73   <none>        80:30120/TCP   37d
tomcat-service           NodePort    10.100.53.196   <none>        80:31080/TCP   6s

‍

4.1.4 配置prometheus采集数据

  - job_name: 'tomcat-monitor-metrics'
    static_configs:
      - targets: ["10.0.0.13:31080"]   # 可配置VIP

‍

4.1.5 grafana展示

模板：https://github.com/nlighten/tomcat_exporter/blob/master/dashboard/example.json

‍

4.2 监控Redis

https://github.com/oliver006/redis_exporter

4.2.1 部署redis

下载redis_exporter镜像

docker pull oliver006/redis_exporter:v1.45.0
docker tag oliver006/redis_exporter:v1.45.0 harbor.chu.net/web/redis_exporter:v1.45.0
docker push harbor.chu.net/web/redis_exporter:v1.45.0

编写yaml文件

apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
      - name: redis
        image: redis:4.0.14 
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 6379
      - name: redis-exporter
        # image: oliver006/redis_exporter:latest
        image: harbor.chu.net/web/redis_exporter:v1.45.0
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 9121

---
kind: Service
apiVersion: v1
metadata:
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: "9121"
  name: redis-exporter-service
  namespace: web 
spec:
  selector:
    app: redis
  ports:
  - nodePort: 39121
    name: prom
    port: 9121
    protocol: TCP
    targetPort: 9121
  type: NodePort

---
kind: Service
apiVersion: v1
metadata:
#  annotations:
#    prometheus.io/scrape: 'false'
  name: redis-redis-service
  namespace: web
spec:
  selector:
    app: redis
  ports:
  - nodePort: 36379
    name: redis
    port: 6379
    protocol: TCP
    targetPort: 6379
  type: NodePort

查看状态

[root@k8s-deploy yaml]#kubectl get pod -n web
NAME                     READY   STATUS    RESTARTS   AGE
redis-6969686c88-qcc47   2/2     Running   0          50s


[root@k8s-deploy yaml]#kubectl get svc -n web
NAME                     TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                                        AGE                                     27d
redis-exporter-service   NodePort    10.100.199.125   <none>        9121:39121/TCP                                 6s
redis-redis-service      NodePort    10.100.203.43    <none>        6379:36379/TCP                                 6s

‍

4.2.2 验证metrics

‍

4.2.3 配置prometheus

静态配置

  - job_name: redis_exporter
    static_configs:
    - targets: ['10.0.0.13:39121']	# 可配置VIP

【可选】配置kubernetes API动态服务发现

          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: instance
            regex: (.*redis.*)

4.2.4 grafana展示

模板：https://raw.githubusercontent.com/oliver006/redis_exporter/30fba62cc31d80ccb8653ac33965e17c15929a24/contrib/redis-mixin/dashboards/redis-overview.json

‍

4.3 监控Mysql

https://github.com/prometheus/mysqld_exporter

4.3.1 安装mysql

安装mysql

[root@prometheus-server ~]#apt install mariadb-server -y

授权监控账户权限

# 登录mysql
[root@prometheus-server ~]#mysql
MariaDB [(none)]> CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY '123456' WITH MAX_USER_CONNECTIONS 3;
Query OK, 0 rows affected (0.001 sec)

MariaDB [(none)]> GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';
Query OK, 0 rows affected (0.000 sec)

验证权限

[root@prometheus-server prometheus]#mysql -umysql_exporter -p123456 -hlocalhost
Welcome to the MariaDB monitor.  Commands end with ; or \g.
Your MariaDB connection id is 38
Server version: 10.3.38-MariaDB-0ubuntu0.20.04.1 Ubuntu 20.04

Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

MariaDB [(none)]> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| mysql              |
| performance_schema |
+--------------------+
3 rows in set (0.001 sec)

MariaDB [(none)]>

‍

4.3.2 准备mysql_exporter环境

下载exporter

wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz
tar xvf mysqld_exporter-0.14.0.linux-amd64.tar.gz
cp mysqld_exporter-0.14.0.linux-amd64/mysqld_exporter /usr/local/bin/

免密登录配置

# Client credentials file; it is also passed to mysqld_exporter via --config.my-cnf.
# Overwrite rather than append so a re-run does not add a second [client]
# section, and restrict the mode because the file holds a plaintext password.
cat > /root/.my.cnf <<'EOF'
[client]
user=mysql_exporter
password=123456
EOF
chmod 600 /root/.my.cnf

验证权限

[root@prometheus-server apps]#mysql
Welcome to the MariaDB monitor.  Commands end with ; or \g.
Your MariaDB connection id is 39
Server version: 10.3.38-MariaDB-0ubuntu0.20.04.1 Ubuntu 20.04

Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

MariaDB [(none)]>

‍

4.3.3 启动mysql_exporter

编写service

# Create the systemd unit for mysqld_exporter.
# '>' replaces the file on every run; '>>' would append a duplicate unit
# definition each time this step is repeated.
cat > /etc/systemd/system/mysqld_exporter.service <<'EOF'
[Unit]
Description=Prometheus MySQL Exporter
After=network.target

[Service]
ExecStart=/usr/local/bin/mysqld_exporter --config.my-cnf=/root/.my.cnf

[Install]
WantedBy=multi-user.target
EOF

启动服务

systemctl daemon-reload
systemctl enable --now mysqld_exporter.service

‍

4.3.4 验证metrics

‍

4.3.5 配置prometheus

  - job_name: mysql-exporter
    static_configs:
      - targets: ['10.0.0.61:9104']

验证Prometheus状态

4.3.6 grafana展示

模板：https://raw.githubusercontent.com/prometheus/mysqld_exporter/main/mysqld-mixin/dashboards/mysql-overview.json

‍

模板：13106

4.4 监控Haproxy

https://github.com/prometheus/haproxy_exporter

4.4.1 部署haproxy

安装haproxy

# List the haproxy versions available from the configured APT sources, then install it.
apt-cache madison haproxy
apt install haproxy -y

编辑配置文件

[root@k8s-ha1 ~]#vim /etc/haproxy/haproxy.cfg
global
    # 修改socket文件
    stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
    ......
listen stats
    mode http
    bind :9999
    stats enable
    stats uri   /haproxy-status
    stats realm HAProxy\ Stats\ Page
    stats auth  haadmin:123456

listen prometheus-server-9090
    bind :9090
    mode http
    server 10.0.0.61 10.0.0.61:9090 check inter 3s fall 3 rise 5

重启服务

systemctl restart haproxy

4.4.2 部署haproxy_exporter

下载

wget https://github.com/prometheus/haproxy_exporter/releases/download/v0.14.0/haproxy_exporter-0.14.0.linux-amd64.tar.gz
tar xvf haproxy_exporter-0.14.0.linux-amd64.tar.gz
cp haproxy_exporter-0.14.0.linux-amd64/haproxy_exporter /usr/local/bin/

启动

# 方式一
haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock

# 方式二
haproxy_exporter --haproxy.scrape-uri="http://haadmin:123456@127.0.0.1:9999/haproxy-status;csv"

查看端口

[root@k8s-ha1 ~]#netstat -ntlp|grep 9101
tcp6       0      0 :::9101                 :::*                    LISTEN      527956/haproxy_expo

验证状态页面

‍

4.4.3 验证metrics数据

4.4.4 prometheus采集配置

  - job_name: haproxy-monitor
    static_configs:
      - targets: ['10.0.0.31:9101']

4.4.5 grafana展示

模板：367

‍

模板：2428

‍

4.5 监控Nginx

需要在编译安装nginx时添加nginx-module-vts模块

GitHub地址：https://github.com/vozlt/nginx-module-vts

4.5.1 安装nginx

下载nginx源码、nginx-module-vts模块

wget http://nginx.org/download/nginx-1.20.2.tar.gz
wget https://github.com/vozlt/nginx-module-vts/archive/refs/tags/v0.2.0.tar.gz

tar xvf nginx-1.20.2.tar.gz -C /usr/local/src/
tar xvf v0.2.0.tar.gz -C /usr/local/src/

编译安装nginx

cd /usr/local/src/nginx-1.20.2
./configure --prefix=/apps/nginx \
--with-http_ssl_module \
--with-http_v2_module \
--with-http_realip_module \
--with-http_stub_status_module \
--with-http_gzip_static_module \
--with-pcre \
--with-file-aio \
--with-stream \
--with-stream_ssl_module \
--with-stream_realip_module \
--add-module=/usr/local/src/nginx-module-vts-0.2.0/

make && make install

‍

编辑nginx配置文件

# http块配置
http {
    vhost_traffic_status_zone;		#启用状态页

    # server块配置
    server {
        listen       10.0.0.31:80;
        server_name  localhost;

        #charset koi8-r;

        #access_log  logs/host.access.log  main;

        location / {
            root   html;
            index  index.html index.htm;
            proxy_pass http://10.0.0.31:9090;
        }
    
        location /status {
            vhost_traffic_status_display;
            vhost_traffic_status_display_format html;
        }
    ...
    }
...
}

检查配置

[root@k8s-ha1 apps]#/apps/nginx/sbin/nginx -t
nginx: the configuration file /apps/nginx/conf/nginx.conf syntax is ok
nginx: configuration file /apps/nginx/conf/nginx.conf test is successful

启动服务

/apps/nginx/sbin/nginx

验证web状态页

‍

4.5.2 安装nginx_exporter

https://github.com/hnlq715/nginx-vts-exporter/releases

下载nginx_exporter

# Download the nginx-vts-exporter release, unpack it and install the binary.
wget https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.10.3/nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
tar xvf nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
cp nginx-vts-exporter-0.10.3.linux-amd64/nginx-vts-exporter /usr/local/bin/

启动exporter

nginx-vts-exporter -nginx.scrape_uri http://10.0.0.31/status/format/json

查看端口

[root@k8s-ha1 apps]#netstat -ntlp|grep nginx
tcp        0      0 10.0.0.100:80           0.0.0.0:*               LISTEN      555495/nginx: maste 
tcp        0      0 10.0.0.31:80            0.0.0.0:*               LISTEN      555495/nginx: maste 
tcp        0      0 10.0.0.100:443          0.0.0.0:*               LISTEN      555495/nginx: maste 
tcp6       0      0 :::9913                 :::*                    LISTEN      559231/nginx-vts-ex 	# nginx_exporter 端口

4.5.3 验证metrics数据

‍

4.5.4 prometheus采集配置

  - job_name: nginx-monitor
    static_configs:
      - targets: ['10.0.0.31:9913']

‍

4.5.5 grafana展示

模板：2949

五、基于blackbox_exporter实现对URL状态、IP可用性、端口状态、TLS证书的过期时间监控

https://github.com/prometheus/blackbox_exporter

blackbox_exporter是prometheus官方提供的一个exporter，可以监控HTTP、HTTPS、DNS、TCP、ICMP等目标实例，实现对被监控节点进行监控和数据采集。

HTTP/HTTPS: URL/API可用性检测
TCP：端口监听检测
ICMP：主机存活检测
DNS：域名解析

5.1 部署blackbox_exporter

https://prometheus.io/download/#blackbox_exporter

部署在LoadBalance（10.0.0.31）上

下载

cd /apps
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.23.0/blackbox_exporter-0.23.0.linux-amd64.tar.gz
tar xvf blackbox_exporter-0.23.0.linux-amd64.tar.gz
ln -s /apps/blackbox_exporter-0.23.0.linux-amd64 /apps/blackbox_exporter

启动服务

# 创建服务
# Create the systemd unit for blackbox_exporter.
# '>' replaces the file on every run instead of appending a duplicate unit.
# With the quoted 'EOF' the trailing backslashes are written to the unit file
# as-is; systemd joins such lines itself.
cat > /etc/systemd/system/blackbox_exporter.service <<'EOF'
[Unit]
Description=Prometheus Blackbox Exporter
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/blackbox_exporter/blackbox_exporter \
    --config.file=/apps/blackbox_exporter/blackbox.yml \
    --web.listen-address=:9115
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

# 启动服务
systemctl daemon-reload
systemctl enable --now blackbox_exporter.service

查看端口

[root@k8s-ha1 apps]#ss -ntl|grep 9115
LISTEN  0        4096                   *:9115                 *:*

验证web页面

‍

5.2 URL监控

prometheus server配置URL监控

  - job_name: "http_status"
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets: ['http://harbor.chu.net','www.baidu.com']
        labels:
          instance: http_status
          group: web
    relabel_configs:
      - source_labels: [__address__]            # 将__address__（当前监控目标URL地址的标签）修改为__param_target，用于传递给blackbox_exporter
        target_label: __param_target            # 标签key为__param_target、value为harbor.chu.net
      - source_labels: [__param_target] 	# 基于__param_target获取监控目标
        target_label: url                       # 将监控目标的值与url创建一个label
      - target_label: __address__               # 新添加一个目标__address__，指向blackbox_exporter服务器地址，用于将监控请求发送给指定的blackbox_exporter服务器
        replacement: 10.0.0.31:9115             # 指定blackbox_exporter服务器地址

检测配置文件

#/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml 
Checking /apps/prometheus/prometheus.yml
 SUCCESS: /apps/prometheus/prometheus.yml is valid prometheus config file syntax

重启服务

systemctl restart prometheus.service

‍

prometheus验证数据

‍

blackbox_exporter页面验证数据

‍

5.3 IP监控

prometheus配置IP监控

  - job_name: "ping_status"
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets: ['10.0.0.2','223.6.6.6']
        labels:
          instance: ping_status
          group: icmp
    relabel_configs:
      - source_labels: [__address__]            # 将__address__（当前监控目标URL地址的标签）修改为__param_target，用于传递给blackbox_exporter
        target_label: __param_target            # 标签key为__param_target、value为'10.0.0.2','223.6.6.6'
      - source_labels: [__param_target] 	# 基于__param_target获取监控目标
        target_label: ip                       	# 将监控目标的值与ip创建一个label
      - target_label: __address__               # 新添加一个目标__address__，指向blackbox_exporter服务器地址，用于将监控请求发送给指定的blackbox_exporter服务器
        replacement: 10.0.0.31:9115             # 指定blackbox_exporter服务器地址

重启服务

/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
systemctl restart prometheus.service

‍

验证数据

‍

5.4 端口监控

prometheus配置监控

  - job_name: "port_status"
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets: ['10.0.0.11:9100','10.0.0.61:9090','10.0.0.8:22']
        labels:
          instance: port_status
          group: port
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target      
      - source_labels: [__param_target]
        target_label: ip
      - target_label: __address__
        replacement: 10.0.0.31:9115

‍

验证数据

‍

5.5 TLS证书监控

prometheus配置监控

  - job_name: "https_status"
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets: ['https://www.baidu.com']
        labels:
          instance: https_status
          group: web
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: url
      - target_label: __address__
        replacement: 10.0.0.31:9115

‍

5.6 grafana展示

导入模板：9965

显示https证书剩余天数

‍

六、prometheus 结合钉钉实现告警通知、企业微信实现告警通知、告警模板的使用、告警分类发送

6.1 Alertmanager

https://github.com/prometheus/alertmanager

6.1.1 prometheus触发告警过程

prometheus-->触发阈值-->超出持续时间-->alertmanager-->分组|抑制|静默-->媒体类型-->邮件|钉钉|微信等

分组：将类似性质的警告合并为单个通知，如网络通知、主机通知、服务通知

静默：是一种简单的特定时间静音的机制，如服务器要升级维护可以先设置这个时间段告警静默

抑制：当警告发出后，停止重复发送由此警告引发的其他警告即合并为一个故障引起的多个报警事件，可以消除冗余告警

‍

6.1.2 安装Alertmanager

与prometheus server/10.0.0.61安装在一起（也可分开安装）

# 下载Alertmanager
cd /apps
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
tar xvf alertmanager-0.25.0.linux-amd64.tar.gz
ln -s /apps/alertmanager-0.25.0.linux-amd64 /apps/alertmanager

# 创建服务
# Create the systemd unit for Alertmanager.
# '>' replaces the file on every run; '>>' would append a duplicate unit
# definition each time this step is repeated.
cat > /etc/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Prometheus Alertmanager
After=network.target

[Service]
ExecStart=/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml

[Install]
WantedBy=multi-user.target
EOF

# 启动服务
systemctl daemon-reload
systemctl enable --now alertmanager.service

查看服务

[root@prometheus-server alertmanager]#systemctl is-active alertmanager.service
active

# 查看端口
[root@prometheus-server alertmanager]#netstat -ntlp|grep alertmanager
tcp6       0      0 :::9093                 :::*                    LISTEN      6301/alertmanager   
tcp6       0      0 :::9094                 :::*                    LISTEN      6301/alertmanager

‍

6.1.3 配置文件说明

官方配置文档：https://prometheus.io/docs/alerting/latest/configuration/

alertmanager.yml配置详解

# Annotated alertmanager.yml example. YAML forbids tab indentation, so every
# level is indented with two spaces.
global:                                     # Global settings
  resolve_timeout: 60s                      # An alert that Alertmanager has not received again within this time is marked resolved
  smtp_from: '1234567890@qq.com'            # Sender mail address
  smtp_smarthost: 'smtp.qq.com:465'         # SMTP server address
  smtp_auth_username: '123456789@qq.com'    # SMTP login user name; by default the same as the sender address
  smtp_auth_password: 'ptqdjqbhjudejf'      # SMTP login password (sometimes an authorization code)
  smtp_hello: '@qq.com'
  smtp_require_tls: false                   # Whether TLS is required; the default is true

  wechat_api_url:                           # WeChat Work API URL
  wechat_api_secret:                        # WeChat Work API secret
  wechat_api_corp_id:                       # WeChat Work corp id

route:                                      # Alert dispatch policy
  group_by: [alertname]                     # Group alerts by the value of alertname
  group_wait: 10s                           # Delay before the first notification of a new group, so alerts raised within it are sent together (usually 0s to a few minutes)
  group_interval: 2m                        # Delay before notifying about new alerts added to a group that has already been notified (usually 5m or more)
  repeat_interval: 5m                       # Delay before re-sending a notification that was already sent successfully (usually 3h or more)
  # Interval example:
  # group_wait: 10s       # first alert: wait 10s, send together with other alerts of the group, or alone if there are none
  # group_interval: 2m    # a further alert in the group: wait 2 minutes; if still firing, repeat_interval applies
  # repeat_interval: 5m   # wait another 5 minutes; if still firing, the notification is sent again

  receiver: default-receiver                # All other alerts go to default-receiver
  routes:                                   # Send critical alerts to myalertname
  - receiver: myalertname
    group_wait: 10s
    match_re:
      severity: critical
receivers:                                  # Receiver definitions
- name: 'default-receiver'
  email_configs:
  - to: '2403416792@qq.com'
    send_resolved: true                     # Also notify when an alert is resolved
- name: myalertname
  webhook_configs:
  - url: 'http://172.30.7.101:8060/dingtalk/alertname/send'
    send_resolved: true                     # Also notify when an alert is resolved

inhibit_rules:                              # Inhibition rules
  - source_match:                           # Source matcher: while a matching alert is firing, target alerts with the same 'alertname', 'dev', 'instance' values are inhibited
      severity: 'critical'                  # Severity of the source alert
    target_match:
      severity: 'warning'                   # While a 'critical' source alert exists, new 'warning' alerts are inhibited
    equal: ['alertname','dev','instance']   # Labels that must be equal on source and target alerts

‍

6.2 钉钉告警

6.2.1 创建群组机器人

添加机器人

‍

创建加签或关键词

‍

查看机器人

复制Webhook地址

‍

6.2.2 钉钉认证-关键字-脚本

mkdir -p /data/scripts

shell脚本

dingding-keywords.sh

#!/bin/bash
#
# dingding-keywords.sh - send a text message to a DingTalk group robot
# (keyword-authenticated webhook).
#
# Usage: dingding-keywords.sh MESSAGE
#   MESSAGE is embedded verbatim inside a JSON string, so JSON escapes such as
#   \n are honoured; a literal double quote must be passed as \".
# Environment:
#   DINGTALK_WEBHOOK  optional webhook URL; defaults to the URL below.
source /etc/profile

if [[ $# -lt 1 ]]; then
  printf 'Usage: %s MESSAGE\n' "${0##*/}" >&2
  exit 2
fi
MESSAGE=$1

# Webhook address of the robot.
WEBHOOK_URL=${DINGTALK_WEBHOOK:-https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf}

# "${MESSAGE}" is double-quoted between the single-quoted JSON fragments so a
# message containing spaces stays inside the one -d argument; unquoted, it is
# word-split and the JSON payload is cut into separate curl arguments.
/usr/bin/curl -X "POST" "${WEBHOOK_URL}" \
  -H 'Content-Type: application/json' \
  -d '{"msgtype": "text",
        "text": {
                "content": "'"${MESSAGE}"'"
        }
      }'

‍

python脚本

dingding-keywords.py

#!/usr/bin/env python3
"""Send a plain-text message to a DingTalk group robot (keyword authentication).

Usage: dingding-keywords.py MESSAGE
"""
import sys
import requests
import json

# Webhook address copied from the robot settings page.
WEBHOOK_URL = r"https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf"


def info(msg, timeout=10):
    """POST ``msg`` to the robot as a DingTalk "text" message.

    Args:
        msg: message content; converted with ``str()`` before sending.
        timeout: seconds to wait for the HTTP request, so a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        The ``requests.Response`` object of the POST.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    headers = {"Content-Type": "application/json;charset=utf-8"}
    formdata = {"msgtype": "text", "text": {"content": str(msg)}}
    return requests.post(
        url=WEBHOOK_URL,
        data=json.dumps(formdata),
        headers=headers,
        timeout=timeout,
    )


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} MESSAGE".format(sys.argv[0]))

    response = info(sys.argv[1])
    response.raise_for_status()

    # The robot reports the outcome in the JSON body, e.g.
    # {"errcode":0,"errmsg":"ok"}; anything but errcode 0 is surfaced on
    # stderr with a non-zero exit status. A successful send prints nothing.
    try:
        errcode = response.json().get("errcode")
    except ValueError:
        errcode = None
    if errcode != 0:
        sys.exit("DingTalk rejected the message: {}".format(response.text))

‍

6.2.3 验证脚本

测试发送消息

[root@prometheus-server ~]#bash /data/scripts/dingding-keywords.sh "namespace=default\ncpu=85%\nalertname=sh-pod"
{"errcode":0,"errmsg":"ok"}

[root@prometheus-server ~]#python3 /data/scripts/dingding-keywords.py "namespace=default cpu=85% alertname=python-pod"

验证

‍

6.2.4 部署webhook-dingtalk

https://github.com/timonwong/prometheus-webhook-dingtalk/releases

# 下载
cd /apps
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar xvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
ln -s /apps/prometheus-webhook-dingtalk-2.1.0.linux-amd64 /apps/prometheus-webhook-dingtalk

修改config.yaml

cp /apps/prometheus-webhook-dingtalk/config.example.yml /apps/prometheus-webhook-dingtalk/config.yml
[root@prometheus-server apps]#egrep -v "^#|^$" /apps/prometheus-webhook-dingtalk/config.yml 
targets:
  webhook1:										# 可自定义，用于alertmanager调用
    url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
    secret: SEC8a7e5fe2bb03d383963c144a9cf8156fbbde20d9e754c4f0b43b6b3e04a2e892		# 钉钉机器人加签，secret认证
  alertname:										# 用于钉钉关键字认证
    url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:							
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true										# 给所有人发送
  webhook_mention_users:	
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:										# 给指定人发送
      mobiles: ['156xxxx8827', '189xxxx8325']

启动服务

# 后台启动
nohup /apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address=':8060' --config.file=/apps/prometheus-webhook-dingtalk/config.yml &

查看日志

ts=2023-03-14T10:10:47.289Z caller=main.go:113 component=configuration msg="Webhook urls for prometheus alertmanager" urls="http://localhost:8060/dingtalk/webhook_mention_all/send http://localhost:8060/dingtalk/webhook_mention_users/send http://localhost:8060/dingtalk/webhook1/send http://localhost:8060/dingtalk/alertname/send http://localhost:8060/dingtalk/webhook_legacy/send"

可看到http://localhost:8060/dingtalk/的URL分别对应config.yml中target

查看端口

[root@prometheus-server prometheus-webhook-dingtalk]#lsof -i :8060
COMMAND     PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
prometheu 13078 root    3u  IPv6 101524      0t0  TCP *:8060 (LISTEN)

‍

6.2.5 配置alertmanager调用dingtalk

官方配置：https://prometheus.io/docs/alerting/latest/configuration/

vim /apps/alertmanager/alertmanager.yml

global:         # 全局配置
  resolve_timeout: 2m                   	#告警在持续多长时间未再收到更新后被标记为resolved(已恢复)
  smtp_from: '2403416792@qq.com'                #发件人邮箱地址
  smtp_smarthost:       'smtp.qq.com:465'       #邮箱smtp地址。
  smtp_auth_username: '2403416792@qq.com'       #发件人的登陆用户名，默认和发件人地址一致。
  smtp_auth_password: 'ptqdjqbhjudejf'  	#发件人的登陆密码，有时候是授权码。
  smtp_hello: '@qq.com'
  smtp_require_tls: false                       #是否需要tls协议。默认是true。

route:
  group_by: [alertname] #通过alertname的值对告警进行分类
  group_wait: 10s
  group_interval: 2m
  repeat_interval: 10m
  receiver: dingding

receivers:                      #定义多接收者
- name: 'default-receiver'
  email_configs:
  - to: '2403416792@qq.com'
    send_resolved: true         #通知已经恢复的告警
- name: dingding
  webhook_configs:
  - url: 'http://10.0.0.61:8060/dingtalk/webhook1/send' # 对应prometheus-webhook-dingtalk/config.yml文件中的target webhook1，即使用钉钉加签认证
    send_resolved: true         #通知已经恢复的告警

重启服务

systemctl restart alertmanager.service

查看端口

[root@prometheus-server ~]#netstat -ntlp|grep alert
tcp6       0      0 :::9093                 :::*                    LISTEN      13570/alertmanager  
tcp6       0      0 :::9094                 :::*                    LISTEN      13570/alertmanager

查看web页面

‍

6.2.6 prometheus配置

6.2.6.1 配置报警规则

mkdir -p /apps/prometheus/rules

/apps/prometheus/rules/server_rules.yaml内容如下：

groups:
  - name: alertmanager_pod.rules
    rules:
    - alert: Pod_all_cpu_usage
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
      for: 2m
      labels:
        severity: critical
        service: pods
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
        summary: Dev CPU 负载告警

    - alert: Pod_all_memory_usage  
      #expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10  #内存大于10%
      expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024   #内存大于2G
      for: 2m
      labels:
        severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
        summary: Dev Memory 负载告警

    - alert: Pod_all_network_receive_usage
      #expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 0
      for: 2m
      labels:
        #severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})

    - alert: node内存可用大小 
      expr: node_memory_MemFree_bytes > 1024 #故意写错的
      #expr: node_memory_MemFree_bytes < 524288000 #内存小于500兆
      for: 30s
      labels:
        project: node
      annotations:
        description: node节点可用内存小于500兆

‍

6.2.6.2 加载报警规则

vim /apps/prometheus/prometheus.yml
...
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 10.0.0.61:9093		# alertmanager地址

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/apps/prometheus/rules/server_rules.yaml"		# 告警规则文件，可配置多个文件
  # - "second_rules.yml"
...

6.2.6.3 验证规则

[root@prometheus-server ~]#/apps/prometheus/promtool check rules /apps/prometheus/rules/server_rules.yaml 
Checking /apps/prometheus/rules/server_rules.yaml
  SUCCESS: 4 rules found

6.2.6.4 重启prometheus

systemctl restart prometheus.service

6.2.6.5 查看当前警告

[root@prometheus-server ~]#/apps/alertmanager/amtool alert --alertmanager.url=http://10.0.0.61:9093
Alertname   Starts At                Summary  State   
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active  
node内存可用大小  2023-03-14 16:13:48 UTC           active

‍

6.2.7 验证告警消息发送

6.2.7.1 prometheus告警状态

prometheus告警状态
inactive	#没有异常
pending		#已触发阈值，但未满足告警持续时间（即rule中的for字段）
firing		#已触发阈值且满足条件并发送至alertmanager

‍

6.2.7.2 dingtalk日志

[root@prometheus-server /]#tail -f nohup.out 
ts=2023-03-14T16:42:08.318Z caller=main.go:59 level=info msg="Starting prometheus-webhook-dingtalk" version="(version=2.1.0, branch=HEAD, revision=8580d1395f59490682fb2798136266bdb3005ab4)"
ts=2023-03-14T16:42:08.318Z caller=main.go:60 level=info msg="Build context" (gogo1.18.1,userroot@177bd003ba4d,date20220421-08:19:05)=(MISSING)
ts=2023-03-14T16:42:08.319Z caller=coordinator.go:83 level=info component=configuration file=/apps/prometheus-webhook-dingtalk/config.yml msg="Loading configuration file"
ts=2023-03-14T16:42:08.319Z caller=coordinator.go:91 level=info component=configuration file=/apps/prometheus-webhook-dingtalk/config.yml msg="Completed loading of configuration file"
ts=2023-03-14T16:42:08.319Z caller=main.go:97 level=info component=configuration msg="Loading templates" templates=
ts=2023-03-14T16:42:08.320Z caller=main.go:113 component=configuration msg="Webhook urls for prometheus alertmanager" urls="http://localhost:8060/dingtalk/webhook_legacy/send http://localhost:8060/dingtalk/webhook1/send http://localhost:8060/dingtalk/alertname/send"
ts=2023-03-14T16:42:08.320Z caller=web.go:208 level=info component=web msg="Start listening for connections" address=:8060

# 告警发送消息
ts=2023-03-14T16:42:08.951Z caller=entry.go:26 level=info component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:34118 user_agent=Alertmanager/0.25.0 uri=http://10.0.0.61:8060/dingtalk/webhook1/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=311.106098 msg="request complete"

6.2.7.3 钉钉告警消息

‍

6.3 企业微信告警

https://work.weixin.qq.com/

先完成企业微信注册

6.3.1 创建企业微信通知应用

进入web页面，创建应用

‍

添加应用信息

‍

查看企业微信通知应用信息

‍

保存AgentID和Secret

‍

6.3.2 添加成员

手动或扫二维码添加成员

‍

验证通讯录

查看企业信息

保存企业ID

‍

6.3.3 测试消息

应用管理-->企业微信通知应用-->发送消息

‍

选择发送范围，编写发送信息

验证企业微信通知

6.3.4 配置prometheus

vim /apps/prometheus/prometheus.yml
...
# 检查是否有alerting配置
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 10.0.0.61:9093		# alertmanager地址
# 与配置钉钉告警规则相同
rule_files:
  - "/apps/prometheus/rules/server_rules.yaml"          # 告警规则文件，可配置多个文件

重启服务

systemctl restart prometheus.service

‍

6.3.5 配置alertmanager

#cat /apps/alertmanager/alertmanager.yml 
global:		# 全局配置
  resolve_timeout: 2m 			#告警在持续多长时间未再收到更新后被标记为resolved(已恢复)
  smtp_from: '2403416792@qq.com'		#发件人邮箱地址
  smtp_smarthost:	'smtp.qq.com:465'	#邮箱smtp地址。
  smtp_auth_username: '2403416792@qq.com'	#发件人的登陆用户名，默认和发件人地址一致。
  smtp_auth_password: 'ptqdjqbhjudejf'	#发件人的登陆密码，有时候是授权码。
  smtp_hello: '@qq.com'
  smtp_require_tls: false			#是否需要tls协议。默认是true。

route:		# 用来设置报警的分发策略
  group_by: [alertname]	#通过alertname的值对告警进行分类
  group_wait: 10s 	#一组警告第一次发送之前等待的延迟时间,即产生告警后延迟10秒钟将组内新产生的消息一起合并发送(一般设置为0秒~几分钟)。
  group_interval: 2m	#一组已发送过初始通知的告警接收到新告警后，下次发送通知前等待的延迟时间(一般设置为5分钟或更多)
  repeat_interval: 10m 	#一条已成功发送通知的告警，在再次(重复)发送通知之前等待的时间(通常设置为3小时或更长时间)
  receiver: wechat	#wechat告警

receivers:                      #定义多接收者
  - name: 'default-receiver'
    email_configs:
    - to: '2403416792@qq.com'
      send_resolved: true       #通知已经恢复的告警
  - name: dingding              # 钉钉告警
    webhook_configs:
    - url: 'http://10.0.0.61:8060/dingtalk/alertname/send'
      send_resolved: true       #通知已经恢复的告警
  - name: wechat                        # 微信告警
    wechat_configs:
    - corp_id: wwd2cebdb20b9c91d7       # 企业ID
      to_user: '@all'           # 发送给所有人
      agent_id: 1000002         # 应用ID
      api_secret: 5PwuWghboQCVEHe8DBQcGAyeU4oMHnaxevfQd84YpdA   #应用secret
      send_resolved: true       #通知已经恢复的告警

重启服务

systemctl restart alertmanager.service

‍

6.3.6 企业微信验证消息

出现60020错误码企业微信无法接收信息时

# 查看日志
/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml --log.level=debug
...
caller=wechat.go:178 level=debug integration=wechat response="{\"errcode\":60020,\"errmsg\":\"not allow to access from your ip, hint: [1678823033431493982481539], from ip: 180.111.192.141, more info at https://open.work.weixin.qq.com/devtool/query?e=60020\"}" incident="{}:{alertname=\"node内存可用大小\"}"

错误码：60020

不安全的访问IP。请根据调用的应用类型分别按如下方法确认：
1）若调用者是企业自建应用或通讯录同步助手，请确认该IP是本企业服务器IP，并已经配置到应用详情的“企业可信IP”项目中。第三方服务商IP不能调用。
2）若调用者是第三方应用或服务商代开发应用，请确认该IP已经配置到“服务商管理后台”-“服务商信息”-“基本信息”-“IP白名单”。
3) 配置完可信IP之后，需要1分钟后才生效。

‍

配置添加可信IP

‍

根据实际设置可信域名或设置接收消息服务器URL，或联系企业微信技术团队解决

‍

6.4 告警分类发送

根据消息中的属性信息设置规则，将消息分类发送，如以下将severity级别为critical的通知消息发送到钉钉，其他发送到邮件：

6.4.1 prometheus rules配置

# cat /apps/prometheus/rules/server2_rules.yaml
groups:
  - name: alertmanager_pod.rules
    rules:
    - alert: Pod_all_cpu_usage	# 警告
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 20
      for: 2m
      labels:
        severity: warning
        service: pods
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} CPU 资源利用率大于 20% , (current value is {{ $value }})
        summary: Pod CPU 利用率超过20%

    - alert: Pod_all_cpu_usage	# 严重
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 30
      for: 2m
      labels:
        severity: critical
        service: pods
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} CPU 资源利用率大于 30% , (current value is {{ $value }})
        summary: Pod CPU 利用率超过30%

    - alert: Pod_all_memory_usage  
      #expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10  #内存大于10%
      expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024   #内存大于2G
      for: 2m
      labels:
        severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
        summary: Dev Memory 负载告警

    - alert: Pod_all_network_receive_usage
      #expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1
      for: 2m
      labels:
        severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})

    - alert: node内存可用大小 
      expr: node_memory_MemFree_bytes > 1024 #故意写错进行测试
      #expr: node_memory_MemFree_bytes < 524288000 #内存小于500兆
      for: 30s
      labels:
        project: node
      annotations:
        description: node节点可用内存小于500M

重启服务

systemctl restart prometheus.service

‍

6.4.2 alertmanager配置

#cat /apps/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m 				#告警在持续多长时间未再收到更新后被标记为resolved(已恢复)
  smtp_from: '2403416792@qq.com'		#发件人邮箱地址
  smtp_smarthost: 'smtp.qq.com:465'		#邮箱smtp地址。
  smtp_auth_username: '2403416792@qq.com'	#发件人的登陆用户名，默认和发件人地址一致。
  smtp_auth_password: 'riudkgxfttjqecda'	#发件人的登陆密码，有时候是授权码。
  smtp_hello: '@qq.com'
  smtp_require_tls: false			#是否需要tls协议。默认是true。

route:
  group_by: [alertname]
  group_wait: 10s 
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'dingding'		#默认告警方式为钉钉

  #添加消息路由
  routes:
  - receiver: 'dingding-worknode'	#critical级别的发送钉钉
    group_wait: 10s
    match_re:
      severity: critical		# 匹配严重等级告警
  - receiver: 'email-receiver'		# 宿主机告警通过邮件发送
    group_wait: 10s
    match_re:
      project: node			# 匹配node告警

receivers:
  - name: 'email-receiver'
    email_configs:
    - to: '2403416792@qq.com'
      send_resolved: true
  - name: 'dingding'
    webhook_configs:
    - url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
      send_resolved: true
  - name: 'dingding-worknode'
    webhook_configs:
    - url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
      send_resolved: true

inhibit_rules:			#抑制的规则
  - source_match: 		#源匹配级别，当匹配成功发出通知，但是其它'alertname', 'dev', 'instance'产生的warning 级别的告警通知将被抑制
      severity: 'critical'	#报警的事件级别
    target_match:
      severity: 'warning'	#与source_match的severity配合：如果已经有'critical'级别的报警，那么新产生的'warning'级别的告警将被抑制
    equal: ['alertname','dev','instance']	#匹配那些对象的告警

重启服务

systemctl restart alertmanager.service

‍

6.4.3 验证消息发送

node告警发送邮件

‍

critical严重等级告警发送至钉钉

‍

钉钉消息

‍

6.5 钉钉告警模板

默认的消息内容需要调整、而且消息是连接在一起的。

6.5.1 定义模板

# vim /apps/prometheus-webhook-dingtalk/message_template.templ
{{ define "dingding.default.message" }}
{{- /*
  Message body for DingTalk notifications.
  Renders one section per firing alert, then one section per resolved alert.
  Each loop iterates only over its own subset (.Alerts.Firing and
  .Alerts.Resolved), so a group that mixes both states does not list every
  alert twice.
  Timestamps have 28800e9 ns (8 hours) added before formatting; presumably
  this shifts UTC to UTC+8 (confirm for your deployment).
*/ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}

=========  **监控告警** =========

**告警程序:**     Alertmanager
**告警类型:**    {{ $alert.Labels.alertname }}
**告警级别:**    {{ $alert.Labels.severity }} 级
**告警状态:**    {{ .Status }}
**故障主机:**    {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
**告警主题:**    {{ .Annotations.summary }}
**告警详情:**    {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**主机标签:**    {{ range .Labels.SortedPairs  }}  </br> [{{ .Name }}: {{ .Value | markdown | html }} ]
{{- end }} </br>

**故障时间:**    {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}

========= 告警恢复 =========
**告警程序:**     Alertmanager
**告警主题:**    {{ $alert.Annotations.summary }}
**告警主机:**    {{ .Labels.instance }}
**告警类型:**    {{ .Labels.alertname }}
**告警级别:**    {{ $alert.Labels.severity }} 级
**告警状态:**    {{ .Status }}
**告警详情:**    {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**故障时间:**    {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢复时间:**    {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

========= = **end** =  =========
{{- end }}
{{- end }}
{{- end }}

6.5.2 dingtalk引用模板

#vim /apps/prometheus-webhook-dingtalk/config.yml 
# 开启模板
templates:
  - /apps/prometheus-webhook-dingtalk/message_template.templ	# 引用模板

targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
    secret: SEC8a7e5fe2bb03d383963c144a9cf8156fbbde20d9e754c4f0b43b6b3e04a2e892
    message:
      text: '{{ template "dingding.default.message" . }}'	# 模板文件中模板名称

6.5.3 alertmanager引用dingtalk地址

#cat /apps/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m 				#告警在持续多长时间未再收到更新后被标记为resolved(已恢复)
  smtp_from: '2403416792@qq.com'		#发件人邮箱地址
  smtp_smarthost: 'smtp.qq.com:465'		#邮箱smtp地址。
  smtp_auth_username: '2403416792@qq.com'	#发件人的登陆用户名，默认和发件人地址一致。
  smtp_auth_password: 'riudkgxfttjqecda'	#发件人的登陆密码，有时候是授权码。
  smtp_hello: '@qq.com'
  smtp_require_tls: false			#是否需要tls协议。默认是true。

templates:
  - '/apps/alertmanager/message_template.templ'		# Alertmanager引用模板

route:
  group_by: [alertname]
  group_wait: 10s 
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'email-receiver'		#默认告警方式为邮件

  #添加消息路由
  routes:
  - receiver: 'dingding'		#critical级别的发送钉钉
    group_wait: 10s
    match_re:
      severity: critical		# 匹配严重等级告警

receivers:
  - name: 'email-receiver'
    email_configs:
    - to: '2403416792@qq.com'
      send_resolved: true
  - name: 'dingding'
    webhook_configs:
    - url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
      send_resolved: true

重启服务

# Stop the instance that is still running with the old configuration and
# holding port 8060; "|| true" tolerates the case where none is running.
pkill -f '/apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk' || true
sleep 1   # give the old process a moment to release the port

# Start in the background so the shell is not blocked and the Alertmanager
# restart below is actually reached.
nohup /apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address=':8060' --config.file=/apps/prometheus-webhook-dingtalk/config.yml &
systemctl restart alertmanager.service

‍

6.5.4 验证告警内容

‍

posted @ 2023-03-15 19:13 areke 阅读(798) 评论(0) 收藏举报

刷新页面返回顶部