Geek Time Ops Advanced Training Camp, Week 20 Assignment

1. Prometheus service discovery based on Consul, with a summary of the discovery process

Prometheus watches the services registered in Consul and automatically scrapes whatever registers there. The discovery process: a service registers itself with Consul (via the agent API or a service definition file); Prometheus's consul_sd_configs periodically queries the Consul catalog, turns each registered service instance into a scrape target with __meta_consul_* labels attached, applies relabel_configs to filter or rename targets, and then scrapes each target on the configured interval. Services that deregister are dropped from the target list automatically.

Prerequisite: each service must expose a metrics endpoint in a format Prometheus can ingest.
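A minimal sketch of both sides of that flow; the Consul address (172.31.1.103:8500) and the registered node-exporter instance are placeholders for this lab, not values taken from any output above:

# Register a service instance with Consul:
curl -X PUT -d '{"ID": "node-exporter-101", "Name": "node-exporter", "Address": "172.31.7.101", "Port": 9100}' \
  http://172.31.1.103:8500/v1/agent/service/register

# prometheus.yml fragment: discover all services Consul knows about,
# dropping Consul's own built-in service from the target list:
  - job_name: "consul-service-discovery"
    consul_sd_configs:
      - server: "172.31.1.103:8500"
        services: []                         # empty list = all services
    relabel_configs:
      - source_labels: ["__meta_consul_service"]
        regex: "consul"
        action: drop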


2. Monitoring a Java service (Tomcat), Redis, MySQL, and HAProxy with Prometheus

2.1 Java (Tomcat)

# Build the image

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# cat server.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<!-- Note:  A "Server" is not itself a "Container", so you may not
     define subcomponents such as "Valves" at this level.
     Documentation at /docs/config/server.html
 -->
<Server port="8005" shutdown="SHUTDOWN">
  <Listener className="org.apache.catalina.startup.VersionLoggerListener" />
  <!-- Security listener. Documentation at /docs/config/listeners.html
  <Listener className="org.apache.catalina.security.SecurityListener" />
  -->
  <!--APR library loader. Documentation at /docs/apr.html -->
  <Listener className="org.apache.catalina.core.AprLifecycleListener" SSLEngine="on" />
  <!-- Prevent memory leaks due to use of particular java/javax APIs-->
  <Listener className="org.apache.catalina.core.JreMemoryLeakPreventionListener" />
  <Listener className="org.apache.catalina.mbeans.GlobalResourcesLifecycleListener" />
  <Listener className="org.apache.catalina.core.ThreadLocalLeakPreventionListener" />

  <!-- Global JNDI resources
       Documentation at /docs/jndi-resources-howto.html
  -->
  <GlobalNamingResources>
    <!-- Editable user database that can also be used by
         UserDatabaseRealm to authenticate users
    -->
    <Resource name="UserDatabase" auth="Container"
              type="org.apache.catalina.UserDatabase"
              description="User database that can be updated and saved"
              factory="org.apache.catalina.users.MemoryUserDatabaseFactory"
              pathname="conf/tomcat-users.xml" />
  </GlobalNamingResources>

  <!-- A "Service" is a collection of one or more "Connectors" that share
       a single "Container" Note:  A "Service" is not itself a "Container",
       so you may not define subcomponents such as "Valves" at this level.
       Documentation at /docs/config/service.html
   -->
  <Service name="Catalina">

    <!--The connectors can use a shared executor, you can define one or more named thread pools-->
    <!--
    <Executor name="tomcatThreadPool" namePrefix="catalina-exec-"
        maxThreads="150" minSpareThreads="4"/>
    -->


    <!-- A "Connector" represents an endpoint by which requests are received
         and responses are returned. Documentation at :
         Java HTTP Connector: /docs/config/http.html
         Java AJP  Connector: /docs/config/ajp.html
         APR (HTTP/AJP) Connector: /docs/apr.html
         Define a non-SSL/TLS HTTP/1.1 Connector on port 8080
    -->
    <Connector port="8080" protocol="HTTP/1.1"
               connectionTimeout="20000"
               redirectPort="8443" />
    <!-- A "Connector" using the shared thread pool-->
    <!--
    <Connector executor="tomcatThreadPool"
               port="8080" protocol="HTTP/1.1"
               connectionTimeout="20000"
               redirectPort="8443" />
    -->
    <!-- Define an SSL/TLS HTTP/1.1 Connector on port 8443
         This connector uses the NIO implementation. The default
         SSLImplementation will depend on the presence of the APR/native
         library and the useOpenSSL attribute of the
         AprLifecycleListener.
         Either JSSE or OpenSSL style configuration may be used regardless of
         the SSLImplementation selected. JSSE style configuration is used below.
    -->
    <!--
    <Connector port="8443" protocol="org.apache.coyote.http11.Http11NioProtocol"
               maxThreads="150" SSLEnabled="true">
        <SSLHostConfig>
            <Certificate certificateKeystoreFile="conf/localhost-rsa.jks"
                         type="RSA" />
        </SSLHostConfig>
    </Connector>
    -->
    <!-- Define an SSL/TLS HTTP/1.1 Connector on port 8443 with HTTP/2
         This connector uses the APR/native implementation which always uses
         OpenSSL for TLS.
         Either JSSE or OpenSSL style configuration may be used. OpenSSL style
         configuration is used below.
    -->
    <!--
    <Connector port="8443" protocol="org.apache.coyote.http11.Http11AprProtocol"
               maxThreads="150" SSLEnabled="true" >
        <UpgradeProtocol className="org.apache.coyote.http2.Http2Protocol" />
        <SSLHostConfig>
            <Certificate certificateKeyFile="conf/localhost-rsa-key.pem"
                         certificateFile="conf/localhost-rsa-cert.pem"
                         certificateChainFile="conf/localhost-rsa-chain.pem"
                         type="RSA" />
        </SSLHostConfig>
    </Connector>
    -->

    <!-- Define an AJP 1.3 Connector on port 8009 -->
    <!--
    <Connector protocol="AJP/1.3"
               address="::1"
               port="8009"
               redirectPort="8443" />
    -->

    <!-- An Engine represents the entry point (within Catalina) that processes
         every request.  The Engine implementation for Tomcat stand alone
         analyzes the HTTP headers included with the request, and passes them
         on to the appropriate Host (virtual host).
         Documentation at /docs/config/engine.html -->

    <!-- You should set jvmRoute to support load-balancing via AJP ie :
    <Engine name="Catalina" defaultHost="localhost" jvmRoute="jvm1">
    -->
    <Engine name="Catalina" defaultHost="localhost">

      <!--For clustering, please take a look at documentation at:
          /docs/cluster-howto.html  (simple how to)
          /docs/config/cluster.html (reference documentation) -->
      <!--
      <Cluster className="org.apache.catalina.ha.tcp.SimpleTcpCluster"/>
      -->

      <!-- Use the LockOutRealm to prevent attempts to guess user passwords
           via a brute-force attack -->
      <Realm className="org.apache.catalina.realm.LockOutRealm">
        <!-- This Realm uses the UserDatabase configured in the global JNDI
             resources under the key "UserDatabase".  Any edits
             that are performed against this UserDatabase are immediately
             available for use by the Realm.  -->
        <Realm className="org.apache.catalina.realm.UserDatabaseRealm"
               resourceName="UserDatabase"/>
      </Realm>

      <Host name="localhost"  appBase="/data/tomcat/webapps"  unpackWARs="false" autoDeploy="false">

        <!-- SingleSignOn valve, share authentication between web applications
             Documentation at: /docs/config/valve.html -->
        <!--
        <Valve className="org.apache.catalina.authenticator.SingleSignOn" />
        -->

        <!-- Access log processes all example.
             Documentation at: /docs/config/valve.html
             Note: The pattern used is equivalent to using pattern="common" -->
        <Valve className="org.apache.catalina.valves.AccessLogValve" directory="logs"
               prefix="localhost_access_log" suffix=".txt"
               pattern="%h %l %u %t &quot;%r&quot; %s %b" />

      </Host>
    </Engine>
  </Service>
</Server>
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image#

# In <Host name="localhost" appBase="/data/tomcat/webapps" unpackWARs="false" autoDeploy="false">, appBase defines the directory the applications are deployed from
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# cat Dockerfile
#FROM tomcat:8.5.73-jdk11-corretto
FROM tomcat:8.5.73

LABEL maintainer="jack 2973707860@qq.com"

ADD server.xml /usr/local/tomcat/conf/server.xml

RUN mkdir /data/tomcat/webapps -p
ADD myapp /data/tomcat/webapps/myapp
ADD metrics.war /data/tomcat/webapps
# In other environments, copy the following JARs into Tomcat's lib directory
ADD simpleclient-0.8.0.jar  /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/


#ADD run_tomcat.sh /apps/tomcat/bin/

EXPOSE 8080 8443 8009

#CMD ["/apps/tomcat/bin/catalina.sh","run"] # 使用了官方镜像,因此无需启动

#CMD ["/apps/tomcat/bin/run_tomcat.sh"] #  # 使用了官方镜像,因此无需启动


root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# cat myapp/index.jsp
<h1>tomcat app1</h1>
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image#

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# cat run_tomcat.sh
#!/bin/bash

echo "1.1.1.1 www.a.com" >> /etc/hosts

su - magedu -c "/apps/tomcat/bin/catalina.sh start"

tail -f /etc/hosts


root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# cat build-command.sh
#!/bin/bash

#nerdctl build -t harbor.magedu.net/magedu/tomcat-app1:v1 .

#nerdctl push harbor.magedu.net/magedu/tomcat-app1:v1
docker build -t harbor.iclinux.com/magedu/tomcat-app1:v1 .
docker push  harbor.iclinux.com/magedu/tomcat-app1:v1
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# ./build-command.sh

# Test the image

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/tomcat-image# docker run -ti --rm -p 8081:8080 harbor.iclinux.com/magedu/tomcat-app1:v1


root@prometheus-server01:/apps/prometheus/file_sd# curl  http://172.31.7.101:8081/myapp/index.jsp
<h1>tomcat app1</h1>

# Check the metrics endpoint. The 404 below is expected: metrics.war is deployed at the
# /metrics context root, not under /myapp (compare the successful curl to /metrics/ further down)
root@prometheus-server01:/apps/prometheus/file_sd# curl  http://172.31.7.101:8081/myapp/metrics
<!doctype html><html lang="en"><head><title>HTTP Status 404 – Not Found</title><style type="text"

# Clean up earlier pods
root@k8s-master1:~/day20/1.prometheus-case-files# kubectl  delete  -f .

# Deploy Tomcat
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/yaml# cat tomcat-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tomcat-deployment
  namespace: default
spec:
  selector:
    matchLabels:
     app: tomcat
  replicas: 1 # tells deployment to run 1 pod matching the template
  template: # create pods using pod definition in this template
    metadata:
      labels:
        app: tomcat
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: "8080"
    spec:
      containers:
      - name: tomcat
        image: harbor.iclinux.com/magedu/tomcat-app1:v1
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        securityContext:
          privileged: true

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/yaml# cat tomcat-svc.yaml
kind: Service  # resource type
apiVersion: v1
metadata:
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: "8080"
  name: tomcat-service
spec:
  selector:
    app: tomcat
  ports:
  - nodePort: 31080
    port: 80
    protocol: TCP
    targetPort: 8080
  type: NodePort


root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/yaml# kubectl  apply -f tomcat-deploy.yaml -f  tomcat-svc.yaml
deployment.apps/tomcat-deployment created
service/tomcat-service created

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/yaml# curl  http://172.31.7.111:31080/myapp/
<h1>tomcat app1</h1>

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/1.tomcat/yaml# curl  http://172.31.7.111:31080/metrics/
# HELP tomcat_session_active_total Number of active sessions
# TYPE tomcat_session_active_total gauge

# Prometheus scrape config (static target for the Tomcat NodePort)
  - job_name: "tomcat-monitor-metrics"
    static_configs:
      - targets: ["172.31.7.111:31080"]

2.2 Redis

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# cat redis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: magedu
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
      - name: redis
        image: redis:4.0.14
        resources:
          requests:
            cpu: 200m
            memory: 156Mi
        ports:
        - containerPort: 6379
      - name: redis-exporter
        image: oliver006/redis_exporter:latest
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
        ports:
        - containerPort: 9121

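Because redis_exporter runs as a sidecar in the same pod as Redis, it reaches it on localhost:6379, which is the exporter's default target address, so no extra connection flags are needed in this deployment.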
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# cat redis-exporter-svc.yaml
kind: Service  # resource type
apiVersion: v1
metadata:
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: "9121"
  name: redis-exporter-service
  namespace: magedu
spec:
  selector:
    app: redis
  ports:
  - nodePort: 31082
    name: prom
    port: 9121
    protocol: TCP
    targetPort: 9121
  type: NodePort
root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# cat redis-redis-svc.yaml
kind: Service  # resource type
apiVersion: v1
metadata:
#  annotations:
#    prometheus.io/scrape: 'false'
  name: redis-redis-service
  namespace: magedu
spec:
  selector:
    app: redis
  ports:
  - nodePort: 31081
    name: redis
    port: 6379
    protocol: TCP
    targetPort: 6379
  type: NodePort

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# kubectl  apply -f redis-deployment.yaml -f redis-exporter-svc.yaml -f  redis-redis-svc.yaml
deployment.apps/redis created
service/redis-exporter-service created
service/redis-redis-service created

# Verify Redis

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# apt install -y redis-server

root@k8s-master1:~/day20/1.prometheus-case-files/app-monitor-case/2.redis/yaml# redis-cli  -h 172.31.7.111 -p 31081
172.31.7.111:31081> info
# Server
redis_version:4.0.14
redis_git_sha1:00000000
redis_git_dirty:0
redis_build_id:165c932261a105d7
redis_mode:standalone

172.31.7.111:31081> set k1 v1
OK
172.31.7.111:31081> get k1
"v1"
172.31.7.111:31081> SELECT 2
OK
172.31.7.111:31081[2]> SELECT 1
OK
172.31.7.111:31081[1]>

172.31.7.111:31081[1]> SELECT 0
OK
172.31.7.111:31081> KEYS *
1) "k2"
2) "k1"

172.31.7.111:31081> SELECT 5
OK
172.31.7.111:31081[5]> set k1 v1
OK
172.31.7.111:31081[5]> set k2 v2
OK
172.31.7.111:31081[5]> set k2 v3
OK
172.31.7.111:31081[5]> set k3 v3
OK
172.31.7.111:31081[5]> set k4 v4
OK

172.31.7.111:31081[5]> KEYS *
1) "k2"
2) "k1"
3) "k4"
4) "k3"

root@k8s-master1:~# curl http://172.31.7.111:31082
<html>
<head><title>Redis Exporter v1.33.0</title></head>
<body>
<h1>Redis Exporter v1.33.0</h1>
<p><a href='/metrics'>Metrics</a></p>
</body>
</html>
# Prometheus scrape config for redis_exporter
  - job_name: "redis-monitor-metrics"
    static_configs:
     - targets: ["172.31.7.111:31082"]

2.3 MySQL

# Install MySQL (here MariaDB) and allow remote connections
root@prometheus-server02:~# apt install -y mariadb-server
cp /etc/mysql/mariadb.conf.d/50-server.cnf{,.bak}
sed -i s'/127.0.0.1/0.0.0.0/g' /etc/mysql/mariadb.conf.d/50-server.cnf
systemctl  restart mysql

# Create the exporter account
mysql
 CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY 'imnot007*';
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';

# Verify the account (quote the password so the shell does not expand the *)
mysql -umysql_exporter -p'imnot007*' -hlocalhost

# Install mysqld_exporter
root@prometheus-server02:~# cd /usr/local/src/

root@prometheus-server02:/usr/local/src# wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz

root@prometheus-server02:/usr/local/src# tar xf mysqld_exporter-0.14.0.linux-amd64.tar.gz
root@prometheus-server02:/usr/local/src# cp  mysqld_exporter-0.14.0.linux-amd64/mysqld_exporter  /usr/local/bin/

tee /root/.my.cnf << "EOF"
[client]
user=mysql_exporter
password=imnot007*
EOF

tee /etc/systemd/system/mysqld_exporter.service << "EOF"
[Unit]
Description=Prometheus MySQL Exporter
After=network.target
[Service]
ExecStart=/usr/local/bin/mysqld_exporter --config.my-cnf=/root/.my.cnf
[Install]
WantedBy=multi-user.target

EOF

systemctl daemon-reload && systemctl restart mysqld_exporter && systemctl enable mysqld_exporter


root@prometheus-server02:/usr/local/src# curl  http://172.31.1.102:9104/metrics
# TYPE go_gc_cycles_automatic_gc_cycles_total counter
go_gc_cycles_automatic_gc_cycles_total 1
# HELP go_gc_cycles_forced_gc_cycles_total Count of completed GC cycles forced by the application.
# TYPE go_gc_cycles_forced_gc_cycles_total counter
go_gc_cycles_forced_gc_cycles_total 0
# HELP go_gc_cycles_total_gc_cycles_total Count of all completed GC cycles.
# TYPE go_gc_cycles_total_gc_cycles_total counter
go_gc_cycles_total_gc_cycles_total 1
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
100 24354    0 24354    0     0  1796k      0 --:--:-- --:--:-- --:--:-- 1829k
  - job_name: mysql-monitor-172.31.1.102
    static_configs:
      - targets: ['172.31.1.102:9104']
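As an extra sanity check beyond the raw scrape above, the exporter's mysql_up gauge reports whether the MySQL login itself succeeded (1) or failed (0):

# mysql_up should be 1 if mysqld_exporter can connect with /root/.my.cnf
curl -s http://172.31.1.102:9104/metrics | grep '^mysql_up'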

2.4 HAProxy

tee -a /etc/haproxy/haproxy.cfg << "EOF"
listen stats
  bind :8899
  stats enable
  #stats hide-version
  stats uri /haproxy-status
  stats realm HAProxy\ Stats\ Page
  stats auth haadmin:123456
  stats auth admin:123456
EOF
systemctl  reload haproxy

root@k8s-ha1:~# curl 172.31.7.109:8899/haproxy-status
<html><body><h1>401 Unauthorized</h1>
You need a valid user and password to access this content.
</body></html>
# a 401 without credentials is expected; retry with: curl -u haadmin:123456 172.31.7.109:8899/haproxy-status

# Deploy haproxy_exporter
root@k8s-ha1:/usr/local/src# tar xf haproxy_exporter-0.15.0.linux-amd64.tar.gz
root@k8s-ha1:/usr/local/src# mv haproxy_exporter-0.15.0.linux-amd64/haproxy_exporter  /usr/local/bin/
root@k8s-ha1:/usr/local/src# haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock &
# Alternative via the stats page: haproxy_exporter --haproxy.scrape-uri="http://haadmin:123456@127.0.0.1:8899/haproxy-status;csv"
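The unix-socket mode above relies on haproxy.cfg exposing an admin socket; the Debian/Ubuntu haproxy package ships one by default in the global section (stats socket /run/haproxy/admin.sock mode 660 level admin). A quick check that the socket answers, assuming socat is installed:

echo "show stat" | socat stdio /run/haproxy/admin.sock | head -n 2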

  - job_name: 'haproxy-monitor-metrics'
    static_configs:
    - targets: ['172.31.7.109:9101']

3. Summary of how Prometheus collects metrics through exporters

An exporter exposing the application's required metrics is written or deployed. Once started, the exporter collects data from its target and serves it over an HTTP metrics endpoint; Prometheus scrapes that endpoint on its configured interval and stores the samples. Grafana queries the stored series for dashboards, while PromQL-based alerting rules evaluate the same data and hand firing alerts to Alertmanager for notification.
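A minimal end-to-end check of that flow; the node-exporter address comes from the lab below, while the Prometheus server address (172.31.1.101:9090) is an assumption:

# 1. the exporter serves metrics over HTTP:
curl -s http://172.31.7.111:9100/metrics | grep '^node_memory_MemFree_bytes'
# 2. after a scrape cycle, the same series is queryable from Prometheus' HTTP API:
curl -s 'http://172.31.1.101:9090/api/v1/query?query=node_memory_MemFree_bytes'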

4. Prometheus with Alertmanager: email, DingTalk, and WeChat alerts
Custom alert content is implemented through the DingTalk and WeChat Work alert templates.
4.1 Email alerts

cd  /usr/local/src/
tar xf alertmanager-0.25.0.linux-amd64.tar.gz

root@prometheus-server02:/usr/local/src# mv alertmanager-0.25.0.linux-amd64 /apps/
root@prometheus-server02:/usr/local/src# ln -s /apps/alertmanager-0.25.0.linux-amd64/ /apps/alertmanager

tee /etc/systemd/system/alertmanager.service << "EOF"
[Unit]
Description=Prometheus alertmanager
After=network.target

[Service]
ExecStart=/apps/alertmanager/alertmanager --config.file="/apps/alertmanager/alertmanager.yml"

[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl restart alertmanager && systemctl enable alertmanager

root@prometheus-server02:/usr/local/src# netstat  -ntlp|grep alert
tcp6       0      0 :::9094                 :::*                    LISTEN      87688/alertmanager
tcp6       0      0 :::9093                 :::*                    LISTEN      87688/alertmanager

# Configure Alertmanager
tee /apps/alertmanager/alertmanager.yml << "EOF"
global:
  resolve_timeout: 2m
  smtp_smarthost: 'smtp.mxhichina.com:465'
  smtp_from: 'mon@iclinux.com'
  smtp_auth_username: 'mon@iclinux.com'
  smtp_auth_password: 'xxxxxxxx'
  #smtp_hello: '@aliyun.com'
  smtp_require_tls: false

route: # routing: how alerts are distributed to receivers
  group_by: ['alertname'] # label used to group alerts
  group_wait: 2s
  group_interval: 3s
  repeat_interval: 3s
  receiver: 'email' # default receiver

receivers:
- name: 'email'
  #webhook_configs:
  #- url: 'http://127.0.0.1:5001/'
  email_configs:
    - to: 'mon@iclinux.com'

inhibit_rules: # inhibition rules
  - source_match: # when an alert at this severity is firing, matching lower-severity alerts are suppressed
      severity: 'critical' # severity of the source alert
    target_match:
      severity: 'warning' # newly produced 'warning' alerts are inhibited while a matching 'critical' alert is active
    equal: ['alertname', 'dev', 'instance'] # labels that must be equal between source and target alerts
EOF
systemctl  restart alertmanager.service
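To exercise the email path without waiting for a real rule to fire, a synthetic alert can be injected with amtool, which ships in the alertmanager tarball (label and annotation values here are arbitrary):

/apps/alertmanager/amtool alert add alertname=TestMail severity=critical instance=test \
  --annotation=summary="amtool test alert" \
  --alertmanager.url=http://172.31.1.102:9093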

# Configure Prometheus to point at Alertmanager
root@prometheus-server01:/apps/prometheus# vim prometheus.yml
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - 172.31.1.102:9093
rule_files:
  - "/apps/prometheus/rules/server_rules.yaml"

# Alerting rules
cd /apps/prometheus && mkdir rules

tee /apps/prometheus/rules/server_rules.yaml << "EOF"
groups:
  - name: alertmanager_pod.rules
    rules:
    - alert: Pod_all_cpu_usage
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 1  # threshold deliberately set low (1%) so the alert fires for the demo; validate the expression in the PromQL console first
      for: 10s
      labels:
        severity: critical
        service: pods
      annotations:
        description: Container {{ $labels.name }} CPU usage above 1% (current value is {{ $value }})
        summary: Dev CPU load alert

    - alert: Pod_all_memory_usage
      #expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 # memory above 10%
      expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2 # intended threshold is free memory above 2 GiB; the correct expression would compare against 2*1024*1024*1024
      for: 10s
      labels:
        severity: critical
      annotations:
        description: Container {{ $labels.name }} memory usage above 2G (current value is {{ $value }})
        summary: Dev memory load alert

    - alert: Pod_all_network_receive_usage
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1  ## threshold deliberately wrong, for demo purposes
      for: 10s
      labels:
        severity: critical
      annotations:
        description: Container {{ $labels.name }} network_receive above 50M (current value is {{ $value }})

    - alert: Node_memory_free
      expr: node_memory_MemFree_bytes > 1 # deliberately trivial threshold (1 byte) so the alert always fires
      for: 10s
      labels:
        severity: critical
      annotations:
        description: node {{ $labels.name }} has more than 1 byte of free memory, current value {{ $value }}

EOF
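Before restarting Prometheus, the rule file can be validated with promtool from the Prometheus tarball:

/apps/prometheus/promtool check rules /apps/prometheus/rules/server_rules.yaml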


# Create test pods (cAdvisor and node-exporter DaemonSets)

root@k8s-master1:~/day20/1.prometheus-case-files# cat case1-daemonset-deploy-cadvisor.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: cadvisor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: cAdvisor
  template:
    metadata:
      labels:
        app: cAdvisor
    spec:
      tolerations:    # tolerate the master NoSchedule taint
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
      hostNetwork: true
      restartPolicy: Always   # restart policy
      containers:
      - name: cadvisor
        image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.45.0
        imagePullPolicy: IfNotPresent  # image pull policy
        ports:
        - containerPort: 8080
        volumeMounts:
          - name: root
            mountPath: /rootfs
          - name: run
            mountPath: /var/run
          - name: sys
            mountPath: /sys
          - name: docker
            mountPath: /var/lib/docker
            #mountPath: /var/lib/containerd
      volumes:
      - name: root
        hostPath:
          path: /
      - name: run
        hostPath:
          path: /var/run
      - name: sys
        hostPath:
          path: /sys
      - name: docker
        hostPath:
          path: /var/lib/docker
          #path: /var/lib/containerd
root@k8s-master1:~/day20/1.prometheus-case-files# cat case2-daemonset-deploy-node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
        k8s-app: node-exporter
  template:
    metadata:
      labels:
        k8s-app: node-exporter
    spec:
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
      containers:
      - image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/node-exporter:v1.5.0
        imagePullPolicy: IfNotPresent
        name: prometheus-node-exporter
        ports:
        - containerPort: 9100
          hostPort: 9100
          protocol: TCP
          name: metrics
        volumeMounts:
        - mountPath: /host/proc
          name: proc
        - mountPath: /host/sys
          name: sys
        - mountPath: /host
          name: rootfs
        args:
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /
      hostNetwork: true
      hostPID: true
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"
  labels:
    k8s-app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  type: NodePort
  ports:
  - name: http
    port: 9100
    nodePort: 39100
    protocol: TCP
  selector:
    k8s-app: node-exporter

root@k8s-master1:~/day20/1.prometheus-case-files# kubectl  apply -f case1-daemonset-deploy-cadvisor.yaml  -f case2-daemonset-deploy-node-exporter.yaml
daemonset.apps/cadvisor created
daemonset.apps/node-exporter created
service/node-exporter created

# Prometheus static scrape configs for cAdvisor and node-exporter
  - job_name: "K8S-POD"
    static_configs:
      - targets: ["172.31.7.111:8080","172.31.7.112:8080","172.31.7.113:8080"]
  - job_name: "K8S-node"
    static_configs:
      - targets: ["172.31.7.111:9100","172.31.7.112:9100","172.31.7.113:9100"]

4.2 DingTalk alerts

Verify the DingTalk robot with either of the following scripts:

#!/bin/bash
source   /etc/profile
#PHONE=$1
#SUBJECT=$2
MESSAGE=$1

/usr/bin/curl -X "POST"  'https://oapi.dingtalk.com/robot/send?access_token=65cbfef3732b87f78a200973bbbed0d06f73ed38ae0ff04667fa1fec580faa41' \
-H 'Content-Type: application/json' \
-d '{"msgtype": "text",
    "text": {
         "content": "'${MESSAGE}'"
    }
  }'

#!/usr/bin/python3
import sys
import requests
import json
# DingTalk alert:
def info(msg):
    url = 'https://oapi.dingtalk.com/robot/send?access_token=65cbfef3732b87f78a200973bbbed0d06f73ed38ae0ff04667fa1fec580faa41'

    headers = {
        'Content-Type': 'application/json;charset=utf-8'
    }
    formdata = {
        "msgtype": "text",
        "text": {"content":str(msg)}
    }
    #print(formdata)
    requests.post(url=url, data=json.dumps(formdata),headers=headers)
info(sys.argv[1])
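Either script takes the message text as its first argument; the message must contain the robot's configured keyword or DingTalk rejects it (the file names below are placeholders):

bash dingding_test.sh "alertname: test message"
python3 dingding_test.py "alertname: test message"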

Install the prometheus-webhook-dingtalk plugin

# Download from GitHub
cd /usr/local/src && wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus-server03:/usr/local/src# tar xf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus-server03:/usr/local/src# mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 /apps/

ln -sf /apps/prometheus-webhook-dingtalk-1.4.0.linux-amd64/  /apps/prometheus-webhook-dingtalk
cd  /apps/prometheus-webhook-dingtalk
nohup ./prometheus-webhook-dingtalk --web.enable-ui --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=https://oapi.dingtalk.com/robot/send?access_token=8a0a74aac1cac011698d64c95f9db734c1995b3f3ee0d6e5742d9ec4aadb2318" &
# Note: 'alertname' in the profile is the robot's keyword; if the message does not match a configured keyword, DingTalk returns an error

# Reconfigure the Alertmanager server to route alerts to DingTalk

tee /apps/alertmanager/alertmanager.yml << "EOF"
global:
  resolve_timeout: 2m
  smtp_smarthost: 'smtp.mxhichina.com:465'
  smtp_from: 'mon@iclinux.com'
  smtp_auth_username: 'mon@iclinux.com'
  smtp_auth_password: 'xxxxxx'
  #smtp_hello: '@aliyun.com'
  smtp_require_tls: false

route: # routing: how alerts are distributed to receivers
  group_by: ['alertname'] # label used to group alerts
  group_wait: 2s
  group_interval: 3s
  repeat_interval: 3s
  receiver: 'dingding' # default receiver

receivers:
  - name: 'email'
    #webhook_configs:
    #- url: 'http://127.0.0.1:5001/'
    email_configs:
      - to: 'mon@iclinux.com'
  - name: dingding
    webhook_configs:
    - url: 'http://172.31.1.103:8060/dingtalk/alertname/send' # 'alertname' is the DingTalk robot keyword (the target name in config.yml)
      send_resolved: true

inhibit_rules: # inhibition rules
  - source_match: # when an alert at this severity is firing, matching lower-severity alerts are suppressed
      severity: 'critical' # severity of the source alert
    target_match:
      severity: 'warning' # newly produced 'warning' alerts are inhibited while a matching 'critical' alert is active
    equal: ['alertname', 'dev', 'instance'] # labels that must be equal between source and target alerts
EOF
systemctl  restart alertmanager.service


# Message template on the webhook-dingtalk server

tee /apps/prometheus-webhook-dingtalk/template_dingtalk.yaml << "EOF"
{{ define "dingding.to.message" }}

{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}

=========  **Monitoring Alert** =========

**Alert source:**   Alertmanager
**Alert type:**     {{ $alert.Labels.alertname }}
**Severity:**       {{ $alert.Labels.severity }}
**Status:**         {{ .Status }}
**Host:**           {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
**Summary:**        {{ .Annotations.summary }}
**Details:**        {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**Host labels:**    {{ range .Labels.SortedPairs  }}  </br> [{{ .Name }}: {{ .Value | markdown | html }} ]
{{- end }} </br>

**Started at:**     {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}

========= Alert Resolved =========
**Alert source:**   Alertmanager
**Summary:**        {{ $alert.Annotations.summary }}
**Host:**           {{ .Labels.instance }}
**Alert type:**     {{ .Labels.alertname }}
**Severity:**       {{ $alert.Labels.severity }}
**Status:**         {{   .Status }}
**Details:**        {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**Started at:**     {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**Resolved at:**    {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

========= = **end** =  =========
{{- end }}
{{- end }}
{{- end }}
EOF


tee /apps/prometheus-webhook-dingtalk/config.yml << "EOF"
## Request timeout
# timeout: 5s

## Customizable templates path
templates:
  - /apps/prometheus-webhook-dingtalk/template_dingtalk.yaml
#   - contrib/templates/legacy/template.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'


## Targets, previously was known as "profiles"
targets:
  alertname:    # this target name must be the robot keyword
    url: https://oapi.dingtalk.com/robot/send?access_token=8a0a74aac1cac011698d64c95f9db734c1995b3f3ee0d6e5742d9ec4aadb2318
    # secret for signature
    # secret: SEC716a2b98ec7bbacf8a78f849caf17adf6450bb29b3d1b62ff53cd393839f1927
    message:
      # Use legacy template
      text: '{{ template "dingding.to.message" . }}'

EOF

./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --web.enable-ui --config.file="config.yml"
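To run the webhook as a managed service instead of a foreground process, a systemd unit in the same style as the others can be added (a sketch; paths follow the layout used above):

tee /etc/systemd/system/prometheus-webhook-dingtalk.service << "EOF"
[Unit]
Description=Prometheus webhook for DingTalk
After=network.target

[Service]
WorkingDirectory=/apps/prometheus-webhook-dingtalk
ExecStart=/apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --web.enable-ui --config.file="/apps/prometheus-webhook-dingtalk/config.yml"

[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl restart prometheus-webhook-dingtalk && systemctl enable prometheus-webhook-dingtalk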

4.3 WeChat alerts

# Alertmanager template setup
tee message_template.templ << "EOF"
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********Alert Notification**********
Alert type: {{ $alert.Labels.alertname }}
Severity: {{ $alert.Labels.severity }}
{{- end }}
=====================
Summary: {{ $alert.Annotations.summary }}
Details: {{ $alert.Annotations.description }}
Started at: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}Instance: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********Recovery Notification**********
Alert type: {{ $alert.Labels.alertname }}
Severity: {{ $alert.Labels.severity }}
{{- end }}
=====================
Summary: {{ $alert.Annotations.summary }}
Details: {{ $alert.Annotations.description }}
Started at: {{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}
Resolved at: {{ $alert.EndsAt.Local.Format "2006-01-02 15:04:05" }}
{{ if gt (len $alert.Labels.instance) 0 -}}Instance: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- end }}
EOF
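The Alertmanager side then needs a templates entry and a wechat receiver, with route.receiver switched to 'wechat'. This is a sketch; corp_id, agent_id, api_secret and to_user are placeholders obtained from the WeChat Work admin console:

# additions to alertmanager.yml:
templates:
  - '/apps/alertmanager/message_template.templ'

receivers:
  - name: wechat
    wechat_configs:
      - corp_id: 'ww0000000000000000'       # placeholder
        agent_id: '1000002'                 # placeholder
        api_secret: 'your-api-secret'       # placeholder
        to_user: '@all'
        message: '{{ template "wechat.default.message" . }}'
        send_resolved: true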