docker-compose部署prometheus、Grafana监控、钉钉告警(一)

docker-compose部署prometheus、Grafana监控、钉钉告警(一)

一、Prometheus

  1. 架构

  2. 部署
  • 机器规划

    服务器名称 IP Address 说明
    Prometheus主服务 128.0.255.96 定时抓取metrics、时序存储、API接口
    Alertmanager 128.0.255.96 告警消息处理和发送
    prometheus-webhook-dingtalk 128.0.255.96 告警消息推送到钉钉
  • 文件目录规划

    2.1 Prometheus server

    • 创建prometheus相关目录:
    # Create the Prometheus working directory and enter it.
    mkdir prometheus && cd prometheus
    # config/ holds prometheus.yml, data/ is mounted as the TSDB storage path,
    # rules/ holds the alert rule files matched by rule_files.
    mkdir config data rules
    
    • 编写prometheus配置文件prometheus.yml
    # Prometheus server configuration (prometheus.yml).
    # Indentation uses spaces only: YAML does not allow TAB characters for
    # indentation, so a tab-indented copy of this file fails to load.
    global:
      # How often targets are scraped. Prometheus defaults to 1m; each job may
      # override this value.
      scrape_interval: 15s # By default, scrape targets every 15 seconds.

      # Per-scrape timeout (Prometheus default is 10s). It must not exceed the
      # scrape interval of any job, including the 5s interval used below.
      scrape_timeout: 5s

      # How often rules are evaluated (Prometheus default is 1m).
      evaluation_interval: 15s

      # Attach these labels to any time series or alerts when communicating with
      # external systems (federation, remote storage, Alertmanager).
      external_labels:
        monitor: 'line-monitor'

    # Rule files, as seen from inside the Prometheus container.
    rule_files:
      # - "../rules/*.yml"
      - "/etc/prometheus/rules/*_rules.yml"

    # Prometheus fires alerts according to the rule files above but does not
    # deliver notifications itself; delivery (e.g. e-mail) is done by Alertmanager.
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["128.0.255.96:9093"]

    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any time series
      # scraped from this config.

      # Metrics exposed by Prometheus itself.
      - job_name: 'prometheus'
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
        static_configs:
          - targets: ['prometheus:9090']

      # node-exporter: Linux machine / node metrics.
      - job_name: 'node-exporter'
        metrics_path: /metrics
        static_configs:
          - targets: ['centos1:9100', 'centos2:9100', 'prometheus-node-exporter:9100']

      # windows_exporter: Windows machine metrics.
      - job_name: 'windows_exporter'
        static_configs:
          - targets: ['128.0.23.17:9182', '128.0.202.103:9182', '128.0.202.105:9182']

      # Microsoft SQL Server metrics, collected through sql_exporter.
      - job_name: 'sql-exporter'
        static_configs:
          - targets: ['sql-exporter:9399']
            labels:
              exporter_type: sql-exporter

      # Microsoft SQL Server metrics, collected through prometheus-mssql-exporter
      # (described as the officially recommended exporter by the original author).
      # One exporter endpoint per database server; the labels record which
      # SQL Server instance each endpoint reports on.
      - job_name: 'prometheus-mssql-exporter'
        static_configs:
          - targets: ['128.0.255.96:14000']
            labels:
              host: '128.0.202.103:1433'
              exporter_type: prom-mssql-exporter
              exported_instance: '128.0.202.103:1433'
          - targets: ['128.0.255.96:14001']
            labels:
              host: '128.0.23.17:1433'
              exporter_type: prom-mssql-exporter
              exported_instance: '128.0.23.17:1433'

      # redis_exporter: Redis metrics, scraped more often than the global default.
      - job_name: 'redis_exporter'
        scrape_interval: 5s
        static_configs:
          - targets: ['128.0.255.96:9121', '128.0.255.96:9122']
    
    • 编写docker-compose-prometheus.yml文件
    # Compose file that runs the Prometheus server.
    # Indentation uses spaces only (YAML forbids TABs for indentation).
    #
    # The custom network `name:` key below needs Compose file format 3.5 or
    # later; a bare '3' is read as 3.0 by docker-compose v1, which rejects it.
    version: '3.5'
    services:
      prometheus:
        image: prom/prometheus
        container_name: prometheus
        restart: always
        ports:
          - "9090:9090"
        volumes:
          # Host working directory (contains config/, data/, rules/).
          - /home/prometheus/docker/prometheus/prometheus:/etc/prometheus
          # Host data/ directory backs the TSDB storage path.
          - /home/prometheus/docker/prometheus/prometheus/data:/prometheus
        command:
          - '--config.file=/etc/prometheus/config/prometheus.yml'
          - '--storage.tsdb.path=/prometheus'
        networks:
          - prometheus

    networks:
      # Bridge network with a fixed name so that other compose files can attach
      # their containers to it.
      prometheus:
        name: prometheus
        driver: bridge
        ipam:
          driver: default
          config:
            - subnet: 10.24.0.0/16
    
    • 启动prometheus
    # Create and start the services defined in docker-compose-prometheus.yml
    # in the background (-d = detached).
    docker-compose -f docker-compose-prometheus.yml up -d
    
    • 验证部署是否成功

    浏览器打开 http://128.0.255.96:9090/

    浏览器打开 http://128.0.255.96:9090/metrics

    出现以上显示即为部署成功。

    因为prometheus.yml文件中job_name: 'prometheus'配置了prometheus自身metrics的收集,所以'Targets'中可以看到http://prometheus:9090/metrics是在线的

    • 部署完后目录结构


    2.2 Alertmanager

    • 创建文件目录
    # Create the Alertmanager working directory and enter it.
    mkdir alertmanager && cd alertmanager
    # config/ is mounted into the container as /config, data/ as
    # /alertmanager/data; dingtalk/ is presumably for the DingTalk webhook
    # configuration -- TODO confirm, it is not used in this part.
    mkdir config data dingtalk
    
    • 编写alertmanager.yml配置文件
    # Alertmanager configuration (alertmanager.yml).
    # Indentation uses spaces only: YAML does not allow TAB characters for
    # indentation, so a tab-indented copy of this file fails to load.
    global:
      # Time after which an alert that stops being reported is declared resolved.
      resolve_timeout: 1m

      # The smarthost and SMTP sender used for mail notifications.
      # Values are masked placeholders; keep real credentials out of version control.
      smtp_from: "186****7521@163.com"
      smtp_smarthost: 'smtp.163.com:465'
      smtp_auth_username: "186****7521@163.com"
      smtp_auth_password: "TVJT********BMKB"
      smtp_require_tls: false

    # Location of the notification templates for e-mail / WeCom, as seen from
    # inside the container. DingTalk templates are configured separately.
    templates:
      - '/config/templates/*.tmpl'

    # Root route that every incoming alert enters; it defines the dispatch policy.
    route:
      # The labels by which incoming alerts are grouped together. For example,
      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
      # be batched into a single group.
      group_by: ['alertname', 'cluster', 'service']

      # When a new group of alerts is created by an incoming alert, wait at
      # least 'group_wait' to send the initial notification.
      # This way ensures that you get multiple alerts for the same group that start
      # firing shortly after another are batched together on the first
      # notification.
      group_wait: 5s

      # When the first notification was sent, wait 'group_interval' to send a batch
      # of new alerts that started firing for that group.
      group_interval: 30s

      # If an alert has successfully been sent, wait 'repeat_interval' to
      # resend them.
      repeat_interval: 1m

      # Default receiver: used when an alert matches none of the child routes.
      receiver: 'default'

      # Child routes, selected by the alert's notify_type label.
      # NOTE(review): newer Alertmanager releases deprecate match_re in favour of
      # `matchers`; kept as-is so the file behaves the same -- confirm before migrating.
      routes:
        - receiver: default
          match_re:
            notify_type: email # label matches "email"
          group_wait: 5s
        - receiver: webhook
          match_re:
            notify_type: dingtalk # label matches "dingtalk"
          group_wait: 5s

    # Receivers, i.e. how notifications are delivered.
    receivers:
      - name: 'default'
        email_configs:
          - to: '463***624@qq.com'
            # Body is rendered from the "alert.html" template.
            html: '{{ template "alert.html" . }}'
            headers: { Subject: "[WARN] 报警邮件" }
      - name: 'webhook'
        webhook_configs:
          - url: 'http://128.0.255.96:8060/dingtalk/webhook1/send'
            send_resolved: true

    # Inhibition rules.
    inhibit_rules:
      # While an alert matching source_match is firing ...
      - source_match:
          severity: 'critical'
        # ... alerts matching target_match are muted ...
        target_match:
          severity: 'warning'
        # ... provided both alerts carry equal values for all of these labels.
        equal: ['alertname', 'dev', 'instance']
    
    • 编写alert_email.tmpl文件

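    在 config 目录下创建 templates 文件夹(alertmanager.yml 中的模板路径为 /config/templates/*.tmpl,而 config 目录会挂载为容器内的 /config)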

    # Create the templates directory, enter it and open alert_email.tmpl in vim.
    mkdir templates && cd templates && vim alert_email.tmpl
    
    {{/* alert.html: HTML table with one row per alert (alert name, start time). */}}
    {{ define "alert.html" }}
    <table>
    	<tr><td>报警名</td><td>开始时间</td></tr>
    	{{ range .Alerts }}
    		<tr><td>{{ .Labels.alertname }}</td><td>{{ .StartsAt }}</td></tr>
    	{{ end }}
    </table>
    {{ end }}
    
    • 编写docker-compose-alertmanager.yml文件
    # Compose file that runs Alertmanager.
    # Indentation uses spaces only (YAML forbids TABs for indentation).
    #
    # The custom network `name:` key below needs Compose file format 3.5 or
    # later; a bare '3' is read as 3.0 by docker-compose v1, which rejects it.
    version: '3.5'
    services:
      alertmanager:
        image: prom/alertmanager:latest
        # NOTE(review): spelled "altermanager" (sic). Left unchanged because
        # other tooling may refer to the container by this name -- confirm
        # before renaming.
        container_name: altermanager
        hostname: altermanager
        restart: always
        ports:
          - '9093:9093'
        volumes:
          # Host config/ directory (alertmanager.yml and templates/).
          - '/home/prometheus/docker/prometheus/alertmanager/config:/config'
          # Host data/ directory for Alertmanager's data.
          - '/home/prometheus/docker/prometheus/alertmanager/data:/alertmanager/data'
        command:
          - '--config.file=/config/alertmanager.yml'
        networks:
          - prometheus

    networks:
      # Uses the fixed network name "prometheus", the same name that
      # docker-compose-prometheus.yml assigns to its network.
      # NOTE(review): if that network is created by the other compose project,
      # it may need to be declared `external: true` here -- confirm.
      prometheus:
        name: prometheus
    
    • 启动alertmanager
    # Create and start the services defined in docker-compose-alertmanager.yml
    # in the background (-d = detached).
    docker-compose -f docker-compose-alertmanager.yml up -d
    
    • 验证部署是否成功

    浏览器打开 http://128.0.255.96:9093/

    • 部署完后目录结构

    2.3 Pushgateway(未使用)

posted @   Nine4酷  阅读(511)  评论(0编辑  收藏  举报
编辑推荐:
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
阅读排行:
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现
· 25岁的心里话
点击右上角即可分享
微信分享提示