docker-compose部署Prometheus、Grafana监控、钉钉告警（一）

一、Prometheus

架构

部署

机器规划

服务器名称	IP Address	说明
Prometheus主服务	128.0.255.96	定时抓取metrics、时序存储、API接口
Alertmanager	128.0.255.96	告警消息处理和发送
prometheus-webhook-dingtalk	128.0.255.96	告警消息推送到钉钉

文件目录规划

2.1 Prometheus server

# Create the Prometheus working directory with sub-directories for the config file, TSDB data and alert rules.
mkdir prometheus && cd prometheus
mkdir config data rules

编写Prometheus配置文件prometheus.yml

# Prometheus server configuration: global timing, rule files, the Alertmanager
# endpoint and the list of scrape jobs.
# Indentation uses spaces only; YAML does not allow TAB characters for indentation.
global:
  # Scrape targets every 15 seconds (Prometheus default is 1m).
  # Can be overridden per job.
  scrape_interval: 15s
  # Per-scrape timeout (Prometheus default is 10s).
  scrape_timeout: 5s
  # Evaluate rules every 15 seconds (Prometheus default is 1m).
  evaluation_interval: 15s
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'line-monitor'

# Rule files to load.
rule_files:
  # - "../rules/*.yml"
  - "/etc/prometheus/rules/*_rules.yml"

# Prometheus only fires alerts based on the rule files above; it does not send
# e-mail or other notifications itself. Delivery is handled by Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["128.0.255.96:9093"]

# Scrape configurations, one entry per job.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any time series
  # scraped from this config.
  # Metrics exposed by Prometheus itself.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['prometheus:9090']

  # node-exporter: metrics for Linux machines / nodes.
  - job_name: 'node-exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['centos1:9100', 'centos2:9100', 'prometheus-node-exporter:9100']

  # windows_exporter: metrics for Windows machines.
  - job_name: 'windows_exporter'
    static_configs:
      - targets: ['128.0.23.17:9182', '128.0.202.103:9182', '128.0.202.105:9182']

  # Microsoft SQL Server metrics, collected through sql_exporter.
  - job_name: 'sql-exporter'
    static_configs:
      - targets: ['sql-exporter:9399']
        labels:
          exporter_type: sql-exporter

  # Microsoft SQL Server metrics, collected through prometheus-mssql-exporter
  # (described as the officially recommended option by the original author).
  # One exporter instance per database server; the labels record which
  # SQL Server each exporter port corresponds to.
  - job_name: 'prometheus-mssql-exporter'
    static_configs:
      - targets: ['128.0.255.96:14000']
        labels:
          host: '128.0.202.103:1433'
          exporter_type: prom-mssql-exporter
          exported_instance: '128.0.202.103:1433'
      - targets: ['128.0.255.96:14001']
        labels:
          host: '128.0.23.17:1433'
          exporter_type: prom-mssql-exporter
          exported_instance: '128.0.23.17:1433'

  # redis_exporter: Redis metrics, scraped more often than the global interval.
  - job_name: 'redis_exporter'
    scrape_interval: 5s
    static_configs:
      - targets: ['128.0.255.96:9121', '128.0.255.96:9122']

编写docker-compose-prometheus.yml文件

# Compose file for the Prometheus server.
# File format 3.5 is the first version that accepts a custom `name` on a
# network, which the `networks` section below relies on.
# Indentation uses spaces only; YAML does not allow TAB characters for indentation.
version: '3.5'
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      # Host working directory (contains config/ and rules/) -> /etc/prometheus
      - /home/prometheus/docker/prometheus/prometheus:/etc/prometheus
      # Host data directory -> TSDB storage path passed below
      - /home/prometheus/docker/prometheus/prometheus/data:/prometheus
    command:
      - '--config.file=/etc/prometheus/config/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    networks:
      - prometheus

networks:
  # Bridge network with a fixed name and subnet.
  prometheus:
    name: prometheus
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 10.24.0.0/16

启动prometheus

# Create and start the services defined in docker-compose-prometheus.yml in the background (-d = detached).
docker-compose -f docker-compose-prometheus.yml up -d

验证部署是否成功

浏览器打开 http://128.0.255.96:9090/

浏览器打开 http://128.0.255.96:9090/metrics

出现以上显示即为部署成功。

因为prometheus.yml文件中job_name: 'prometheus'配置了prometheus自身metrics的收集，所以'Targets'中可以看到http://prometheus:9090/metrics是在线的

2.2 Alertmanager

# Create the Alertmanager working directory and its sub-directories.
# config/ and data/ are the directories bind-mounted by the Alertmanager compose file;
# dingtalk/ is presumably for the DingTalk webhook configuration - TODO confirm.
mkdir alertmanager && cd alertmanager
mkdir config data dingtalk

编写alertmanager.yml配置文件

# Alertmanager configuration: SMTP settings, notification templates, routing
# tree, receivers and inhibition rules.
# Indentation uses spaces only; YAML does not allow TAB characters for indentation.
global:
  # Time after which an alert is declared resolved when it is no longer reported.
  resolve_timeout: 1m

  # The smarthost and SMTP sender used for mail notifications.
  smtp_from: "186****7521@163.com"
  smtp_smarthost: 'smtp.163.com:465'
  smtp_auth_username: "186****7521@163.com"
  smtp_auth_password: "TVJT********BMKB"
  smtp_require_tls: false

# Location of the e-mail / WeCom notification templates.
# The DingTalk template is configured separately.
templates:
  - '/config/templates/*.tmpl'

# Root route that every incoming alert enters; defines the dispatch policy.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 5s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 30s

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 1m

  # Default receiver: used when an alert is not matched by any child route.
  receiver: 'default'

  # Child routes.
  routes:
    - receiver: default
      match_re:
        notify_type: email  # alerts labelled notify_type=email
      group_wait: 5s
    - receiver: webhook
      match_re:
        notify_type: dingtalk  # alerts labelled notify_type=dingtalk
      group_wait: 5s

# Receivers, i.e. how notifications are delivered.
receivers:
  - name: 'default'
    email_configs:
      - to: '463***624@qq.com'
        # Renders the "alert.html" template loaded via `templates` above.
        html: '{{ template "alert.html" . }}'
        headers: { Subject: "[WARN] 报警邮件" }
  - name: 'webhook'
    webhook_configs:
      - url: 'http://128.0.255.96:8060/dingtalk/webhook1/send'
        send_resolved: true

# Inhibition rules.
inhibit_rules:
  # While an alert matching source_match is firing ...
  - source_match:
      severity: 'critical'
    # ... alerts matching target_match are muted ...
    target_match:
      severity: 'warning'
    # ... provided both alerts carry equal values for all of these labels.
    equal: ['alertname', 'dev', 'instance']

编写alert_email.tmpl文件

创建文件夹

# Create the templates directory and open the e-mail template for editing.
# NOTE(review): alertmanager.yml loads '/config/templates/*.tmpl' and the compose file mounts
# alertmanager/config at /config, so this presumably has to be run inside alertmanager/config - confirm.
mkdir templates && cd templates && vim alert_email.tmpl

{{/* Defines the "alert.html" template: an HTML table with a header row followed by one row per entry in .Alerts, showing the "alertname" label and StartsAt. */}}
{{ define "alert.html" }}
<table>
	<tr><td>报警名</td><td>开始时间</td></tr>
	{{ range $i, $alert := .Alerts }}
		<tr><td>{{ index $alert.Labels "alertname" }}</td><td>{{ $alert.StartsAt }}</td></tr>
	{{ end }}
</table>
{{ end }}

编写docker-compose-alertmanager.yml文件

# Compose file for Alertmanager.
# File format 3.5 is the first version that accepts a custom `name` on a
# network, which the `networks` section below relies on.
# Indentation uses spaces only; YAML does not allow TAB characters for indentation.
version: '3.5'
services:
  alertmanager:
    image: prom/alertmanager:latest
    # NOTE(review): 'altermanager' looks like a typo of 'alertmanager'. It is
    # kept as-is because renaming changes the container and host name.
    container_name: altermanager
    hostname: altermanager
    restart: always
    ports:
      - '9093:9093'
    volumes:
      # Host config directory (alertmanager.yml, templates/) -> /config
      - '/home/prometheus/docker/prometheus/alertmanager/config:/config'
      - '/home/prometheus/docker/prometheus/alertmanager/data:/alertmanager/data'
    command:
      - '--config.file=/config/alertmanager.yml'
    networks:
      - prometheus

networks:
  # Network with the fixed name 'prometheus'.
  # NOTE(review): if a network with this name was already created by another
  # compose project, `external: true` may be needed here - confirm.
  prometheus:
    name: prometheus

启动Alertmanager

# Create and start the services defined in docker-compose-alertmanager.yml in the background (-d = detached).
docker-compose -f docker-compose-alertmanager.yml up -d

验证部署是否成功

浏览器打开 http://128.0.255.96:9093/

2.3 Pushgateway(未使用)

posted @ 2023-04-19 23:53 Nine4酷阅读(584) 评论(0) 收藏举报

刷新页面返回顶部

docker-compose部署Prometheus、Grafana监控、钉钉告警（一）

docker-compose部署Prometheus、Grafana监控、钉钉告警（一）

一、Prometheus

公告