prometheus容器部署(prom、rule、alert、dingding)
prometheus
先创建好映射目录 /data/prometheus(配置)和 /data/promdata(数据),把 prometheus.yml、node-exporter-record-rules.yml、node-exporter-alert-rules.yml 三个配置文件放到 /data/prometheus 下,然后再启动容器
# cat prometheus.yml
# Main Prometheus configuration: scrape/evaluation cadence, rule files,
# the Alertmanager endpoint and the scrape targets.
global:
  scrape_interval: 60s
  evaluation_interval: 60s

# Recording rules and alerting rules, as paths inside the container.
rule_files:
  - /etc/prometheus/node-exporter-record-rules.yml
  - /etc/prometheus/node-exporter-alert-rules.yml

# Alerts are pushed to this Alertmanager instance.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.18.19:9093'

scrape_configs:
  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'
        labels:
          instance: prometheus

  # node_exporter hosts; the instance label is overridden to the bare IP.
  - job_name: linux
    static_configs:
      - targets:
          - '10.0.23.211:9100'
        labels:
          instance: '10.0.23.211'
      - targets:
          - '10.0.23.210:9100'
        labels:
          instance: '10.0.23.210'
# cat node-exporter-record-rules.yml
# Recording rules that pre-compute node_exporter metrics under the
# node_exporter:* namespace. Every rule (except the last) attaches a
# human-readable `desc`, a `unit` and job="linux" as labels.
#
# Fixes relative to the previous revision:
#   * disk:read:mb:rate / disk:write:mb:rate used each other's counter
#     (read was computed from written_bytes and vice versa).
#   * network:netin/netout:bit:rate are declared as bit/s but were computed
#     from byte counters without converting; they are now multiplied by 8.
#   * filefd_allocated:count is a plain count, so its unit is no longer "%".
#   * filesystem:used:percent computes the USED inode percentage; its desc
#     previously claimed "remaining".
groups:
  - name: linux
    rules:
      # 1 when the target is up, 0 when it is down.
      - record: node_exporter:up
        expr: up
        labels:
          desc: "节点是否在线, 在线1,不在线0"
          unit: " "
          job: "linux"

      # Seconds elapsed since the node booted.
      - record: node_exporter:node_uptime
        expr: 'time() - node_boot_time_seconds{}'
        labels:
          desc: "节点的运行时间"
          unit: "s"
          job: "linux"

      # ------------------------------------------------------------------
      # cpu
      # ------------------------------------------------------------------
      # Total CPU usage = 100 - idle percentage.
      - record: node_exporter:cpu:total:percent
        expr: '(1 - avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="idle"}[5m]))) * 100'
        labels:
          desc: "节点的cpu总消耗百分比"
          unit: "%"
          job: "linux"

      - record: node_exporter:cpu:idle:percent
        expr: '(avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="idle"}[5m]))) * 100'
        labels:
          desc: "节点的cpu idle百分比"
          unit: "%"
          job: "linux"

      - record: node_exporter:cpu:iowait:percent
        expr: '(avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="iowait"}[5m]))) * 100'
        labels:
          desc: "节点的cpu iowait百分比"
          unit: "%"
          job: "linux"

      - record: node_exporter:cpu:system:percent
        expr: '(avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="system"}[5m]))) * 100'
        labels:
          desc: "节点的cpu system百分比"
          unit: "%"
          job: "linux"

      - record: node_exporter:cpu:user:percent
        expr: '(avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="user"}[5m]))) * 100'
        labels:
          desc: "节点的cpu user百分比"
          unit: "%"
          job: "linux"

      # Everything that is not idle/iowait/system/user.
      - record: node_exporter:cpu:other:percent
        expr: '(avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode=~"softirq|nice|irq|steal"}[5m]))) * 100'
        labels:
          desc: "节点的cpu 其他的百分比"
          unit: "%"
          job: "linux"

      # ------------------------------------------------------------------
      # memory
      # ------------------------------------------------------------------
      - record: node_exporter:memory:total
        expr: 'node_memory_MemTotal_bytes{job="linux"}'
        labels:
          desc: "节点的内存总量"
          unit: byte
          job: "linux"

      - record: node_exporter:memory:free
        expr: 'node_memory_MemFree_bytes{job="linux"}'
        labels:
          desc: "节点的剩余内存量"
          unit: byte
          job: "linux"

      # Total minus MemFree (includes buffers/cache).
      - record: node_exporter:memory:used
        expr: 'node_memory_MemTotal_bytes{job="linux"} - node_memory_MemFree_bytes{job="linux"}'
        labels:
          desc: "节点的已使用内存量"
          unit: byte
          job: "linux"

      # Total minus MemAvailable.
      - record: node_exporter:memory:actualused
        expr: 'node_memory_MemTotal_bytes{job="linux"} - node_memory_MemAvailable_bytes{job="linux"}'
        labels:
          desc: "节点用户实际使用的内存量"
          unit: byte
          job: "linux"

      - record: node_exporter:memory:used:percent
        expr: '(1 - (node_memory_MemAvailable_bytes{job="linux"} / (node_memory_MemTotal_bytes{job="linux"}))) * 100'
        labels:
          desc: "节点的内存使用百分比"
          unit: "%"
          job: "linux"

      - record: node_exporter:memory:free:percent
        expr: '((node_memory_MemAvailable_bytes{job="linux"} / (node_memory_MemTotal_bytes{job="linux"}))) * 100'
        labels:
          desc: "节点的内存剩余百分比"
          unit: "%"
          job: "linux"

      # ------------------------------------------------------------------
      # load
      # ------------------------------------------------------------------
      - record: node_exporter:load:load1
        expr: 'sum by (instance) (node_load1{job="linux"})'
        labels:
          desc: "系统1分钟负载"
          unit: " "
          job: "linux"

      - record: node_exporter:load:load5
        expr: 'sum by (instance) (node_load5{job="linux"})'
        labels:
          desc: "系统5分钟负载"
          unit: " "
          job: "linux"

      - record: node_exporter:load:load15
        expr: 'sum by (instance) (node_load15{job="linux"})'
        labels:
          desc: "系统15分钟负载"
          unit: " "
          job: "linux"

      # ------------------------------------------------------------------
      # disk (only ext4/xfs filesystems are considered for space usage)
      # ------------------------------------------------------------------
      - record: node_exporter:disk:usage:total
        expr: 'node_filesystem_size_bytes{job="linux",fstype=~"ext4|xfs"}'
        labels:
          desc: "节点的磁盘总量"
          unit: byte
          job: "linux"

      - record: node_exporter:disk:usage:free
        expr: 'node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"}'
        labels:
          desc: "节点的磁盘剩余空间"
          unit: byte
          job: "linux"

      - record: node_exporter:disk:usage:used
        expr: 'node_filesystem_size_bytes{job="linux",fstype=~"ext4|xfs"} - node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"}'
        labels:
          desc: "节点的磁盘使用的空间"
          unit: byte
          job: "linux"

      - record: node_exporter:disk:used:percent
        expr: '(1 - node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{job="linux",fstype=~"ext4|xfs"}) * 100'
        labels:
          desc: "节点的磁盘的使用百分比"
          unit: "%"
          job: "linux"

      # Completed read operations per second.
      - record: node_exporter:disk:read:count:rate
        expr: 'irate(node_disk_reads_completed_total{job="linux"}[1m])'
        labels:
          desc: "节点的磁盘读取速率"
          unit: "次/秒"
          job: "linux"

      # Completed write operations per second.
      - record: node_exporter:disk:write:count:rate
        expr: 'irate(node_disk_writes_completed_total{job="linux"}[1m])'
        labels:
          desc: "节点的磁盘写入速率"
          unit: "次/秒"
          job: "linux"

      # Read throughput in MB/s: must use the *read* byte counter.
      - record: node_exporter:disk:read:mb:rate
        expr: '(irate(node_disk_read_bytes_total{job="linux"}[1m]))/1024/1024'
        labels:
          desc: "节点的设备读取MB速率"
          unit: "MB/s"
          job: "linux"

      # Write throughput in MB/s: must use the *written* byte counter.
      - record: node_exporter:disk:write:mb:rate
        expr: '(irate(node_disk_written_bytes_total{job="linux"}[1m]))/1024/1024'
        labels:
          desc: "节点的设备写入MB速率"
          unit: "MB/s"
          job: "linux"

      # ------------------------------------------------------------------
      # filesystem
      # ------------------------------------------------------------------
      # Percentage of inodes in use (1 - free/total).
      - record: node_exporter:filesystem:used:percent
        expr: '(1 - node_filesystem_files_free{job="linux",fstype=~"ext4|xfs"} / node_filesystem_files{job="linux",fstype=~"ext4|xfs"}) * 100'
        labels:
          desc: "节点的inode的使用百分比"
          unit: "%"
          job: "linux"

      # ------------------------------------------------------------------
      # filefd
      # ------------------------------------------------------------------
      # Absolute number of allocated file descriptors (a count, not a %).
      - record: node_exporter:filefd_allocated:count
        expr: 'node_filefd_allocated{job="linux"}'
        labels:
          desc: "节点的文件描述符打开个数"
          unit: "个"
          job: "linux"

      - record: node_exporter:filefd_allocated:percent
        expr: 'node_filefd_allocated{job="linux"}/node_filefd_maximum{job="linux"} * 100'
        labels:
          desc: "节点的文件描述符打开百分比"
          unit: "%"
          job: "linux"

      # ------------------------------------------------------------------
      # network (restricted to eth0|eth1|ens33|ens37)
      # ------------------------------------------------------------------
      # The source counters are in bytes; "* 8" converts to bits per second
      # so the value matches the rule name and the unit label.
      - record: node_exporter:network:netin:bit:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8'
        labels:
          desc: "节点网卡eth0每秒接收的比特数"
          unit: "bit/s"
          job: "linux"

      - record: node_exporter:network:netout:bit:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8'
        labels:
          desc: "节点网卡eth0每秒发送的比特数"
          unit: "bit/s"
          job: "linux"

      - record: node_exporter:network:netin:packet:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_receive_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))'
        labels:
          desc: "节点网卡每秒接收的数据包个数"
          unit: "个/秒"
          job: "linux"

      - record: node_exporter:network:netout:packet:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))'
        labels:
          desc: "节点网卡发送的数据包个数"
          unit: "个/秒"
          job: "linux"

      - record: node_exporter:network:netin:error:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))'
        labels:
          desc: "节点设备驱动器检测到的接收错误包的数量"
          unit: "个/秒"
          job: "linux"

      - record: node_exporter:network:netout:error:rate
        expr: 'avg by (environment,instance,device) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))'
        labels:
          desc: "节点设备驱动器检测到的发送错误包的数量"
          unit: "个/秒"
          job: "linux"

      - record: node_exporter:network:tcp:established:count
        expr: 'node_tcp_connection_states{job="linux", state="established"}'
        labels:
          desc: "节点当前established的个数"
          unit: "个"
          job: "linux"

      - record: node_exporter:network:tcp:timewait:count
        expr: 'node_tcp_connection_states{job="linux", state="time_wait"}'
        labels:
          desc: "节点timewait的连接数"
          unit: "个"
          job: "linux"

      - record: node_exporter:network:tcp:total:count
        expr: 'sum by (environment,instance) (node_tcp_connection_states{job="linux"})'
        labels:
          desc: "节点tcp连接总数"
          unit: "个"
          job: "linux"

      # ------------------------------------------------------------------
      # process
      # ------------------------------------------------------------------
      # Processes in state "Z". The rule name keeps the historical
      # spelling "zoom" so existing dashboards/alerts keep working.
      - record: node_exporter:process:zoom:total:count
        expr: 'node_processes_state{state="Z"}'
        labels:
          desc: "节点当前状态为zoom的个数"
          unit: "个"
          job: "linux"

      # ------------------------------------------------------------------
      # other
      # ------------------------------------------------------------------
      # Absolute clock offset as reported by the timex collector.
      - record: node_exporter:time:offset
        expr: 'abs(node_timex_offset_seconds{job="linux"})'
        labels:
          desc: "节点的时间偏差"
          unit: "s"
          job: "linux"

      # Number of CPUs per instance (counts distinct `cpu` label values).
      - record: node_exporter:cpu:count
        expr: "count by (instance) ( count by (instance,cpu) (node_cpu_seconds_total{ mode='system'}) )"
#
# cat node-exporter-alert-rules.yml
# Alerting rules evaluated against the node_exporter:* recorded series.
# Every alert carries severity=info and exposes the firing value and the
# instance as annotations.
groups:
  - name: node-exporter-alert
    rules:
      # Target has reported up == 0 for one minute.
      - alert: node-exporter-down
        expr: 'node_exporter:up == 0'
        for: 1m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} 宕机了'
          # Double-quoted on purpose: \n must be interpreted as a newline.
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # Total CPU usage above 80% for three minutes.
      - alert: node-exporter-cpu-high
        expr: 'node_exporter:cpu:total:percent > 80'
        for: 3m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # CPU iowait at or above 12% for three minutes.
      - alert: node-exporter-cpu-iowait-high
        expr: 'node_exporter:cpu:iowait:percent >= 12'
        for: 3m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # Memory usage above 80% for three minutes.
      - alert: node-exporter-memory-high
        expr: 'node_exporter:memory:used:percent > 80'
        for: 3m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # Disk space usage above 80% for ten minutes.
      - alert: node-exporter-disk-high
        expr: 'node_exporter:disk:used:percent > 80'
        for: 10m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # Inode usage above 80% for ten minutes.
      - alert: node-exporter-inode-high
        expr: 'node_exporter:filesystem:used:percent > 80'
        for: 10m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} inode 使用率高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'

      # Allocated file descriptors above 80% of the maximum for ten minutes.
      - alert: node-exporter-filefd-allocated-percent-high
        expr: 'node_exporter:filefd_allocated:percent > 80'
        for: 10m
        labels:
          severity: info
        annotations:
          summary: 'instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}'
          description: ''
          value: '{{ $value }}'
          instance: '{{ $labels.instance }}'
# Start the Prometheus container.
#   /data/prometheus -> /etc/prometheus  (prometheus.yml and the rule files)
#   /data/promdata   -> /prometheus      (TSDB storage)
#
# --web.enable-admin-api  enables the admin HTTP API, which includes
#                         operations such as deleting time series.
# --web.enable-lifecycle  enables hot reload / shutdown over HTTP. Reload
#                         must be sent as POST (or PUT), e.g.:
#                           curl -X POST http://localhost:9090/-/reload
#
# NOTE: comments must not be placed after a trailing "\" - anything after
# the backslash breaks the line continuation and the remaining flags are
# silently dropped from the command.
docker run -d -p 9090:9090 \
  -v /data/prometheus:/etc/prometheus/ \
  -v /data/promdata:/prometheus \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus \
  --web.console.libraries=/usr/share/prometheus/console_libraries \
  --web.console.templates=/usr/share/prometheus/consoles \
  --web.enable-admin-api \
  --web.enable-lifecycle
下载node_exporter后放入/usr/local/bin/
# Install node_exporter as a systemd service running as an unprivileged user.

# Use the absolute path so the command works from any working directory.
chmod +x /usr/local/bin/node_exporter

# Dedicated system group/user without a login shell or home directory.
groupadd -r prometheus
useradd -r -g prometheus -s /sbin/nologin -M -c "prometheus Daemons" prometheus

# Write the unit file. The delimiter is quoted so the shell never expands
# anything inside the unit definition.
cat << 'EOF' > /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target

[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the new/changed unit file before using it.
systemctl daemon-reload
systemctl start node_exporter
systemctl status node_exporter
systemctl enable node_exporter
alertmanager
# Start the Alertmanager container with the host's local time settings.
# Each mount is listed exactly once: repeating the same -v target makes
# Docker refuse to start the container with "Duplicate mount point".
docker run -d -p 9093:9093 \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  prom/alertmanager
alertmanager 容器没有映射配置文件,需要直接进入容器修改 /etc/alertmanager/alertmanager.yml,修改后重启容器
报警接收者走的是钉钉的webhook
# cat /etc/alertmanager/alertmanager.yml
# Alertmanager configuration: all alerts are delivered to a single
# webhook receiver (the DingTalk webhook bridge).
global:
  resolve_timeout: 5m

route:
  # Default receiver and grouping/timing for every alert.
  receiver: webhook
  group_by:
    - alertname
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Alerts labelled team=node use a shorter group_wait.
    # NOTE(review): confirm that the alert rules actually set a `team`
    # label; otherwise this sub-route never matches.
    - receiver: webhook
      group_wait: 10s
      match:
        team: node

receivers:
  - name: webhook
    webhook_configs:
      - url: 'http://192.168.18.19:8060/dingtalk/webhook1/send'
        send_resolved: true
prometheus-webhook-dingtalk
启动容器时,通过 --ding.profile 参数传入钉钉机器人提供的 access_token
# Start the DingTalk webhook bridge on port 8060.
# The profile name "webhook1" is the path segment used in the Alertmanager
# webhook URL (/dingtalk/webhook1/send); replace xxx with the robot's token.
docker run -d \
  --restart always \
  -p 8060:8060 \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  timonwong/prometheus-webhook-dingtalk \
  --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxx"