me小怪兽

导航

blackbox_exporter安装及监控

本文主要介绍如何使用blackbox_exporter收集被监控主机的网站状态、端口等信息,并借助 Prometheus 最终以仪表盘的形式显示在 Grafana 中。
blackbox_exporter是Prometheus 官方提供的 exporter 之一,可以提供 http、dns、tcp、icmp 的监控数据采集。
2.blackbox_exporter 应用场景
HTTP 测试
定义 Request Header 信息
判断 HTTP Status / HTTP Response Header / HTTP Body 内容
TCP 测试
业务组件端口状态监听
应用层协议定义与监听
ICMP 测试
主机探活机制
POST 测试
接口联通性
SSL 证书过期时间
3. 安装blackbox_exporter
3.1 各个版本的blackbox_exporter如下:
# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.15.1/blackbox_exporter-0.15.1.linux-amd64.tar.gz
# tar -xvf blackbox_exporter-0.15.1.linux-amd64.tar.gz
# mv blackbox_exporter-0.15.1.linux-amd64 /usr/local/blackbox_exporter
3.3 创建systemd服务
vim /lib/systemd/system/blackbox_exporter.service

# systemd unit that runs blackbox_exporter as a managed service.
[Unit]
Description=blackbox_exporter
# Order this unit after network.target.
After=network.target
[Service]
# Start the binary with the blackbox.yml that lives in the same install directory.
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
# Restart the process automatically when it exits with a failure.
Restart=on-failure
[Install]
# `systemctl enable` hooks the unit into multi-user.target.
WantedBy=multi-user.target

# systemctl daemon-reload
# systemctl start blackbox_exporter  && systemctl enable blackbox_exporter

3.5 验证是否启动成功(默认监听端口为9115)
# systemctl status blackbox_exporter
# netstat -lnpt|grep 9115


在prometheus.yml中加入blackbox_exporter

[root@prometheus prometheus]# cat prometheus.yml

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 172.16.1.12:20016 # address and port of the alerting node

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "rules/*.yml"  # enables the monitoring rule templates; they apply to all hosts and must be customized; remember to create the directory (mkdir prometheus/rules)

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['0.0.0.0:9090']
  # Scrape targets for the MySQL job. This entry must be indented exactly like
  # its sibling jobs; at column 0 it falls out of scrape_configs and the file
  # no longer parses.
  - job_name: 'MySQL'
    static_configs:
      - targets: ['172.16.1.3:9104']
      - targets: ['172.16.1.4:9104']
  - job_name: 'Redis'
    static_configs:
      - targets: ['172.16.1.12:9121']
      - targets: ['172.16.1.13:9121']
  - job_name: 'pika'
    static_configs:
      - targets: ['172.16.1.15:9121']
  - job_name: 'elasticsearch'
    static_configs:
      - targets: ['172.16.1.12:9114']
  - job_name: 'PostgreSQL'
    static_configs:
      - targets: ['172.16.1.12:9187']
      - targets: ['172.16.1.3:9187']
  - job_name: 'Node'
    static_configs:
      - targets: ['172.16.1.2:20015','172.16.1.3:20015','172.16.1.4:20015','172.16.1.5:20015','172.16.1.6:20015','172.16.1.7:20015','172.16.1.8:20015','172.16.1.9:20015','172.16.1.10:20015','172.16.1.12:20015']
  - job_name: 'Nginx'
    static_configs:
      - targets: ['172.16.1.12:9113']

  # TCP port checks: each host:port from the file_sd list is probed with the
  # tcp_connect module via the /probe endpoint.
  - job_name: "Port_status"
    metrics_path: /probe
    params:
      module: [tcp_connect]
    file_sd_configs:
    - files:
      - "/usr/local/prometheus/file_sd/port.yml"
    relabel_configs:
      # Hand the listed host:port to the exporter as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed host:port as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to this address
      # (port 9115 — presumably the blackbox_exporter installed above; confirm).
      - target_label: __address__
        replacement: 172.16.1.12:9115


  # HTTP checks: each URL below is probed with the http_2xx module via /probe.
  - job_name: http-status
    metrics_path: /probe
    params: 
      module: [http_2xx]
    static_configs:
    - targets:
      - https://mar.abk.com
      - https://ip.abk.com
      labels:   # custom labels attached to the targets
        group: web
    relabel_configs:
      # Hand the listed URL to the exporter as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to this address
      # (port 9115 — presumably the blackbox_exporter installed above; confirm).
      - target_label: __address__
        replacement: 172.16.1.12:9115


  # Host liveness checks: each address below is probed with the icmp module
  # via /probe.
  - job_name: 'blackbox_check_hosts'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
    - targets:
      - 172.16.1.2
      - 172.16.1.3
      - 172.16.1.4
      - 172.16.1.5
      - 172.16.1.6
      - 172.16.1.7
      - 172.16.1.8
      - 172.16.1.9
      - 172.16.1.10
      labels:
        group: icmp
    relabel_configs:
      # Hand the listed address to the exporter as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed address as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to this address
      # (port 9115 — presumably the blackbox_exporter installed above; confirm).
      - target_label: __address__
        replacement: 172.16.1.12:9115

  - job_name: 'pushgateway'
    honor_labels: true
    static_configs:
      - targets: ['172.16.1.12:20018']
        labels:
          instance: pushgateway

[root@prometheus prometheus]# cat rules/blackbox.yml

groups:

- name: BlackboxExporter

  rules:

#    - alert: Blackbox探测失败
#      expr: 'probe_success == 0'
#      for: 0m
#      labels:
#        severity: 严重
#      annotations:
#        summary: Blackbox 探测失败 (instance {{ $labels.instance }})
#        description: "探测失败\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox配置重新加载失败
      expr: 'blackbox_exporter_config_last_reload_successful != 1'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: Blackbox 配置重新加载失败 (instance {{ $labels.instance }})
        description: "Blackbox配置重新加载失败\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox探测时间慢
      expr: 'avg_over_time(probe_duration_seconds[1m]) > 5'
      for: 1m
      labels:
        severity: 警告
      annotations:
        summary: Blackbox 探测时间慢 (instance {{ $labels.instance }})
        description: "Blackbox探测花了5秒钟以上的时间完成\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox探测HTTP失败
      expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
      for: 0m
      labels:
        severity: 严重
      annotations:
        summary: Blackbox 探测HTTP失败 (instance {{ $labels.instance }})
        description: "HTTP状态代码不是200-399\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox SSL证书于30天内过期
      expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 30'
      for: 0m
      labels:
        severity: 警告
      annotations:
        summary: Blackbox SSL证书于30天内过期 (instance {{ $labels.instance }})
        description: "SSL证书将在30天内过期\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox SSL证书于3天内过期
      expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
      for: 0m
      labels:
        severity: 严重
      annotations:
        summary: Blackbox SSL证书于3天内过期 (instance {{ $labels.instance }})
        description: "SSL证书将在3天内过期\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: BlackboxSSL证书已过期
      expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
      for: 0m
      labels:
        severity: 严重
      annotations:
        summary: Blackbox SSL证书已过期 (instance {{ $labels.instance }})
        description: "SSL证书已经过期\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox探测慢速HTTP
      expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 5'
      for: 1m
      labels:
        severity: 警告
      annotations:
        summary: Blackbox 探测慢速HTTP (instance {{ $labels.instance }})
        description: "HTTP请求花费了超过5s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: Blackbox探测慢速ping
      expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 5'
      for: 1m
      labels:
        severity: 警告
      annotations:
        summary: Blackbox 探测慢速ping (instance {{ $labels.instance }})
        description: "Blackbox ping耗时超过5s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

[root@prometheus prometheus]#  cat prometheus/file_sd/port.yml 
#- targets:
#  - 172.16.1.2:20015
#  - 172.16.1.3:20015
#  - 172.16.1.4:20015
#  - 172.16.1.5:20015
#  - 172.16.1.6:20015
#  - 172.16.1.7:20015
#  - 172.16.1.8:20015
#  - 172.16.1.9:20015
#  - 172.16.1.10:20015
#  labels:
##    group: 自定义
##    tag: 自定义
#    group: 生产平台
#    tag: node_exporter
- targets:
  - 172.16.1.2:22
  - 172.16.1.3:22
  - 172.16.1.4:22
  - 172.16.1.5:22
  - 172.16.1.6:22
  - 172.16.1.7:22
  - 172.16.1.8:22
  - 172.16.1.9:22
  - 172.16.1.10:22
  labels:
    group: 生产平台 
    tag: sshd
- targets:
  - 172.16.1.3:3306
  labels:
    group: 生产平台
    tag: mysql_master
- targets:
  - 172.16.1.4:3306
  labels:
    group: 生产平台
    tag: mysql_slave
- targets:
  - 172.16.1.8:6379
  labels:
    group: 生产平台
    tag: redis
- targets:
  - 172.16.1.2:15432
  labels:
    group: 生产平台
    tag: pgsql
- targets:
  - 172.16.1.3:9201
  labels:
    group: 生产平台
    tag: es_1
- targets:
  - 172.16.1.4:9201
  labels:
    group: 生产平台
    tag: es_2
- targets:
  - 172.16.1.5:9201
  labels:
    group: 生产平台
    tag: es_3
- targets:
  - 172.16.1.2:8888
  labels:
    group: 生产平台
    tag: oss
# NOTE(review): every other address in this list is in 172.16.1.x — confirm
# that 172.17.1.12 is intended and not a typo for 172.16.1.12.
- targets:
  - 172.17.1.12:443
  labels:
    group: 生产平台
    tag: nginx

访问blackbox

 

访问prometheus

 访问grafana

导入模版编号16292

 

posted on 2022-08-31 23:54  me小怪兽  阅读(1717)  评论(0)  编辑  收藏  举报