blackbox_exporter安装及监控
本文主要介绍如何使用 blackbox_exporter 收集被监控主机的网站状态、端口等信息,并借助 Prometheus 最终以仪表盘的形式显示在 Grafana 中。
blackbox_exporter是Prometheus 官方提供的 exporter 之一,可以提供 http、dns、tcp、icmp 的监控数据采集。
2.blackbox_exporter 应用场景
HTTP 测试
定义 Request Header 信息
判断 HTTP Status / HTTP Response Header / HTTP Body 内容
TCP 测试
业务组件端口状态监听
应用层协议定义与监听
ICMP 测试
主机探活机制
POST 测试
接口连通性
SSL 证书过期时间
3. 安装blackbox_exporter
3.1 下载blackbox_exporter(各个版本见 GitHub 的 releases 页面,本文以 v0.15.1 为例):
# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.15.1/blackbox_exporter-0.15.1.linux-amd64.tar.gz
# tar -xvf blackbox_exporter-0.15.1.linux-amd64.tar.gz
# mv blackbox_exporter-0.15.1.linux-amd64 /usr/local/blackbox_exporter
3.3 创建systemd服务
vim /lib/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
# systemctl daemon-reload
# systemctl start blackbox_exporter && systemctl enable blackbox_exporter
3.5 验证是否启动成功(默认监听端口为 9115)
# systemctl status blackbox_exporter
# netstat -lnpt|grep 9115
在prometheus.yml中加入blackbox_exporter
[root@prometheus prometheus]# cat prometheus.yml
# my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration alerting: alertmanagers: - static_configs: - targets: - 172.16.1.12:20016 #告警节点的地址和端口 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: # - "first_rules.yml" # - "second_rules.yml" - "rules/*.yml" #开启配置监控模板,适用于所有主机,需要自定义,注意目录mkdir prometheus/rules # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. scrape_configs: # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. - job_name: 'prometheus' # metrics_path defaults to '/metrics' # scheme defaults to 'http'. static_configs: - targets: ['0.0.0.0:9090'] - job_name: 'MySQL' static_configs: - targets: ['172.16.1.3:9104'] - targets: ['172.16.1.4:9104'] - job_name: 'Redis' static_configs: - targets: ['172.16.1.12:9121'] - targets: ['172.16.1.13:9121'] - job_name: 'pika' static_configs: - targets: ['172.16.1.15:9121'] - job_name: 'elasticsearch' static_configs: - targets: ['172.16.1.12:9114'] - job_name: 'PostgreSQL' static_configs: - targets: ['172.16.1.12:9187'] - targets: ['172.16.1.3:9187'] - job_name: 'Node' static_configs: - targets: ['172.16.1.2:20015','172.16.1.3:20015','172.16.1.4:20015','172.16.1.5:20015','172.16.1.6:20015','172.16.1.7:20015','172.16.1.8:20015','172.16.1.9:20015','172.16.1.10:20015','172.16.1.12:20015'] - job_name: 'Nginx' static_configs: - targets: ['172.16.1.12:9113'] - job_name: "Port_status" metrics_path: /probe params: module: [tcp_connect] file_sd_configs: - files: - "/usr/local/prometheus/file_sd/port.yml" relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: 
__address__ replacement: 172.16.1.12:9115 - job_name: http-status metrics_path: /probe params: module: [http_2xx] static_configs: - targets: - https://mar.abk.com - https://ip.abk.com labels: #自定义标签,附加在target上 group: web relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: 172.16.1.12:9115 - job_name: 'blackbox_check_hosts' metrics_path: /probe params: module: [icmp] static_configs: - targets: - 172.16.1.2 - 172.16.1.3 - 172.16.1.4 - 172.16.1.5 - 172.16.1.6 - 172.16.1.7 - 172.16.1.8 - 172.16.1.9 - 172.16.1.10 labels: group: icmp relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: 172.16.1.12:9115 - job_name: 'pushgateway' honor_labels: true static_configs: - targets: ['172.16.1.12:20018'] labels: instance: pushgateway
[root@prometheus prometheus]# cat rules/blackbox.yml
groups: - name: BlackboxExporter rules: # - alert: Blackbox探测失败 # expr: 'probe_success == 0' # for: 0m # labels: # severity: 严重 # annotations: # summary: Blackbox 探测失败 (instance {{ $labels.instance }}) # description: "探测失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox配置重新加载失败 expr: 'blackbox_exporter_config_last_reload_successful != 1' for: 0m labels: severity: 警告 annotations: summary: Blackbox 配置重新加载失败 (instance {{ $labels.instance }}) description: "Blackbox配置重新加载失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox探测时间慢 expr: 'avg_over_time(probe_duration_seconds[1m]) > 5' for: 1m labels: severity: 警告 annotations: summary: Blackbox 探测时间慢 (instance {{ $labels.instance }}) description: "Blackbox探测花了5秒钟以上的时间完成\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox探测HTTP失败 expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400' for: 0m labels: severity: 严重 annotations: summary: Blackbox 探测HTTP失败 (instance {{ $labels.instance }}) description: "HTTP状态代码不是200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox SSL证书于30天内过期 expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 30' for: 0m labels: severity: 警告 annotations: summary: Blackbox SSL证书于30天内过期 (instance {{ $labels.instance }}) description: "SSL证书将在30天内过期\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox SSL证书于3天内过期 expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' for: 0m labels: severity: 严重 annotations: summary: Blackbox SSL证书于3天内过期 (instance {{ $labels.instance }}) description: "SSL证书将在3天内过期\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: BlackboxSSL证书已过期 expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' for: 0m labels: severity: 严重 annotations: summary: Blackbox SSL证书已过期 (instance {{ $labels.instance }}) description: "SSL证书已经过期\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" - alert: Blackbox探测慢速HTTP expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 5' for: 1m labels: severity: 警告 annotations: summary: Blackbox 探测慢速HTTP (instance {{ $labels.instance }}) description: "HTTP请求花费了超过5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Blackbox探测慢速ping expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 5' for: 1m labels: severity: 警告 annotations: summary: Blackbox 探测慢速ping (instance {{ $labels.instance }}) description: "Blackbox ping耗时超过5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
[root@prometheus prometheus]# cat prometheus/file_sd/port.yml #- targets: # - 172.16.1.2:20015 # - 172.16.1.3:20015 # - 172.16.1.4:20015 # - 172.16.1.5:20015 # - 172.16.1.6:20015 # - 172.16.1.7:20015 # - 172.16.1.8:20015 # - 172.16.1.9:20015 # - 172.16.1.10:20015 # labels: ## group: 自定义 ## tag: 自定义 # group: 生产平台 # tag: node_exporter - targets: - 172.16.1.2:22 - 172.16.1.3:22 - 172.16.1.4:22 - 172.16.1.5:22 - 172.16.1.6:22 - 172.16.1.7:22 - 172.16.1.8:22 - 172.16.1.9:22 - 172.16.1.10:22 labels: group: 生产平台 tag: sshd - targets: - 172.16.1.3:3306 labels: group: 生产平台 tag: mysql_master - targets: - 172.16.1.4:3306 labels: group: 生产平台 tag: mysql_slave - targets: - 172.16.1.8:6379 labels: group: 生产平台 tag: redis - targets: - 172.16.1.2:15432 labels: group: 生产平台 tag: pgsql - targets: - 172.16.1.3:9201 labels: group: 生产平台 tag: es_1 - targets: - 172.16.1.4:9201 labels: group: 生产平台 tag: es_2 - targets: - 172.16.1.5:9201 labels: group: 生产平台 tag: es_3 - targets: - 172.16.1.2:8888 labels: group: 生产平台 tag: oss - targets: - 172.17.1.12:443 labels: group: 生产平台 tag: nginx
访问blackbox
访问prometheus
访问grafana
导入模版编号16292