25 Prometheus和alertmanager高可用--Thanos

一、prometheus高可用第一种方式

1.准备3台centos服务器
2.设置计算机名
3.安装docker和docker-compose

安装prometheus

# Run on both Prometheus servers: fetch the docker-prometheus compose project.
# -p keeps this step re-runnable when /data already exists.
mkdir -p /data/

cd /data/

git clone https://gitee.com/linge365/docker-prometheus.git

cd docker-prometheus

root@os:/data/docker-prometheus# cat docker-compose.yaml
version: '3.3'

# Named volumes that keep Prometheus and Grafana state across container re-creation.
volumes:
  prometheus_data: {}
  grafana_data: {}

# One bridge network shared by all services below.
networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:v2.37.6
    container_name: prometheus
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Hot-reload of the configuration (POST /-/reload)
      - '--web.enable-lifecycle'
      # Admin API; left disabled here, uncomment to enable
      # - '--web.enable-admin-api'
      # Maximum retention of historical data (the default is 15 days)
      - '--storage.tsdb.retention.time=30d'
    networks:
      - monitoring
    expose:
      - '9090'
    ports:
      - '9090:9090'
    # Explicit start order; replaces the legacy `links` entries, which are
    # redundant because every service already shares the monitoring network.
    depends_on:
      - alertmanager
      - cadvisor
      - node_exporter

  alertmanager:
    image: prom/alertmanager:v0.25.0
    container_name: alertmanager
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./alertmanager/:/etc/alertmanager/
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    expose:
      - '9093'
    ports:
      - '9093:9093'

  cadvisor:
    # NOTE(review): the google/cadvisor Docker Hub image appears to be no longer
    # updated; consider gcr.io/cadvisor/cadvisor with a pinned tag — confirm the
    # registry is reachable from these hosts before switching.
    image: google/cadvisor:latest
    container_name: cadvisor
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring
    expose:
      - '8080'
    ports:
      - '8080:8080'

  node_exporter:
    image: prom/node-exporter:v1.5.0
    container_name: node-exporter
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # mount-points-exclude is the current name of the deprecated
      # ignored-mount-points flag; `$$` is Compose's escape for a literal `$`.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker)($$|/)'
    networks:
      - monitoring
    ports:
      - '9100:9100'

  grafana:
    image: grafana/grafana:9.4.3
    container_name: grafana
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning/:/etc/grafana/provisioning/
    env_file:
      - ./grafana/config.monitoring
    networks:
      - monitoring
    ports:
      - '3000:3000'
    depends_on:
      - prometheus

docker-compose.yml

docker-compose up -d

# 配置邮件告警
vim alertmanager/config.yml
smtp_auth_password: 'your-password'

# 修改prometheus.yml文件
cat >prometheus/prometheus.yml<<"EOF"
# Global settings
global:
  # Scrape targets every 15 seconds (the default is once per minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is once per minute).
  evaluation_interval: 15s

# Alertmanager endpoints
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'

# Alerting rule files
rule_files:
  - 'alert.yml'

# Scrape jobs
scrape_configs:
  - job_name: 'prometheus'
    # Per-job override of the global value: scrape this job every 15 seconds.
    scrape_interval: 15s
    static_configs:
    - targets:
      - '192.168.10.14:9090'
      - '192.168.10.129:9090'
  - job_name: 'alertmanager'
    scrape_interval: 15s
    static_configs:
    - targets:
      - '192.168.10.14:9093'
      - '192.168.10.129:9093'
  - job_name: 'cadvisor'
    scrape_interval: 15s
    static_configs:
    - targets:
      - '192.168.10.14:8080'
      labels:
        instance: Prometheus01
    - targets:
      - '192.168.10.129:8080'
      labels:
        instance: Prometheus02
  - job_name: 'node-exporter'
    scrape_interval: 15s
    static_configs:
    - targets:
      - '192.168.10.14:9100'
      labels:
        instance: Prometheus01
    - targets:
      - '192.168.10.129:9100'
      labels:
        instance: Prometheus02
EOF

# 监控192.168.10.100客户端
安装node_exporter
# Create a standalone compose project that runs only node_exporter on the client host.
mkdir -p /data/node_exporter

cd /data/node_exporter

cat > docker-compose.yaml <<"EOF"
version: '3.3'
services:
  node_exporter:
    image: prom/node-exporter:v1.5.0
    container_name: node-exporter
    restart: always
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # mount-points-exclude is the current name of the deprecated
      # ignored-mount-points flag; `$$` is Compose's escape for a literal `$`.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker)($$|/)'
    ports:
      - '9100:9100'
EOF

docker-compose up -d

# Run on both Prometheus servers.
# The heredoc is appended verbatim to the end of prometheus.yml, so the new
# target becomes one more static_configs entry of whichever job is defined
# last in that file; its indentation must match that job's existing entries.
cat >> /data/docker-prometheus/prometheus/prometheus.yml<<"EOF"
    - targets: ['192.168.10.100:9100']
      labels:
        instance: test服务器 
EOF

# Ask the local Prometheus to reload its configuration.
curl -X POST http://localhost:9090/-/reload

二、Prometheus高可用第二种方式

Prometheus在小规模的部署中，完全不需要依赖其他组件就可以达到监控的目的。但在应对大规模、大数据量的集群时，就存在缺少集群、数据可靠性保障的支持。Prometheus虽然支持联邦部署模式，但这个架构还是会有其他问题，如数据会被重复储存在两个地方，还有被拉取的Prometheus机器有可能发生超时现象。另外，监控数据分散在多台Prometheus监控节点，在变动或者长期存储时依赖本地磁盘，本地磁盘一般不是高可用的存储，监控数据就变得没有可靠性保障。

为了解决上述问题，产生了多种解决方案，Thanos方案是最成熟、广泛使用的

2.1 Thanos概述

Thanos 是一个基于 Prometheus 实现的监控方案，其主要设计目的是解决原生 Prometheus 上的痛点，并且做进一步的提升，主要的特性有：全局查询，高可用，动态拓展，长期存储。

架构

Thanos 主要由如下几个特定功能的组件组成：

边车组件（Sidecar）：连接到 Prometheus，并把 Prometheus 暴露给查询网关（Querier/Query），以供实时查询，并且可以上传 Prometheus 数据到云存储，以供长期保存（和Receiver二选一）
查询网关（Querier）：实现 Prometheus API 以聚合来自底层组件（如边车组件 Sidecar，或是存储网关 Store Gateway）的数据
存储网关（Store Gateway）：将云存储中的数据内容暴露出来（可选）
压缩器（Compactor）：对云存储中的数据进行压缩、下采样，并执行数据保留策略（可选）
接收器（Receiver）：从 Prometheus 的远程写入 WAL 接收数据，将其暴露出去或者上传到云存储（和Sidecar二选一）
规则组件（Ruler）：根据 Thanos 中的数据评估记录和警报规则（可选）
查询前端：实现 Prometheus 的 API，将其代理给 Query，同时缓存响应（可选）

Prometheus开启api

因为Thanos Sidecar需要调用Prometheus API 来读取数据以及外部标签等元信息，所以prometheus需要开启api功能，配置如下

修改docker-compose.yaml配置文件，取消prometheus服务command中下面这一行的注释：

--web.enable-admin-api

应用配置：docker-compose up -d

2.2 二进制安装Thanos

github地址

# Download and unpack the Thanos v0.31.0 release.
wget https://github.com/thanos-io/thanos/releases/download/v0.31.0/thanos-0.31.0.linux-amd64.tar.gz

tar xf thanos-0.31.0.linux-amd64.tar.gz

# The systemd units in this guide start /opt/prometheus/thanos/bin/thanos,
# so the binary has to live under bin/.
# NOTE(review): assumes the archive unpacks to thanos-0.31.0.linux-amd64/thanos — confirm.
mkdir -p /opt/prometheus/thanos/bin

mv thanos-0.31.0.linux-amd64/thanos /opt/prometheus/thanos/bin/

# user:group is the supported separator; the dotted form is deprecated.
chown -R prometheus:prometheus /opt/prometheus/thanos

2.2.1 安装 Thanos Sidecar

cat >/etc/systemd/system/thanos-sidecar.service<<"EOF"
# Thanos Sidecar: runs next to the local Prometheus and serves its data
# over gRPC (19090) and HTTP (19191).
[Unit]
Description=Thanos SideCar
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Restart=on-failure
# Every continued line must end with the backslash itself; trailing spaces
# after it stop systemd from joining the following lines.
ExecStart=/opt/prometheus/thanos/bin/thanos sidecar \
    --http-address=0.0.0.0:19191 \
    --grpc-address=0.0.0.0:19090 \
    --tsdb.path=/opt/prometheus/prometheus/data \
    --prometheus.url=http://localhost:9090

[Install]
WantedBy=multi-user.target
EOF

这里如果需要上传到对象存储还需要加上 --objstore.config-file /etc/thanos/bucket_config.yaml，指定对象存储相关的参数。

启动并检查

# Reload unit files first so systemd picks up the newly written service,
# then enable it at boot, start it and show its state.
systemctl daemon-reload && systemctl enable thanos-sidecar

systemctl start thanos-sidecar && systemctl status thanos-sidecar

2.2.2 安装 Thanos Query

cat >/etc/systemd/system/thanos-query.service<<"EOF"
# Thanos Query: fans queries out to the sidecars listed with --store.
[Unit]
Description=Thanos Query
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Restart=on-failure
# --http-address is the endpoint of the Thanos Querier UI.
# Comments must stay off the ExecStart lines: systemd has no inline comments,
# and any text after a trailing backslash breaks the line continuation.
ExecStart=/opt/prometheus/thanos/bin/thanos query \
    --log.level=debug \
    --http-address=0.0.0.0:19192 \
    --grpc-address=0.0.0.0:19092 \
    --query.replica-label=prometheus_replica \
    --store=192.168.11.60:19090 \
    --store=192.168.11.61:19090

[Install]
WantedBy=multi-user.target
EOF

# Reload unit files first so systemd picks up the newly written service,
# then enable it at boot, start it and show its state.
systemctl daemon-reload && systemctl enable thanos-query
systemctl start thanos-query && systemctl status thanos-query

2.3 docker 安装Thanos（Sidecar和Querier）

修改prometheus.yml文件

# Add the following to prometheus.yml:
global:
  # Labels attached to time series and alerts when talking to external systems.
  external_labels:
    cluster: 'Thanos'
    # Must be different on each Prometheus server. thanos query is started with
    # --query.replica-label=prometheus_replica and uses this label to
    # deduplicate the two replicas.
    prometheus_replica: 'prometheus01'  # use 'prometheus02' on the second server

# Merge into the existing alerting: section. Dropping the replica label keeps
# alerts from both servers identical, so Alertmanager can still deduplicate them.
alerting:
  alert_relabel_configs:
    - action: labeldrop
      regex: prometheus_replica

curl -X POST http://localhost:9090/-/reload


# Install Thanos Sidecar and Thanos Querier
cd /data/docker-prometheus/
# Keep a backup of the compose file before appending to it.
cp docker-compose.yaml{,.bak1}

# Append two services to the end of the file. This relies on `services:` being
# the last top-level section of docker-compose.yaml.
cat >>docker-compose.yaml<<"EOF"
  thanos-sidecar:
    image: thanosio/thanos:v0.31.0
    container_name: thanos-sidecar
    restart: always
    volumes:
      # Mount the Prometheus data volume so --tsdb.path points at the real
      # TSDB directory instead of a relative path inside this container.
      - prometheus_data:/prometheus
    command:
      - sidecar
      - '--http-address=0.0.0.0:19191'
      - '--grpc-address=0.0.0.0:19090'
      - '--tsdb.path=/prometheus'
      # Use this line on prometheus01 (192.168.10.14):
      - '--prometheus.url=http://192.168.10.14:9090'
      # Use this line instead on prometheus02 (192.168.10.129):
      # - '--prometheus.url=http://192.168.10.129:9090'
    ports:
      - '19090:19090'
      - '19191:19191'
  thanos-query:
    image: thanosio/thanos:v0.31.0
    container_name: thanos-query
    restart: always
    command:
      - query
      - '--http-address=0.0.0.0:19192'
      - '--grpc-address=0.0.0.0:19092'
      - '--query.replica-label=prometheus_replica'
      # gRPC endpoints of the sidecars on both Prometheus servers.
      - '--store=192.168.10.14:19090'
      - '--store=192.168.10.129:19090'
    ports:
      - '19192:19192'
      - '19092:19092'
EOF

启动：docker-compose up -d

检查：

http://192.168.10.14:19192/stores

http://192.168.10.129:19192/stores

2.4 Grafana修改数据源

http://192.168.10.14:3000/

http://192.168.10.129:3000/

设置--DataSource---Prometheus--URL填写：http://192.168.10.14:19192(prometheus02的URL填写对应IP)

添加dashboard“1860”，显示图形化界面

至此prometheus高可用完成

三、Thanos远程存储

默认prometheus的数据存储受--storage.tsdb.retention.time=30d控制，我们可以使用Thanos把prometheus的数据永久保存。

Thanos官方文档

要在生产环境使用最好使用 Stable 状态的，比如 S3 或者兼容 S3 的服务，比如 Ceph、Minio 等等。

支持的存储类型

Provider	Maturity	Aimed For	Auto-tested on CI	Maintainers
Google Cloud Storage	Stable	Production Usage	yes	@bwplotka
AWS/S3 (and all S3-compatible storages e.g disk-based Minio )	Stable	Production Usage	yes	@bwplotka
Azure Storage Account	Stable	Production Usage	no	@vglafirov
OpenStack Swift	Beta (working PoC)	Production Usage	yes	@FUSAKLA
Tencent COS	Beta	Production Usage	no	@jojohappy,@hanjm
AliYun OSS	Beta	Production Usage	no	@shaulboozhiao,@wujinhu
Local Filesystem	Stable	Testing and Demo only	yes	@bwplotka
Oracle Cloud Infrastructure Object Storage	Beta	Production Usage	yes	@aarontams,@gaurav-05,@ericrrath

对于国内用户当然最方便的还是直接使用阿里云 OSS 或者腾讯云 COS 这样的服务，但是很多时候可能我们的服务并不是跑在公有云上面的，所以这里我们用 Minio 来部署一个兼容 S3 协议的对象存储服务。

thanos.yaml配置

minio配置

# Thanos object-storage config for MinIO, using the s3 provider type.
# The empty strings and the "ip" host are placeholders to fill in before use.
type: s3
config:
  bucket: ""
  endpoint: ip:9000
  access_key: ""
  secret_key: ""
  # NOTE(review): presumably selects plain HTTP for the endpoint — confirm in the Thanos storage docs.
  insecure: true

阿里云oss配置

# Thanos object-storage config for Aliyun OSS.
# All values are empty placeholders to fill in before use.
type: ALIYUNOSS
config:
  endpoint: ""
  bucket: ""
  access_key_id: ""
  access_key_secret: ""

thanos-sidecar配置远程存储

Thanos远程存储是通过配置Sidecar读取Prometheus收集的数据，并写入到远程的对象存储中。所以在Sidecar中增加如下配置：

--objstore.config-file=/etc/thanos/thanos.yaml

注：配置远程存储后，需通过thanos Store Gateway来查看远程存储数据

四、alertmanager 高可用

测试：

停止node-exporter

手动测试alertmanager接口

2台prometheus执行：

# Post a test alert straight to the local Alertmanager.
# The alerts endpoint takes a JSON array of alerts, not the {"alerts": [...]}
# object used in webhook notifications. startsAt/endsAt are omitted on purpose:
# a fixed endsAt in the past would mark the alert as already resolved.
curl -X POST -H "Content-Type: application/json" -d '[
    {
        "labels": {
            "severity": "critical",
            "alertname": "HighErrorRate",
            "instance": "server1"
        },
        "annotations": {
            "summary": "High error rate detected!",
            "description": "This is a description of the alert."
        }
    }
]' http://localhost:9093/api/v2/alerts

检查：

http://192.168.10.14:9093/#/alerts

http://192.168.10.129:9093/#/alerts

结果：

同一条告警信息，发送了2次。

由于Alertmanager之间并不了解彼此的存在，因此会出现告警通知被不同的Alertmanager重复发送多次的问题。

为了解决这一问题，Alertmanager在集群配置中引入了Gossip机制，Gossip机制为多个Alertmanager之间提供了信息传递的机制，确保即使在多个Alertmanager分别接收到相同告警信息的情况下，也只有一个告警通知被发送给Receiver。

4.1 alertmanager高可用配置（集群）

prometheus01服务器配置docker-compose.yaml：

cd /data/docker-prometheus

vim docker-compose.yaml
# Add the following to the alertmanager service (keep its existing command and ports entries)
  alertmanager:
    command:
      - '--log.level=debug'
      - '--cluster.listen-address=0.0.0.0:8001'
      # Advertise the host address; inside a bridge network the container would
      # otherwise gossip its internal IP, which the other server cannot reach.
      - '--cluster.advertise-address=192.168.10.14:8001'
    ports:
      - '8001:8001'
      # Cluster traffic uses UDP as well as TCP.
      - '8001:8001/udp'

注：
如果是二进制安装的alertmanager，只需要增加一行配置:--cluster.listen-address=0.0.0.0:8001


docker-compose up -d

prometheus02服务器配置docker-compose.yaml：

cd /data/docker-prometheus

vim docker-compose.yaml
# 192.168.10.14:8001 is the cluster address of prometheus01; adjust to your environment.
# Add the following to the alertmanager service (keep its existing command and ports entries)
  alertmanager:
    command:
      - '--log.level=debug'
      - '--cluster.listen-address=0.0.0.0:8002'
      # Advertise this host's address; inside a bridge network the container
      # would otherwise gossip its internal IP, which the peer cannot reach.
      - '--cluster.advertise-address=192.168.10.129:8002'
      - '--cluster.peer=192.168.10.14:8001'
    ports:
      - '8002:8002'
      # Cluster traffic uses UDP as well as TCP.
      - '8002:8002/udp'

# 注意：如果是二进制安装的alertmanager，只需要增加2行配置:--cluster.listen-address=0.0.0.0:8001和--cluster.peer=192.168.10.14:8001

docker-compose up -d

检查：

http://192.168.10.14:9093/#/alerts

http://192.168.10.129:9093/#/alerts

再次测试告警，可以看到只发送了一次

五、Prometheus和alertmanager集群整合

由于实现了Gossip机制，在Prometheus和Alertmanager实例之间不要使用任何负载均衡，需要确保Prometheus将告警发送到所有的Alertmanager实例中

2台prometheus修改配置如下，2台都要操作。

# Alertmanager endpoints: both instances are listed so each Prometheus
# sends its alerts to every Alertmanager.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - '192.168.10.14:9093'
      - '192.168.10.129:9093'

重载配置：

curl -X POST http://localhost:9090/-/reload

测试：

docker stop node-exporter

5.1 停机测试

把prometheus01服务器关闭

打开thanos-query检查：

http://192.168.10.129:19192/stores

总结

通过安装Thanos Sidecar和Thanos Querier完成了prometheus的高可用。

通过alertmanager的集群配置（自带Gossip机制）来解决告警重复发送问题。

每次修改prometheus配置或添加grafana的Dashboard，都需要在2台机器上分别操作。可以使用consul、dns等自动发现功能来减少重复配置带来的麻烦。

posted on 2024-05-06 18:07 杨梅冲阅读(491) 评论(0) 编辑收藏举报