Prometheus+Grafana+Alertmanager监控告警(一)
https://blog.csdn.net/yanggd1987/article/details/108807171 https://help.aliyun.com/document_detail/123394.html https://blog.csdn.net/baidu_36943075/article/details/91829364 https://blog.csdn.net/liukuan73/article/details/78881008 https://blog.csdn.net/aixiaoyang168/article/details/98474494
# Remove any previous container so the name "prometheus" can be reused.
docker rm -f prometheus

# Start Prometheus v2.30.2.
# Host mounts: main config, rules.yml, TSDB data (/data), file_sd targets
# (/host_discovery_data), alert rules (/prometheus_rules) and the host
# timezone file (read-only, the container never needs to modify it).
# --storage.tsdb.retention.time replaces the deprecated
# --storage.tsdb.retention flag; --web.enable-lifecycle enables the HTTP
# reload/quit endpoints.
docker run --name=prometheus -d \
  --restart=always \
  -p 9090:9090 \
  -v /data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /data/prometheus/rules.yml:/etc/prometheus/rules.yml \
  -v /data/prometheus/data:/data \
  -v /data/prometheus/host_discovery_data:/host_discovery_data \
  -v /data/prometheus/prometheus_rules:/prometheus_rules \
  -v /etc/localtime:/etc/localtime:ro \
  prom/prometheus:v2.30.2 \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/data \
  --storage.tsdb.retention.time=30d \
  --web.external-url=http://10.5.250.10 \
  --web.enable-lifecycle
rules.yml 是个目录;告警规则文件放在 prometheus_rules 目录里(即配置中 rule_files 引用的 /prometheus_rules/*.rules)。
# Prometheus server configuration.
#
# All Kubernetes jobs below scrape through the API server proxy: the target
# address is rewritten to the API server (10.3.218.10:16443) and the metrics
# path is rewritten to the matching /api/v1/.../proxy/... URL.
global:
  # Scrape targets every 15 seconds (Prometheus default: 1 minute).
  scrape_interval: 15s
  # Explicitly raised from the 10s default; may not exceed scrape_interval.
  scrape_timeout: 15s
  # Evaluate rules every 15 seconds (Prometheus default: 1 minute).
  evaluation_interval: 15s

rule_files:
  - '/prometheus_rules/*.rules'

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'

  # Hosts listed in the JSON target files under /host_discovery_data.
  - job_name: host_discovery
    file_sd_configs:
      - files:
          - '/host_discovery_data/*.json'
        refresh_interval: 3s

  - job_name: kube-state-metrics
    scheme: https
    tls_config:
      insecure_skip_verify: true
    # Token decoded from the API server authorization secret, stored as a file.
    bearer_token_file: /data/xn-secret
    # Kubernetes service discovery settings.
    kubernetes_sd_configs:
      # Discover at endpoints level.
      - role: endpoints
        api_server: 'https://10.3.218.10:16443'
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /data/xn-secret
    relabel_configs:
      # Keep only targets whose service name matches the regex; drop the rest.
      - source_labels: [__meta_kubernetes_service_name]
        action: keep
        # regex: '^(kube-state-metrics)$'
        regex: '^(prometheus-operator-kube-state-metrics)$'
      # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      #   # Keep only targets whose annotation matches the regex; drop the rest.
      #   action: keep
      #   regex: true
      # Preserve the discovered address as the instance label before
      # __address__ is overwritten below.
      - source_labels: [__address__]
        action: replace
        target_label: instance
      # Replace the default __address__ with the API server address.
      - target_label: __address__
        replacement: '10.3.218.10:16443'
      # Replace the default __metrics_path__ with a hand-built API server
      # proxy URL for the pod.
      - source_labels:
          - __meta_kubernetes_namespace
          - __meta_kubernetes_pod_name
          - __meta_kubernetes_pod_container_port_number
        regex: '([^;]+);([^;]+);([^;]+)'
        target_label: __metrics_path__
        replacement: '/api/v1/namespaces/${1}/pods/http:${2}:${3}/proxy/metrics'
      - action: labelmap
        regex: '__meta_kubernetes_service_label_(.+)'
      # Expose __meta_kubernetes_namespace as kubernetes_namespace.
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      # Expose __meta_kubernetes_service_name as service_name.
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: service_name

  - job_name: kube-node-exporter
    scheme: https
    tls_config:
      insecure_skip_verify: true
    # Token decoded from the API server authorization secret, stored as a file.
    bearer_token_file: /data/xn-secret
    # Kubernetes service discovery settings.
    kubernetes_sd_configs:
      # Discover at endpoints level.
      - role: endpoints
        api_server: 'https://10.3.218.10:16443'
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /data/xn-secret
    relabel_configs:
      # Keep only targets whose service name matches the regex; drop the rest.
      - source_labels: [__meta_kubernetes_service_name]
        action: keep
        regex: '^(prometheus-operator-prometheus-node-exporter)$'
      # Preserve the discovered address as the instance label before
      # __address__ is overwritten below.
      - source_labels: [__address__]
        action: replace
        target_label: instance
      # Replace the default __address__ with the API server address.
      - target_label: __address__
        replacement: '10.3.218.10:16443'
      # Replace the default __metrics_path__ with a hand-built API server
      # proxy URL for port 9100 on the node hosting the endpoint.
      - source_labels: [__meta_kubernetes_endpoint_node_name]
        regex: '(.+)'
        target_label: __metrics_path__
        replacement: '/api/v1/nodes/${1}:9100/proxy/metrics'
      - action: labelmap
        regex: '__meta_kubernetes_service_label_(.+)'
      # Expose __meta_kubernetes_namespace as kubernetes_namespace.
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      # Expose __meta_kubernetes_service_name as service_name.
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: service_name

  # pods
  # - job_name: "kube-pods"
  #   scheme: https
  #   tls_config:
  #     insecure_skip_verify: true
  #   # Token decoded from the API server authorization secret, stored as a file.
  #   bearer_token_file: /data/xn-secret
  #   # Kubernetes service discovery settings.
  #   kubernetes_sd_configs:
  #     # Discover at pod level.
  #     - role: pod
  #       api_server: "https://10.3.218.10:16443"
  #       tls_config:
  #         insecure_skip_verify: true
  #       bearer_token_file: /data/xn-secret
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       action: replace
  #       target_label: instance
  #     - target_label: __address__
  #       # Replace the default __address__ with the replacement value.
  #       replacement: 10.3.218.10:16443
  #     - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_name, __meta_kubernetes_pod_container_port_number]
  #       # Regex match.
  #       regex: ([^;]+);([^;]+);([^;]+)
  #       # Replace the default __metrics_path__ with the replacement value.
  #       target_label: __metrics_path__
  #       # Hand-built API server proxy URL.
  #       replacement: /api/v1/namespaces/${1}/pods/http:${2}:${3}/proxy/metrics
  #     - action: labelmap
  #       regex: __meta_kubernetes_service_label_(.+)
  #     - source_labels: [__meta_kubernetes_namespace]
  #       action: replace
  #       # Expose __meta_kubernetes_namespace as kubernetes_namespace.
  #       target_label: kubernetes_namespace
  #     - source_labels: [__meta_kubernetes_service_name]
  #       action: replace
  #       # Expose __meta_kubernetes_service_name as service_name.
  #       target_label: service_name

  # kubelet
  - job_name: kube-node-kubelet
    scheme: https
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
    kubernetes_sd_configs:
      - role: node
        api_server: 'https://10.3.218.10:16443'
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /data/xn-secret
    relabel_configs:
      # Replace the default __address__ with the API server address.
      - target_label: __address__
        replacement: '10.3.218.10:16443'
      # Replace the default __metrics_path__ with the API server proxy URL
      # for port 10250 on the node.
      - source_labels: [__meta_kubernetes_node_name]
        regex: '(.+)'
        target_label: __metrics_path__
        replacement: '/api/v1/nodes/${1}:10250/proxy/metrics'
      # Node discovery provides __meta_kubernetes_node_label_*; the service
      # and namespace meta labels are not set for role "node", so relabeling
      # from them had no effect and has been dropped.
      - action: labelmap
        regex: '__meta_kubernetes_node_label_(.+)'
      # Expose the node's internal IP as the IP label.
      - source_labels: [__meta_kubernetes_node_address_InternalIP]
        separator: ';'
        regex: '(.*)'
        target_label: IP
        replacement: '$1'
        action: replace

  # cAdvisor
  - job_name: kube-node-cadvisor
    scheme: https
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /data/xn-secret
    kubernetes_sd_configs:
      - role: node
        api_server: 'https://10.3.218.10:16443'
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /data/xn-secret
    relabel_configs:
      # Replace the default __address__ with the API server address.
      - target_label: __address__
        replacement: '10.3.218.10:16443'
      # Replace the default __metrics_path__ with the API server proxy URL
      # for the cAdvisor endpoint on port 10250 of the node.
      - source_labels: [__meta_kubernetes_node_name]
        regex: '(.+)'
        target_label: __metrics_path__
        replacement: '/api/v1/nodes/${1}:10250/proxy/metrics/cadvisor'
      # Node discovery provides __meta_kubernetes_node_label_*; the service
      # and namespace meta labels are not set for role "node", so relabeling
      # from them had no effect and has been dropped.
      - action: labelmap
        regex: '__meta_kubernetes_node_label_(.+)'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['10.5.250.10:9093']
10.3.238.75 xinxi-prod-redis-caiwujihe01 10.3.238.76 xinxi-prod-redis-caiwujihe02 10.5.250.29 xinxi-prod-redis-caiwushoudan01 10.5.250.188 xinxi-prod-redis-caiwushoudan02 10.5.250.8 xinxi-prod-redis-caigou 10.5.250.133 xinxi-prod-redis-tanxiao 10.5.250.133 xinxi-prod-redis-caigoufentan 10.5.250.175 xinxi-prod-redis-fawuhetong 10.3.238.86 xinxi-prod-redis-fawushoudan01 10.3.238.87 xinxi-prod-redis-fawushoudan02 10.3.238.173 xinxi-prod-redis-shucang01 10.3.238.174 xinxi-prod-redis-shucang02 10.3.238.186 xinxi-prod-redis-touzi 10.3.238.55 xinxi-prod-redis-rencai01 10.3.238.56 xinxi-prod-redis-rencai02 10.3.215.182 xinxi-prod-redis-changqijili01 10.3.215.195 xinxi-prod-redis-changqijili02 10.3.238.245 xinxi-prod-redis-zijinjianguan01 10.3.238.246 xinxi-prod-redis-zijinjianguan02 10.3.248.6 xinxi-prod-redis-caigouToC01 10.3.248.7 xinxi-prod-redis-caigouToC02
cat redis.json
[{"targets": ["10.3.238.75:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.75", "alias": "xinxi-prod-redis-caiwujihe01"}}, {"targets": ["10.3.238.76:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.76", "alias": "xinxi-prod-redis-caiwujihe02"}}, {"targets": ["10.5.250.29:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.29", "alias": "xinxi-prod-redis-caiwushoudan01"}}, {"targets": ["10.5.250.188:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.188", "alias": "xinxi-prod-redis-caiwushoudan02"}}, {"targets": ["10.5.250.8:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.8", "alias": "xinxi-prod-redis-caigou"}}, {"targets": ["10.5.250.133:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.133", "alias": "xinxi-prod-redis-tanxiao"}}, {"targets": ["10.5.250.133:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.133", "alias": "xinxi-prod-redis-caigoufentan"}}, {"targets": ["10.5.250.175:9100"], "labels": {"cluster": "redis", "instance": "10.5.250.175", "alias": "xinxi-prod-redis-fawuhetong"}}, {"targets": ["10.3.238.86:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.86", "alias": "xinxi-prod-redis-fawushoudan01"}}, {"targets": ["10.3.238.87:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.87", "alias": "xinxi-prod-redis-fawushoudan02"}}, {"targets": ["10.3.238.173:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.173", "alias": "xinxi-prod-redis-shucang01"}}, {"targets": ["10.3.238.174:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.174", "alias": "xinxi-prod-redis-shucang02"}}, {"targets": ["10.3.238.186:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.186", "alias": "xinxi-prod-redis-touzi"}}, {"targets": ["10.3.238.55:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.55", "alias": "xinxi-prod-redis-rencai01"}}, {"targets": ["10.3.238.56:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.56", "alias": 
"xinxi-prod-redis-rencai02"}}, {"targets": ["10.3.215.182:9100"], "labels": {"cluster": "redis", "instance": "10.3.215.182", "alias": "xinxi-prod-redis-changqijili01"}}, {"targets": ["10.3.215.195:9100"], "labels": {"cluster": "redis", "instance": "10.3.215.195", "alias": "xinxi-prod-redis-changqijili02"}}, {"targets": ["10.3.238.245:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.245", "alias": "xinxi-prod-redis-zijinjianguan01"}}, {"targets": ["10.3.238.246:9100"], "labels": {"cluster": "redis", "instance": "10.3.238.246", "alias": "xinxi-prod-redis-zijinjianguan02"}}, {"targets": ["10.3.248.6:9100"], "labels": {"cluster": "redis", "instance": "10.3.248.6", "alias": "xinxi-prod-redis-caigouToC01"}}, {"targets": ["10.3.248.7:9100"], "labels": {"cluster": "redis", "instance": "10.3.248.7", "alias": "xinxi-prod-redis-caigouToC02"}}]
转换脚本
# -*- coding: utf-8 -*-
"""Convert ``<cluster>.list`` host inventories into JSON target files.

Each ``.list`` file holds one host per line as ``<ip> <alias>`` separated by
whitespace. For every such file a sibling ``<cluster>.json`` is written with
one entry per host: a ``targets`` list containing ``<ip>:9100`` and a
``labels`` mapping with ``cluster``, ``instance`` and ``alias``.
"""
import os
import logging
import json
import time

LIST_SUFFIX = '.list'

# os.replace does not exist on Python 2; os.rename is the closest equivalent.
_replace_file = getattr(os, 'replace', os.rename)


def log_level(level):
    """Configure root logging.

    :param level: ``"DEBUG"`` enables debug output; any other value selects
        INFO.
    """
    debug_enabled = level == "DEBUG"
    logging.basicConfig(
        level=logging.DEBUG if debug_enabled else logging.INFO,
        format='%(asctime)s - %(levelname)s %(message)s',
        datefmt='%Y-%m-%d,%H:%M:%S',
    )
    if debug_enabled:
        # Log the requested level (the original logged the function object).
        logging.info("log_level:%s", level)
        logging.info("Debug mode")


def _load_nodes(list_path, cluster):
    """Parse one ``.list`` file into a list of target dictionaries."""
    nodes = []
    with open(list_path, "r") as list_file:
        for line_number, raw_line in enumerate(list_file, 1):
            fields = raw_line.split()
            if not fields:
                # Blank lines are not hosts.
                continue
            if len(fields) < 2:
                # A bad line must not abort the whole conversion loop.
                logging.warning("%s:%d: expected '<ip> <alias>', got %r",
                                list_path, line_number, raw_line.rstrip("\n"))
                continue
            node_ip, node_alias = fields[0], fields[1]
            nodes.append({
                'targets': [node_ip + ':9100'],
                'labels': {
                    'cluster': cluster,
                    'instance': node_ip,
                    'alias': node_alias,
                },
            })
    return nodes


def _write_json(json_path, payload):
    """Write ``payload`` to ``json_path`` via a temp file and a rename."""
    # The temp name does not end in ".json", so a "*.json" glob never sees a
    # half-written file; the rename then swaps the finished file into place.
    tmp_path = json_path + '.tmp'
    with open(tmp_path, "w") as tmp_file:
        tmp_file.write(payload)
    _replace_file(tmp_path, json_path)


def read_list_file(dirnames='host_discovery_data'):
    """Regenerate ``<cluster>.json`` for every ``<cluster>.list`` file.

    :param dirnames: directory holding the ``.list`` files; the JSON files
        are written next to them. Defaults to ``host_discovery_data``
        relative to the current working directory.
    :raises OSError: if the directory or a file in it cannot be accessed.
    """
    for read_file_name in os.listdir(dirnames):
        if not read_file_name.endswith(LIST_SUFFIX):
            continue
        # Strip only the trailing suffix; str.replace would also remove
        # ".list" occurring in the middle of the name.
        cluster = read_file_name[:-len(LIST_SUFFIX)]
        node_dict_list = _load_nodes(os.path.join(dirnames, read_file_name),
                                     cluster)
        node_json = json.dumps(node_dict_list)
        logging.debug("node_json: %s - type %s", node_json, type(node_json))
        _write_json(os.path.join(dirnames, cluster + '.json'), node_json)


def run():
    """Configure logging, then convert the list files every 10 seconds."""
    level = "INFO"
    log_level(level)
    # Run until the process is stopped.
    while True:
        read_list_file()
        time.sleep(10)


if __name__ == '__main__':
    run()