【监控】prometheus监控安装
部署
# Download and unpack the Prometheus v2.28.0 linux-amd64 build from the official releases.
wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz
tar xf prometheus-2.28.0.linux-amd64.tar.gz
# Install under /usr/local, keeping the version in the directory name.
mv prometheus-2.28.0.linux-amd64 /usr/local/prometheus-2.28.0
# Edit the main configuration file (contents shown below).
vim /usr/local/prometheus-2.28.0/prometheus.yml
# /usr/local/prometheus-2.28.0/prometheus.yml
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']
vim /usr/lib/systemd/system/prometheus.service
# /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus Services
After=network.target remote-fs.target

[Service]
Type=simple
# TSDB data is stored directly under the install dir; point --storage.tsdb.path
# at a dedicated data directory if you want to keep data separate from binaries.
ExecStart=/usr/local/prometheus-2.28.0/prometheus \
  --config.file=/usr/local/prometheus-2.28.0/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus-2.28.0/
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
# Reload systemd so the newly created unit file is picked up, then start the service.
# (Without daemon-reload, systemd may not see the new prometheus.service unit.)
systemctl daemon-reload
systemctl restart prometheus.service
监控
# Download and unpack node_exporter v1.1.2 on each host to be monitored.
wget https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
tar xf node_exporter-1.1.2.linux-amd64.tar.gz
mv node_exporter-1.1.2.linux-amd64 /usr/local/node_exporter
# Create a systemd unit so node_exporter is restarted automatically on failure.
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=Prometheus Node Exporter Services
After=network.target remote-fs.target
[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
# Pick up the new unit file and start the exporter.
systemctl daemon-reload
systemctl start node_exporter
telegram报警
# Fetch the Alertmanager -> Telegram webhook bridge and its Python dependencies.
git clone https://github.com/nopp/alertmanager-webhook-telegram-python.git
# FIX: there is no yum package named "pip3"; pip3 is provided by python3-pip.
yum install -y python3 python3-pip
cd alertmanager-webhook-telegram-python/
pip3 install -r requirements.txt
pip3 install python-dateutil
vim flaskAlert.py
"""Flask webhook that forwards Alertmanager notifications to a Telegram chat.

POST /alert (protected by HTTP basic auth) receives the Alertmanager webhook
JSON payload and sends a Telegram message per alert.
"""
import telegram, json, logging
# FIX: sleep() is used by the retry handlers below but was never imported.
from time import sleep
from dateutil import parser
from flask import Flask
from flask import request
from flask_basicauth import BasicAuth
# FIX: these exception types were caught below without being imported,
# which raised NameError instead of handling the Telegram error.
from telegram.error import NetworkError, RetryAfter, TimedOut

app = Flask(__name__)
app.secret_key = 'lAlAlA123'
basic_auth = BasicAuth(app)

# Yes need to have -, change it!
chatID = ""  # change it!

# Authentication conf, change it!
app.config['BASIC_AUTH_FORCE'] = True
app.config['BASIC_AUTH_USERNAME'] = ''  # change it!
app.config['BASIC_AUTH_PASSWORD'] = ''  # change it!

# Bot token, change it!
bot = telegram.Bot(token="")  # change it!


@app.route('/alert', methods=['POST'])
def postAlertmanager():
    """Translate the Alertmanager webhook payload into Telegram messages.

    Returns ("Alert OK", 200) on success; ("Alert fail", 200) on any
    unexpected error (the error text is also sent to the chat).
    """
    try:
        content = json.loads(request.get_data())
        for alert in content['alerts']:
            message = "Status: "+alert['status']+"\n"
            if 'name' in alert['labels']:
                message += "Instance: "+alert['labels']['instance']+"("+alert['labels']['name']+")\n"
            else:
                message += "Instance: "+alert['labels']['instance']+"\n"
            if 'info' in alert['annotations']:
                message += "Info: "+alert['annotations']['info']+"\n"
            if 'summary' in alert['annotations']:
                message += "Summary: "+alert['annotations']['summary']+"\n"
            if 'description' in alert['annotations']:
                message += "Description: "+alert['annotations']['description']+"\n"
            # Timestamps arrive as RFC3339 strings; render them in local format.
            if alert['status'] == "resolved":
                correctDate = parser.parse(alert['endsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Resolved: "+correctDate
            elif alert['status'] == "firing":
                correctDate = parser.parse(alert['startsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Started: "+correctDate
            bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except RetryAfter:
        # Telegram flood control: back off briefly, then retry the last message once.
        sleep(30)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except TimedOut:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except NetworkError:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except Exception as error:
        bot.sendMessage(chat_id=chatID, text="Error: "+str(error))
        app.logger.info("\t%s", error)
        return "Alert fail", 200


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    app.run(host='0.0.0.0', port=9119)
nohup python3 flaskAlert.py &
测试
# Simulate an Alertmanager webhook POST against the Flask bridge; replace
# username:password and the "flaskAlert" host with your real values. A Telegram
# message should arrive in the configured chat.
curl -XPOST --data '{"status":"resolved","groupLabels":{"alertname":"instance_down"},"commonAnnotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"},"alerts":[{"status":"resolved","labels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"},"endsAt":"2019-07-01T16:16:19.376244942-03:00","generatorURL":"http://pmts.io:9090","startsAt":"2019-07-01T16:02:19.376245319-03:00","annotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"}}],"version":"4","receiver":"infra-alert","externalURL":"http://alm.io:9093","commonLabels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"}}' http://username:password@flaskAlert:9119/alert
安装alertmanager
# Download and unpack Alertmanager v0.22.2, then install under /usr/local.
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
tar xf alertmanager-0.22.2.linux-amd64.tar.gz
mv alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager
cd /usr/local/alertmanager/
# /usr/local/alertmanager/alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'alertmananger-bot'

receivers:
  - name: 'alertmananger-bot'
    webhook_configs:
      # Deliver alerts (and resolutions) to the local Flask Telegram bridge,
      # using the basic-auth credentials configured in flaskAlert.py.
      - send_resolved: true
        url: http://127.0.0.1:9119/alert
        http_config:
          basic_auth:
            username: 'goroutine'
            password: 'goroutine-12345'

templates:
  - '/usr/local/alertmanager/test.tmpl'

########### /usr/local/alertmanager/test.tmpl ############
# (FIX: the original separator said "test.rmpl", which did not match the
# templates entry above.)
{{ define "test.html" }}
{{ range .Alerts }}
<pre>
故障实例: {{ .Labels.instance }}
故障概要: {{ .Annotations.summary }}
故障描述: {{ .Annotations.description }}
告警级别: {{ .Labels.severity }}
告警时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
</pre>
{{ end }}
{{ end }}
# Run Alertmanager in the background (it serves on :9093 per the configs below);
# stdout/stderr go to alertmanager.log.
nohup /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data > /usr/local/alertmanager/alertmanager.log &
测试
#!/usr/bin/env bash
# Push two synthetic alerts straight into Alertmanager to verify routing to the
# Telegram webhook receiver.
# NOTE(review): /api/v1/alerts is deprecated (still served by Alertmanager 0.22);
# prefer /api/v2/alerts going forward.
alerts_message='[
  {
    "labels": {
      "alertname": "DiskRunningFull",
      "dev": "sda1",
      "instance": "example1",
      "msgtype": "testing"
    },
    "annotations": {
      "info": "The disk sda1 is running full",
      "summary": "please check the instance example1"
    }
  },
  {
    "labels": {
      "alertname": "DiskRunningFull",
      "dev": "sda2",
      "instance": "example1",
      "msgtype": "testing"
    },
    "annotations": {
      "info": "The disk sda2 is running full",
      "summary": "please check the instance example1",
      "runbook": "the following link http://test-url should be clickable"
    }
  }
]'

curl -XPOST -d"$alerts_message" http://127.0.0.1:9093/api/v1/alerts
prometheus修改
/usr/local/prometheus-2.28.0/prometheus.yml
# Updated sections of /usr/local/prometheus-2.28.0/prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): "alertmanager" must resolve (e.g. via /etc/hosts);
            # use 127.0.0.1:9093 when Alertmanager runs on the same host.
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
rules/base_rules.yml
# rules/base_rules.yml — alerting rules built on node_exporter recording rules.
# NOTE(review): every expr references recording rules (node_exporter:*) that
# must be defined elsewhere — confirm they exist before loading this file.
# NOTE(review): Prometheus validates alert names as metric names
# ([a-zA-Z_:][a-zA-Z0-9_:]*); the hyphens used below may be rejected by
# promtool — verify, and replace hyphens with underscores if so.
# Replace "IP" in the grafana annotation links with the real Grafana host.
groups:
  - name: node-exporter-alert
    rules:
      - alert: node-exporter-down
        expr: node_exporter:up == 0
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 宕机了"
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-cpu-high
        expr: node_exporter:cpu:total:percent > 80
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-cpu-iowait-high
        expr: node_exporter:cpu:iowait:percent >= 12
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-load-load1-high
        expr: (node_exporter:load:load1) > (node_exporter:cpu:count) * 1.2
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} load1 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-memory-high
        expr: node_exporter:memory:used:percent > 85
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-disk-high
        expr: node_exporter:disk:used:percent > 88
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-disk-read:count-high
        expr: node_exporter:disk:read:count:rate > 3000
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} iops read 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-disk-write-count-high
        expr: node_exporter:disk:write:count:rate > 3000
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} iops write 使用率高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-disk-read-mb-high
        expr: node_exporter:disk:read:mb:rate > 60
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 读取字节数 高于 {{ $value }}"
          description: ""
          instance: "{{ $labels.instance }}"
          value: "{{ $value }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-disk-write-mb-high
        expr: node_exporter:disk:write:mb:rate > 60
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 写入字节数 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-filefd-allocated-percent-high
        expr: node_exporter:filefd_allocated:percent > 80
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-network-netin-error-rate-high
        expr: node_exporter:network:netin:error:rate > 4
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包进入的错误速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-network-netin-packet-rate-high
        expr: node_exporter:network:netin:packet:rate > 35000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包进入速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-network-netout-packet-rate-high
        expr: node_exporter:network:netout:packet:rate > 35000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 包流出速率 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-network-tcp-total-count-high
        expr: node_exporter:network:tcp:total:count > 40000
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} tcp连接数量 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-process-zoom-total-count-high
        expr: node_exporter:process:zoom:total:count > 10
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} 僵死进程数量 高于 {{ $value }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
      - alert: node-exporter-time-offset-high
        expr: node_exporter:time:offset > 0.03
        for: 2m
        labels:
          severity: info
        annotations:
          summary: "instance: {{ $labels.instance }} {{ $labels.desc }} {{ $value }} {{ $labels.unit }}"
          description: ""
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
          type: "google-cloud"
# Restart Prometheus so the new alerting configuration and rule files are loaded.
# (Sending SIGHUP to the prometheus process also re-reads the config without a restart.)
systemctl restart prometheus