监控工具 - Prometheus 在 Rocky Linux 9 的二进制安装
下载安装包
https://prometheus.io/download/
# Prometheus 选择LTS版本下载
https://github.com/prometheus/prometheus/releases/download/v2.53.2/prometheus-2.53.2.linux-amd64.tar.gz
# alertmanager
https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
# promlens
https://github.com/prometheus/promlens/releases/download/v0.3.0/promlens-0.3.0.linux-amd64.tar.gz
# pushgateway
https://github.com/prometheus/pushgateway/releases/download/v1.9.0/pushgateway-1.9.0.linux-amd64.tar.gz
二进制方式安装配置
[root@node200 ~]# useradd prometheus && echo "prometheus:prometheus"|chpasswd && chage -M 99999 prometheus
[root@node200 ~]#
[root@node200 ~]# cat /etc/passwd |grep prometheus
prometheus:x:1001:1001::/home/prometheus:/bin/bash
[root@node200 ~]#
[root@node200 ~]# ll /home
total 4
drwx------. 14 anliven anliven 4096 Sep 13 14:24 anliven
drwx------ 3 prometheus prometheus 78 Sep 13 14:27 prometheus
[root@node200 ~]#
[root@node200 ~]# systemctl status firewalld.service
○ firewalld.service - firewalld - dynamic firewall daemon
Loaded: loaded (/usr/lib/systemd/system/firewalld.service; disabled; preset: enabled)
Active: inactive (dead)
Docs: man:firewalld(1)
[root@node200 ~]#
[root@node200 ~]# ll Prometheus/
total 181232
-rw-r--r-- 1 anliven anliven 30866868 Sep 13 13:59 alertmanager-0.27.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 11269099 Sep 13 14:00 blackbox_exporter-0.25.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 10676343 Sep 13 14:00 node_exporter-1.8.2.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 104212702 Sep 13 13:59 prometheus-2.53.2.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 17982288 Sep 13 14:03 promlens-0.3.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 10563386 Sep 13 14:03 pushgateway-1.9.0.linux-amd64.tar.gz
[root@node200 ~]# cd Prometheus/
[root@node200 Prometheus]#
[root@node200 Prometheus]# tar -zxf prometheus-2.53.2.linux-amd64.tar.gz -C /opt
[root@node200 Prometheus]#
[root@node200 Prometheus]# cd /opt
[root@node200 opt]# chown -R prometheus:prometheus /opt/prometheus-2.53.2.linux-amd64
[root@node200 opt]# ln -sv /opt/prometheus-2.53.2.linux-amd64 prometheus
'prometheus' -> '/opt/prometheus-2.53.2.linux-amd64'
[root@node200 opt]#
[root@node200 opt]# ll |grep prometheus
lrwxrwxrwx 1 root root 34 Sep 13 14:58 prometheus -> /opt/prometheus-2.53.2.linux-amd64
drwxr-xr-x 4 prometheus prometheus 132 Aug 9 23:16 prometheus-2.53.2.linux-amd64
[root@node200 opt]# cd
[root@node200 ~]# vim /usr/lib/systemd/system/prometheus.service
[root@node200 ~]# cat /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus server daemon
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/prometheus/prometheus \
--config.file "/opt/prometheus/prometheus.yml" \
--storage.tsdb.path "/opt/prometheus/data" \
--storage.tsdb.retention=15d \
--web.console.templates="/opt/prometheus/consoles" \
--web.console.libraries="/opt/prometheus/console_libraries" \
--web.max-connections=512 \
--web.enable-lifecycle \
--web.listen-address="0.0.0.0:9090"
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@node200 ~]#
[root@node200 ~]# systemctl daemon-reload
[root@node200 ~]# systemctl enable prometheus.service
Created symlink /etc/systemd/system/multi-user.target.wants/prometheus.service → /usr/lib/systemd/system/prometheus.service.
[root@node200 ~]# systemctl start prometheus
[root@node200 ~]# systemctl status prometheus
● prometheus.service - Prometheus server daemon
Loaded: loaded (/usr/lib/systemd/system/prometheus.service; enabled; preset: disabled)
Active: active (running) since Fri 2024-09-13 15:06:33 CST; 2s ago
Main PID: 3982 (prometheus)
Tasks: 8 (limit: 48820)
Memory: 19.3M
CPU: 86ms
CGroup: /system.slice/prometheus.service
└─3982 /opt/prometheus/prometheus --config.file /opt/prometheus/prometheus.yml --storage.tsdb.path /opt/prometheus/data --stor>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.025Z caller=head.go:721 level=info component=tsdb msg="Replaying WAL, this>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.026Z caller=head.go:793 level=info component=tsdb msg="WAL segment loaded">
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.026Z caller=head.go:830 level=info component=tsdb msg="WAL replay complete>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1169 level=info fs_type=XFS_SUPER_MAGIC
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1172 level=info msg="TSDB started"
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1354 level=info msg="Loading configuration file" filen>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.037Z caller=main.go:1391 level=info msg="updated GOGC" old=100 new=75
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=main.go:1402 level=info msg="Completed loading of configuratio>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=main.go:1133 level=info msg="Server is ready to receive web re>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=manager.go:164 level=info component="rule manager" msg="Starti>
lines 1-20/20 (END)
^C
[root@node200 opt]#
登录页面
http://192.168.16.200:9090/ 默认是Graph页面
prometheus命令帮助信息
[root@node200 ~]# /opt/prometheus/prometheus -h
usage: prometheus [<flags>]
The Prometheus monitoring server
Flags:
-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man).
--[no-]version Show application version.
--config.file="prometheus.yml"
Prometheus configuration file path.
--web.listen-address="0.0.0.0:9090"
Address to listen on for UI, API, and telemetry.
--auto-gomemlimit.ratio=0.9
The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory
--web.config.file="" [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication.
--web.read-timeout=5m Maximum duration before timing out read of the request, and closing idle connections.
--web.max-connections=512 Maximum number of simultaneous connections.
--web.external-url=<URL> The URL under which Prometheus is externally reachable (for example, if Prometheus is served via a reverse
proxy). Used for generating relative and absolute links back to Prometheus itself. If the URL has a path
portion, it will be used to prefix all HTTP endpoints served by Prometheus. If omitted, relevant URL
components will be derived automatically.
--web.route-prefix=<path> Prefix for the internal routes of web endpoints. Defaults to path of --web.external-url.
--web.user-assets=<path> Path to static asset directory, available at /user.
--[no-]web.enable-lifecycle
Enable shutdown and reload via HTTP request.
--[no-]web.enable-admin-api
Enable API endpoints for admin control actions.
--[no-]web.enable-remote-write-receiver
Enable API endpoint accepting remote write requests.
--web.console.templates="consoles"
Path to the console template directory, available at /consoles.
--web.console.libraries="console_libraries"
Path to the console library directory.
--web.page-title="Prometheus Time Series Collection and Processing Server"
Document title of Prometheus instance.
--web.cors.origin=".*" Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1|domain2)\.com'
--storage.tsdb.path="data/"
Base path for metrics storage. Use with server mode only.
--storage.tsdb.retention=STORAGE.TSDB.RETENTION
[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use
"storage.tsdb.retention.time" instead. Use with server mode only.
--storage.tsdb.retention.time=STORAGE.TSDB.RETENTION.TIME
How long to retain samples in storage. When this flag is set it overrides "storage.tsdb.retention".
If neither this flag nor "storage.tsdb.retention" nor "storage.tsdb.retention.size" is set, the retention
time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. Use with server mode only.
--storage.tsdb.retention.size=STORAGE.TSDB.RETENTION.SIZE
Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB,
TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. Use with server mode only.
--[no-]storage.tsdb.no-lockfile
Do not create lockfile in data directory. Use with server mode only.
--storage.tsdb.head-chunks-write-queue-size=0
Size of the queue through which head chunks are written to the disk to be m-mapped, 0 disables the queue
completely. Experimental. Use with server mode only.
--storage.agent.path="data-agent/"
Base path for metrics storage. Use with agent mode only.
--[no-]storage.agent.wal-compression
Compress the agent WAL. Use with agent mode only.
--storage.agent.retention.min-time=STORAGE.AGENT.RETENTION.MIN-TIME
Minimum age samples may be before being considered for deletion when the WAL is truncated Use with agent
mode only.
--storage.agent.retention.max-time=STORAGE.AGENT.RETENTION.MAX-TIME
Maximum age samples may be before being forcibly deleted when the WAL is truncated Use with agent mode
only.
--[no-]storage.agent.no-lockfile
Do not create lockfile in data directory. Use with agent mode only.
--storage.remote.flush-deadline=<duration>
How long to wait flushing sample on shutdown or config reload.
--storage.remote.read-sample-limit=5e7
Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no
limit. This limit is ignored for streamed response types. Use with server mode only.
--storage.remote.read-concurrent-limit=10
Maximum number of concurrent remote read calls. 0 means no limit. Use with server mode only.
--storage.remote.read-max-bytes-in-frame=1048576
Maximum number of bytes in a single frame for streaming remote read response types before marshalling.
Note that client might have limit on frame size as well. 1MB as recommended by protobuf by default.
Use with server mode only.
--rules.alert.for-outage-tolerance=1h
Max time to tolerate prometheus outage for restoring "for" state of alert. Use with server mode only.
--rules.alert.for-grace-period=10m
Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured
"for" time greater than grace period. Use with server mode only.
--rules.alert.resend-delay=1m
Minimum amount of time to wait before resending an alert to Alertmanager. Use with server mode only.
--rules.max-concurrent-evals=4
Global concurrency limit for independent rules that can run concurrently. When set, "query.max-concurrency"
may need to be adjusted accordingly. Use with server mode only.
--alertmanager.notification-queue-capacity=10000
The capacity of the queue for pending Alertmanager notifications. Use with server mode only.
--query.lookback-delta=5m The maximum lookback duration for retrieving metrics during expression evaluations and federation. Use with
server mode only.
--query.timeout=2m Maximum time a query may take before being aborted. Use with server mode only.
--query.max-concurrency=20
Maximum number of queries executed concurrently. Use with server mode only.
--query.max-samples=50000000
Maximum number of samples a single query can load into memory. Note that queries will fail if they try to
load more samples than this into memory, so this also limits the number of samples a query can return.
Use with server mode only.
--enable-feature= ... Comma separated feature names to enable. Valid options: agent, auto-gomemlimit, exemplar-storage,
expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions,
remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs,
no-default-scrape-port, native-histograms, otlp-write-receiver, created-timestamp-zero-ingestion,
concurrent-rule-eval. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.
--log.level=info Only log messages with the given severity or above. One of: [debug, info, warn, error]
--log.format=logfmt Output format of log messages. One of: [logfmt, json]
更新Prometheus配置
修改配置文件prometheus.yml后,可通过以下两种方式使新配置生效:
- 重启服务方式:重启Prometheus服务即可加载配置文件
- 热加载方式:通过API发送POST请求
curl -X POST http://192.168.16.200:9090/-/reload
- 热加载方式需要在Prometheus服务启动时指定
--web.enable-lifecycle
验证Prometheus配置
通过promtool工具核查配置。
[root@node200 ~]# /opt/prometheus/promtool check config /opt/prometheus/prometheus.yml
Checking /opt/prometheus/prometheus.yml
SUCCESS: /opt/prometheus/prometheus.yml is valid prometheus config file syntax
[root@node200 ~]#
[root@node200 ~]# /opt/prometheus/promtool -h
usage: promtool [<flags>] <command> [<args> ...]
Tooling for the Prometheus monitoring system.
Flags:
-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man).
--[no-]version Show application version.
--[no-]experimental Enable experimental commands.
--enable-feature= ... Comma separated feature names to enable (only PromQL related and no-default-scrape-port). See
https://prometheus.io/docs/prometheus/latest/feature_flags/ for the options and more details.
Commands:
help [<command>...]
Show help.
check service-discovery [<flags>] <config-file> <job>
Perform service discovery for the given job name and report the results, including relabeling.
check config [<flags>] <config-files>...
Check if the config files are valid or not.
check web-config <web-config-files>...
Check if the web config files are valid or not.
check healthy [<flags>]
Check if the Prometheus server is healthy.
check ready [<flags>]
Check if the Prometheus server is ready.
check rules [<flags>] [<rule-files>...]
Check if the rule files are valid or not.
check metrics
Pass Prometheus metrics over stdin to lint them for consistency and correctness.
examples:
$ cat metrics.prom | promtool check metrics
$ curl -s http://localhost:9090/metrics | promtool check metrics
query instant [<flags>] <server> <expr>
Run instant query.
query range [<flags>] <server> <expr>
Run range query.
query series --match=MATCH [<flags>] <server>
Run series query.
query labels [<flags>] <server> <name>
Run labels query.
query analyze --server=SERVER --type=TYPE --match=MATCH [<flags>]
Run queries against your Prometheus to analyze the usage pattern of certain metrics.
debug pprof <server>
Fetch profiling debug information.
debug metrics <server>
Fetch metrics debug information.
debug all <server>
Fetch all debug information.
push metrics [<flags>] <remote-write-url> [<metric-files>...]
Push metrics to a prometheus remote write (for testing purpose only).
test rules [<flags>] <test-rule-file>...
Unit tests for rules.
tsdb bench write [<flags>] [<file>]
Run a write performance benchmark.
tsdb analyze [<flags>] [<db path>] [<block id>]
Analyze churn, label pair cardinality and compaction efficiency.
tsdb list [<flags>] [<db path>]
List tsdb blocks.
tsdb dump [<flags>] [<db path>]
Dump samples from a TSDB.
tsdb dump-openmetrics [<flags>] [<db path>]
[Experimental] Dump samples from a TSDB into OpenMetrics text format, excluding native histograms and staleness markers, which are not
representable in OpenMetrics.
tsdb create-blocks-from openmetrics <input file> [<output directory>]
Import samples from OpenMetrics input and produce TSDB blocks. Please refer to the storage docs for more details.
tsdb create-blocks-from rules --start=START [<flags>] <rule-files>...
Create blocks of data for new recording rules.
promql format <query>
Format PromQL query to pretty printed form.
promql label-matchers set [<flags>] <query> <name> <value>
Set a label matcher in the query.
promql label-matchers delete <query> <name>
Delete a label from the query.
[root@node200 ~]#
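Docker方式快速安装
注意:启动容器前需先在宿主机 /etc/prometheus 目录下准备好 prometheus.yml(挂载空目录会遮盖镜像内置的默认配置,导致容器启动失败),并确保 /etc/prometheus/data 对容器内的运行用户 nobody(UID 65534)可写。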
docker pull prom/prometheus:v2.53.2
mkdir -p /etc/prometheus/data
docker run -itd --name prometheus \
-p 9090:9090 \
-v /etc/prometheus:/etc/prometheus \
-v /etc/prometheus/data:/prometheus \
prom/prometheus:v2.53.2
常用页面
- 默认页面 http://192.168.16.200:9090/graph
- 告警页面 http://192.168.16.200:9090/alerts
- Targets信息 http://192.168.16.200:9090/targets
- 指标信息Metrics http://<targets-ip>:9090/metrics
- http://192.168.16.200:9090/status
- http://192.168.16.200:9090/tsdb-status
- http://192.168.16.200:9090/flags
- http://192.168.16.200:9090/config
- http://192.168.16.200:9090/rules
- http://192.168.16.200:9090/service-discovery
行动是绝望的解药!
欢迎转载和引用,但请在明显处保留原文链接和原作者信息!
本博客内容多为个人工作与学习的记录,少数内容来自于网络并略有修改,已尽力标明原文链接和转载说明。如有冒犯,即刻删除!
以所舍,求所得,有所获,方所成。