TPNG(telegraf+prometheus+grafana+alertmanager)+容器方式部署总结
前言:
prometheus采集数据的时间间隔建议:小于15s
prometheus检查规则的时间间隔建议:小于15s
实际生产环境中部署的为10s
telegraf采集数据的时间间隔根据采集机器数量决定
当采集交换机数量达到30台以上时建议设置为:110s
数据刷新时间间隔设置为:110s
grafana:默认从prometheus拉取数据的时间间隔设置为:10s
达到的效果:完美监控linux主机,并通过snmp实现对交换机的监控。
扩展:
可以在 inputs.snmp 里加入 fieldpass 来实现过滤。针对监控项(自动发现)可以加的过滤参数如下:
1. fieldpass 可以理解为:只有匹配该名称的监控项才会被采集
2. fielddrop 匹配该名称的监控项不进行采集
一. 安装TPNG
#安装 grafana
#!/bin/bash
# Print the prerequisites banner: the external commands this installer
# expects to find on the machine.
# Outputs: one line on stdout
start() {
  printf '%s\n' "1.需要用到的命令wget apt-get tar"
}
# Download the Grafana 8.5.2 tarball and unpack it under /opt/granfana,
# unless a grafana process is already listening on a socket.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 when Grafana is already listening or was unpacked;
#            1 when creating the directory, downloading or extracting failed
install_1() {
  local install_grafana="grafana-8.5.2.linux-amd64.tar.gz"
  # The directory keeps its original spelling ("granfana") because the
  # systemd unit written by tuoguan() points at exactly this path.
  local install_dir="/opt/granfana"

  echo "1. 开始检测是否安装grafana"
  # A listening socket owned by a grafana process is used as the
  # "already installed" signal.
  if ss -ntulp | grep -q grafana; then
    echo "绘图软件grafana已安装"
    return 0
  fi

  echo "开始安装grafana"
  # Every step must succeed; previously only the status of tar was looked at,
  # so a failed download was not reported on its own.
  if mkdir -p "${install_dir}" \
    && wget "https://dl.grafana.com/oss/release/${install_grafana}" \
    && tar -xvzf "${install_grafana}" -C "${install_dir}"; then
    echo "下载解压完成"
  else
    echo "下载解压失败"
    return 1
  fi
}
# Register Grafana (unpacked under /opt/granfana) as a systemd service,
# enable it at boot, (re)start it and show its status.
# Globals:   none
# Arguments: none
# Outputs:   progress message and `systemctl status` output on stdout
# Returns:   1 if the unit file cannot be written, otherwise the status of
#            the final `systemctl status`
tuoguan() {
  local unit_file="/etc/systemd/system/grafana.service"
  local grafana_bin="/opt/granfana/grafana-8.5.2/bin"

  echo "开始创建grafana托管文件"
  # Two systemd rules shape the unit below:
  #  * ExecStart is not run through a shell, so a bare "> file" would be
  #    handed to grafana-server as arguments; wrap the command in sh -c.
  #  * '#' starts a comment only at the beginning of a line, so a trailing
  #    comment after WorkingDirectory= would become part of the path.
  cat <<EOF >"${unit_file}" || return 1
[Unit]
Description="grafana"
After=network.target
[Service]
Type=simple
ExecStart=/bin/sh -c 'exec ${grafana_bin}/grafana-server web > ${grafana_bin}/web.log'
# Directory that contains the grafana-server executable
WorkingDirectory=${grafana_bin}
Restart=always
[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
  systemctl enable grafana
  systemctl restart grafana
  systemctl status grafana
}
# Entry point of the Grafana installer: show the banner, unpack Grafana,
# then register it with systemd. Steps run in order, unconditionally.
main() {
  local step
  for step in start install_1 tuoguan; do
    "${step}"
  done
}

main
# 安装 alertmanager
#!/bin/bash
# Show which external commands the Alertmanager installer depends on.
# Outputs: a single banner line on stdout
start() {
  local -r banner="1.需要用到的命令wget apt-get tar"
  echo "${banner}"
}
# Install Alertmanager 0.24.0 under /opt/alertmanager and run it as a
# systemd service, unless an alertmanager process is already listening.
# Globals:   none
# Arguments: none
# Outputs:   progress messages and `systemctl status` output on stdout,
#            failure message on stderr
# Returns:   0 if Alertmanager is already listening (nothing is touched);
#            1 if download, extraction or writing the unit file failed;
#            otherwise the status of the final `systemctl status`
install_1() {
  local tarball="alertmanager-0.24.0.linux-amd64.tar.gz"
  local install_root="/opt/alertmanager"
  local home="${install_root}/alertmanager-0.24.0.linux-amd64"
  local unit_file="/etc/systemd/system/alertmanager.service"

  # Test the pipeline directly: checking $? after an intervening variable
  # assignment always sees 0 and would report "installed" every time.
  if ss -ntulp | grep -q alertmanager; then
    echo "alertmanager 已安装"
    return 0
  fi

  echo "开始安装 ${tarball}"
  # tar -C needs an existing directory, so create it before extracting.
  if ! {
    mkdir -p "${install_root}" \
      && wget "https://github.com/prometheus/alertmanager/releases/download/v0.24.0/${tarball}" \
      && tar -xvzf "${tarball}" -C "${install_root}"
  }; then
    echo "下载解压失败" >&2
    return 1
  fi

  # The unit must reference the same directory the tarball was extracted to.
  cat <<EOF >"${unit_file}" || return 1
[Unit]
Description="alertmanager"
After=network.target
[Service]
ExecStart=${home}/alertmanager --config.file=${home}/alertmanager.yml --storage.path=${home}/data --web.listen-address=:9093 --data.retention=120h
WorkingDirectory=${home}
Restart=always
[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
  systemctl enable alertmanager
  systemctl restart alertmanager
  systemctl status alertmanager
}
# Entry point of the Alertmanager installer: print the banner, then install.
main() {
  local -a steps=(start install_1)
  local fn
  for fn in "${steps[@]}"; do
    "${fn}"
  done
}

main
# 安装 prometheus
#!/bin/bash
# Install Prometheus 2.28.0 under /opt/prometheus and run it as a systemd
# service listening on :8091 with 90 days of retention.
# Must be run with permission to write /opt and /etc/systemd/system.

prom_release="prometheus-2.28.0.linux-amd64"
prom_home="/opt/prometheus"

# Stop at the first failed step so a service pointing at a missing binary
# is never registered.
mkdir -p "${prom_home}" || exit 1
wget "https://s3-gz01.didistatic.com/n9e-pub/prome/${prom_release}.tar.gz" \
  -O "${prom_release}.tar.gz" || exit 1
tar xf "${prom_release}.tar.gz" || exit 1
cp -far "${prom_release}"/* "${prom_home}/" || exit 1

# service
# --storage.tsdb.retention.time replaces the deprecated --storage.tsdb.retention.
cat <<EOF >/etc/systemd/system/prometheus.service || exit 1
[Unit]
Description="prometheus"
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
ExecStart=${prom_home}/prometheus --config.file=${prom_home}/prometheus.yml --storage.tsdb.path=${prom_home}/data --web.enable-lifecycle --enable-feature=remote-write-receiver --query.lookback-delta=2m --web.listen-address=:8091 --storage.tsdb.retention.time=90d
Restart=on-failure
SuccessExitStatus=0
LimitNOFILE=65536
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=prometheus
[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable prometheus
systemctl restart prometheus
systemctl status prometheus
# 安装telegraf
#!/bin/sh
# Install Telegraf under /opt/telegraf, write a baseline host-metrics
# configuration and run it as a systemd service.
# Must be run with permission to write /opt and /etc/systemd/system.

version=1.20.4
tarball="telegraf-${version}_linux_amd64.tar.gz"
telegraf_home=/opt/telegraf

# Abort on a failed download/extract instead of installing a broken service.
wget "https://dl.influxdata.com/telegraf/releases/${tarball}" || exit 1
tar xzvf "${tarball}" || exit 1

# The unit below starts telegraf with --config-directory pointing at
# telegraf.d, so that directory has to exist as well.
mkdir -p "${telegraf_home}/telegraf.d" || exit 1
cp -far "telegraf-${version}/usr/bin/telegraf" "${telegraf_home}" || exit 1

# Quoted delimiter: the configuration is written literally, no expansion.
cat <<'EOF' > "${telegraf_home}/telegraf.conf" || exit 1
[global_tags]

[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = ""
  hostname = ""
  omit_hostname = false

[[outputs.opentsdb]]
  host = "http://127.0.0.1"
  port = 19000
  http_batch_size = 50
  http_path = "/opentsdb/put"
  debug = false
  separator = "_"

[[inputs.cpu]]
  percpu = true
  totalcpu = true
  collect_cpu_time = false
  report_active = true

[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]

[[inputs.diskio]]

[[inputs.kernel]]

[[inputs.mem]]

[[inputs.processes]]

[[inputs.system]]
  fielddrop = ["uptime_format"]

[[inputs.net]]
  ignore_protocol_stats = true
EOF

cat <<'EOF' > /etc/systemd/system/telegraf.service || exit 1
[Unit]
Description="telegraf"
After=network.target
[Service]
Type=simple
ExecStart=/opt/telegraf/telegraf --config /opt/telegraf/telegraf.conf --config-directory /opt/telegraf/telegraf.d
WorkingDirectory=/opt/telegraf
SuccessExitStatus=0
LimitNOFILE=65536
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=telegraf
KillMode=process
KillSignal=SIGQUIT
TimeoutStopSec=5
Restart=always
[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable telegraf
systemctl restart telegraf
systemctl status telegraf
二. 安装snmp
1. 安装snmp
服务端:snmpd
客户端:snmp
apt-get install -y snmpd snmp
2. 本地测试snmp
snmpwalk -c public -v 2c localhost .1.3.6.1.2.1.1
扩展
-c : 团体名(community)
-v : snmp版本
注: 将目录 "/usr/share/snmp/mibs/" 中的mibs文件替换 不然无法获取某些 oid .
三. 配置
1. 配置:telegraf 监控linux
# Monitor Linux hosts (plus one SNMP switch) -- configuration file: telegraf.conf
# The quoted delimiter writes the configuration literally, with no expansion.
cat <<'EOF' > /opt/telegraf/telegraf.conf
[agent]
  # https://docs.influxdata.com/telegraf/v1.16/administration/configuration/#agent-configuration
  interval = "110s"        # collect once every 110s
  flush_interval = "110s"  # flush collected metrics to the outputs every 110s
  round_interval = true
  metric_batch_size = 9000
  metric_buffer_limit = 18000
  collection_jitter = "0s"
  flush_jitter = "0s"
  precision = ""
  hostname = "192.168.10.5"
  omit_hostname = false

[global_tags]
  TPNG = "CISHI"

[[inputs.cpu]]
  percpu = true
  totalcpu = true
  collect_cpu_time = false
  report_active = true

[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]

[[inputs.diskio]]

[[inputs.kernel]]

[[inputs.mem]]

[[inputs.processes]]

[[inputs.system]]
  fielddrop = ["uptime_format"]

[[inputs.net]]
  ignore_protocol_stats = true

[[outputs.prometheus_client]]
  # https://github.com/influxdata/telegraf/blob/master/plugins/outputs/prometheus_client/README.md
  ## Address to listen on.
  listen = ":9273"
  metric_version = 2
  path = "/metrics"
  #string_as_label = true
  export_timestamp = true

[[inputs.snmp]]
  agents = [ "ip:161" ]
  version = 2
  community = "团体名"
  agent_host_tag = "ident"
  #fieldpass = ["ifInOctets","ifAdminStatus","ifOutOctets","ifOperStatus","ifMtu","ifSpeed"]
  fieldpass = ["ifHCInOctets","ifHCOutOctets","ifHighSpeed"]

  [[inputs.snmp.field]]          # system name of the (Huawei) switch
    name = "hostname"
    oid = "SNMPv2-MIB::sysName.0"
    is_tag = true                # emitted as a tag

  [[inputs.snmp.table]]          # table to collect: interface in/out traffic
    name = "interface"           # free-form name used as the prefix of the collected metrics
    inherit_tags = [ "hostname" ] # top-level tags to inherit and emit with each row
    oid = "IF-MIB::ifXTable"     # table to walk

  #[[inputs.snmp.table]]
  #name = "interface"
  #inherit_tags = [ "hostname" ]
  #oid = "IF-MIB::ifTable"

    [[inputs.snmp.table.field]]  # port name, used as a tag
      name = "ifDescr"           # free-form name
      oid = "IF-MIB::ifName"     # OID of the field
      is_tag = true              # whether this field is emitted as a tag

  [inputs.snmp.tags]             # plugin-level tags
    addr = "插件标签"

#[[inputs.exec]]
#commands = ["/tmp/test.sh"]
#timeout = "5m"
#data_format = "influx"
#name_suffix = "_mycollector"

#[[outputs.opentsdb]]
#host = "http://ip"
#port = 19000
#http_batch_size = 1000
#http_path = "/opentsdb/put"
#debug = false
#separator = "_"
EOF
2. 监控交换机32位(只适用于小业务,大业务推荐使用64位)
# ---------------------------------------------------------------------------
# Variant 1: 32-bit counters -- switch interface in/out traffic (IF-MIB::ifTable)
# ---------------------------------------------------------------------------
[[inputs.snmp]]
  agents = [ "10.0.0.134:161" ]
  version = 2
  community = "tuanti"
  agent_host_tag = "ident"
  fieldpass = ["ifInOctets","ifAdminStatus","ifOutOctets","ifOperStatus","ifMtu","ifSpeed"]

  [[inputs.snmp.field]]
    name = "hostname"
    oid = "SNMPv2-MIB::sysName.0"
    is_tag = true

  [[inputs.snmp.table]]
    name = "interface"
    inherit_tags = [ "hostname" ]
    oid = "IF-MIB::ifTable"

    [[inputs.snmp.table.field]]
      name = "ifDescr"
      oid = "IF-MIB::ifDescr"
      is_tag = true

# ---------------------------------------------------------------------------
# Variant 2: 64-bit counters -- switch in/out traffic (IF-MIB::ifXTable)
# Marked in the original notes as verified working (auto-discovery).
# ---------------------------------------------------------------------------
[[inputs.snmp]]
  agents = [ "10.0.0.134:16161" ]
  version = 2
  community = "团体名"
  agent_host_tag = "ident"
  fieldpass = ["ifHCInOctets","ifHCOutOctets","ifHighSpeed"]

  [[inputs.snmp.field]]          # system name of the (Huawei) switch
    name = "hostname"
    oid = "SNMPv2-MIB::sysName.0"
    is_tag = true                # emitted as a tag

  # Table to collect: ifXTable.
  # (original note: oid = "SNMPv2-MIB::sysName.0" also works)
  [[inputs.snmp.table]]
    name = "interface"
    inherit_tags = [ "hostname" ]
    oid = "IF-MIB::ifXTable"

    [[inputs.snmp.table.field]]  # port name, emitted as a tag
      name = "ifDescr"
      oid = "IF-MIB::ifName"
      is_tag = true

  [inputs.snmp.tags]             # plugin-level tags
    addr = "进出口流量"

# ---------------------------------------------------------------------------
# Variant 3: 32-bit and 64-bit combined, adding the port status columns
# (IF-MIB ifOperStatus / ifAdminStatus).
# Marked in the original notes as verified working.
# NOTE(review): both tables share the name "interface", and the two table
# fields below attach to the second table (ifTable) only -- confirm intended.
# ---------------------------------------------------------------------------
[[inputs.snmp]]
  agents = [ ":161" ]
  version = 2
  community = ""
  agent_host_tag = "ident"
  fieldpass = ["ifHCInOctets","ifHCOutOctets","ifHighSpeed","ifOperStatus","ifAdminStatus"]

  [[inputs.snmp.field]]          # system name of the (Huawei) switch
    name = "hostname"
    oid = "SNMPv2-MIB::sysName.0"
    is_tag = true                # emitted as a tag

  [[inputs.snmp.table]]          # table to collect: interface in/out traffic
    name = "interface"           # free-form name used as the prefix of the collected metrics
    inherit_tags = [ "hostname" ] # top-level tags to inherit and emit with each row
    oid = "IF-MIB::ifXTable"     # table to walk

  [[inputs.snmp.table]]          # table to collect: interface in/out traffic
    name = "interface"           # free-form name used as the prefix of the collected metrics
    inherit_tags = [ "hostname" ] # top-level tags to inherit and emit with each row
    oid = "IF-MIB::ifTable"

    [[inputs.snmp.table.field]]  # port name, used as a tag
      name = "ifDescr"           # free-form name
      oid = "IF-MIB::ifName"     # OID of the field
      is_tag = true              # whether this field is emitted as a tag

    [[inputs.snmp.table.field]]  # port alias, used as a tag
      name = "ifAlias"           # free-form name
      oid = "IF-MIB::ifAlias"    # OID of the field
      is_tag = true

  [inputs.snmp.tags]             # plugin-level tags
    addr = ""

# ---------------------------------------------------------------------------
# Variant 4: use the 64-bit label (IF-MIB::ifName) as the tag of a
# 32-bit-only (ifTable) collection.
# ---------------------------------------------------------------------------
[[inputs.snmp]]
  agents = [ "" ]
  version = 2
  community = "团体名"
  agent_host_tag = "ident"
  fieldpass = ["ifHCInOctets","ifHCOutOctets","ifHighSpeed","ifOperStatus","ifAdminStatus"]

  [[inputs.snmp.field]]          # system name of the (Huawei) switch
    name = "hostname"
    oid = "SNMPv2-MIB::sysName.0"
    is_tag = true                # emitted as a tag

  [[inputs.snmp.table]]          # table to collect: interface in/out traffic
    name = "interface"           # free-form name used as the prefix of the collected metrics
    inherit_tags = [ "hostname" ] # top-level tags to inherit and emit with each row
    oid = "IF-MIB::ifTable"

    [[inputs.snmp.table.field]]  # port name, used as a tag
      name = "ifDescr"           # free-form name
      oid = "IF-MIB::ifName"     # OID of the field
      is_tag = true              # whether this field is emitted as a tag

    [[inputs.snmp.table.field]]  # port alias, used as a tag
      name = "ifAlias"           # free-form name
      oid = "IF-MIB::ifAlias"    # OID of the field
      is_tag = true

  [inputs.snmp.tags]             # plugin-level tags
    addr = ""
监控交换机cpu/mem/温度/运行时长/等
# SNMP input that reads device-level health values from a switch:
# temperature, CPU usage, memory usage and uptime.
# NOTE(review): the enterprises.2011 OIDs end in an entity index (68943881)
# that is presumably specific to one device model -- confirm per switch.
[[inputs.snmp]]
agents = [
"ip"
]
version = 2
community = ""
agent_host_tag = "ident"
[[inputs.snmp.field]] # switch system name, used as a label
name = "hostname"
oid = "SNMPv2-MIB::sysName.0"
is_tag = true # emitted as a tag
[[inputs.snmp.field]] # switch temperature
oid = "SNMPv2-SMI::enterprises.2011.5.25.31.1.1.1.1.11.68943881"
name = "wendu"
[[inputs.snmp.field]] # switch CPU usage
oid = "SNMPv2-SMI::enterprises.2011.5.25.31.1.1.1.1.5.68943881"
name = "CpuUsage"
[[inputs.snmp.field]] # memory usage
oid = "SNMPv2-SMI::enterprises.2011.5.25.31.1.1.1.1.7.68943881"
name = "MemUsage"
[[inputs.snmp.field]] # uptime
oid = "DISMAN-EVENT-MIB::sysUpTimeInstance"
name = "uptime"
[inputs.snmp.tags] # plugin-level tags
addr = "cpu_mem相关"
详解:
容器方式docker-compose文件
# docker-compose stack: telegraf (collector), prometheus (storage/query),
# grafana (dashboards) and alertmanager (alert routing).
version: "3.0"
services:
  telegraf: # collector
    image: telegraf:1.16.0
    restart: always
    container_name: telegraf
    hostname: telegraf
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9273:9273"
    volumes:
      - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf
      - ./telegraf/telegraf.d/:/etc/telegraf/telegraf.d/
      - ./telegraf/mibs/:/usr/share/snmp/mibs/
      - ./telegraf/etc:/etc/snmp/
    command: telegraf --config /etc/telegraf/telegraf.conf --config-directory /etc/telegraf/telegraf.d

  prometheus: # server; kept compatible with Nightingale (n9e)
    image: prom/prometheus:latest
    restart: always
    container_name: prometheus
    hostname: prometheus
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9099:9090"
    volumes:
      - ./prometheus:/etc/prometheus
      # Mounted at the directory given to --storage.tsdb.path below, so the
      # TSDB is actually written to the host volume.
      - ./prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
      # NOTE(review): with the unpinned "latest" image these flags may not
      # all be accepted by newer releases -- consider pinning a version.
      - "--enable-feature=remote-write-receiver"
      - "--query.lookback-delta=2m"

  grafana: # dashboards
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: unless-stopped
    environment:
      TZ: Asia/Shanghai
    ports:
      - "3000:3000"
    env_file:
      - ./grafana/config.monitoring

  alertmanager: # alerting
    image: prom/alertmanager:latest
    restart: always
    container_name: alertmanager
    hostname: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager # persists the configuration and the WeChat alert templates
      - ./alertmanager/data:/alertmanager/data # persists the data directory
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
prometheus配置文件详情:
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "rules/*.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['ip:8091']

  - job_name: "telegraf"
    static_configs:
      - targets: ['ip:9273', 'ip:9276', 'ip:9275']
实际效果图展示: