高可用集群参见https://www.cnblogs.com/xiaoyou2018/p/14243099.html
服务器公网IP:122.xxx.xxx.220
服务器内网IP:192.168.1.190
采用docker安装Prometheus、grafana、alertmanager、cadvisor
实现对服务器硬件、容器、web站点、接口返回内容、证书的监控
# Create the working tree under /data/prometheus:
#   conf/       - prometheus.yml, alertmanager.yml and the compose file
#   prometheus/ - bind-mounted to /prometheus inside the Prometheus container
#   rules/      - alert rule files, bind-mounted to /etc/prometheus/rules
# The paths are written out in full instead of using `cd !$`, because `!$`
# is interactive history expansion and does not work when run from a script.
mkdir -p /data/prometheus/{conf,prometheus,rules}
cd /data/prometheus/conf
vi prometheus.yml (yml文件格式一定要注意“空格”,要全部对齐、一致,不然报错,每次修改完后热更一下Prometheus服务)
# prometheus.yml - main Prometheus configuration.
# Indentation is significant in YAML; reload Prometheus after every change.
global:
  scrape_interval: 15s      # How often targets are scraped (Prometheus default: 1m).
  evaluation_interval: 15s  # How often rules are evaluated (Prometheus default: 1m).
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.1.190:9093']

# Rule files are loaded once and then evaluated every 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  - "rules.yml"
  #- "node_down.yml"
  #- "memory.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to every series scraped from that job.
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['122.xxx.xxx.220:9090']

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['122.xxx.xxx.220:8080','192.168.1.213:8080','192.168.1.215:8080','192.168.1.216:8080','192.168.1.53:8080','192.168.1.54:8080']

  # The jobs below group nodes by server type.
  # Data-warehouse servers
  - job_name: '数仓服务器'
    scrape_interval: 8s
    static_configs:
      # One target per line; the original list was broken in the middle of
      # '192.168.1.203:9100' and listed '192.168.1.23:9100' twice.
      - targets:
          - '192.168.1.45:9100'
          - '192.168.1.46:9100'
          - '192.168.1.47:9100'
          - '192.168.1.48:9100'
          - '192.168.1.44:9100'
          - '192.168.1.51:9100'
          - '192.168.1.52:9100'
          - '192.168.1.23:9100'
          - '192.168.1.211:9100'
          - '192.168.1.202:9100'
          - '192.168.1.203:9100'
          - '192.168.1.61:9100'

  # Test-environment K8S servers
  - job_name: '测试环境K8S服务器'
    scrape_interval: 8s
    static_configs:
      - targets: ['192.168.1.213:9100','192.168.1.215:9100','192.168.1.216:9100','192.168.1.53:9100','192.168.1.54:9100']

  # Idle servers
  - job_name: '空闲服务器'
    scrape_interval: 15s
    static_configs:
      - targets: ['192.168.1.193:9182']
        labels:  # extra label used to show the host name on the Grafana dashboard
          hostname: 测试服务器1

  # Web site probing through the blackbox exporter.
  - job_name: "blackbox_web"
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-dis.yml"
    relabel_configs:
      # Pass the discovered address as the ?target= URL parameter ...
      - source_labels: [__address__]
        target_label: __param_target
      # ... keep it visible as the `instance` label ...
      - source_labels: [__param_target]
        target_label: instance
      # ... and send the actual scrape to the blackbox exporter.
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # API response-body probing (uses the http_2xx_check module).
  - job_name: "blackbox_check"
    metrics_path: /probe
    params:
      module: [http_2xx_check]
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "/etc/prometheus/blackbox-check.yml"
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115

  # TCP port probing
  - job_name: 'blackbox_tcp'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - 192.168.1.45:9100
          - 192.168.1.190:9093
          - 192.168.1.212:6380
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.190:9115  # Blackbox exporter
热更新
curl -X POST http://122.xxx.xxx.220:9090/-/reload
vi alertmanager.yml
# alertmanager.yml - routes every alert to the DingTalk webhook bridge.
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']  # alerts sharing these label values form one group
  receiver: webhook
  group_wait: 30s          # wait 30s after the first alert of a new group so related alerts go out together
  group_interval: 1m       # wait before notifying about new alerts added to an already-notified group
  repeat_interval: 48h     # wait before re-sending a notification that is still firing

receivers:
  - name: webhook
    webhook_configs:
      - url: http://192.168.1.190:8060/dingtalk/webhook1/send
        send_resolved: true

# Alert suppression: a firing 'critical' alert mutes the 'warning' alert that
# has the same values for the labels listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
vi docker-compose-monitor.yml
# docker-compose-monitor.yml - Prometheus, Alertmanager, Grafana and cAdvisor
# on one bridge network.
version: '2'

networks:
  monitor:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      - /data/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/prometheus:/prometheus
      - /data/prometheus/rules/:/etc/prometheus/rules
      - /etc/localtime:/etc/localtime
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      # Enables the HTTP /-/reload endpoint used for hot reloads.
      - "--web.enable-lifecycle"
      # NOTE(review): the admin API is reachable on the published port 9090 - confirm this is intended.
      - "--web.enable-admin-api"
      - "--storage.tsdb.retention.time=30d"
    ports:
      - '9090:9090'
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - /data/prometheus/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - '9093:9093'
    networks:
      - monitor

  # NOTE(review): no volume is mounted for Grafana, so its data lives only in the container - confirm.
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '3000:3000'
    networks:
      - monitor

#  node-exporter:
#    image: quay.io/prometheus/node-exporter
#    container_name: node-exporter
#    hostname: node-exporter
#    restart: always
#    ports:
#      - '9100:9100'
#    networks:
#      - monitor

  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '8080:8080'
    networks:
      - monitor
# Start every container defined in the compose file with docker-compose
docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml up -d
# Remove all containers that were created (run as two separate commands):
#   docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml kill
#   docker-compose -f /data/prometheus/conf/docker-compose-monitor.yml rm
脚本安装node-exporter
#!/bin/bash
# Installs node_exporter v1.0.1 into /opt/node_exporter and runs it as a
# systemd service under a dedicated "prometheus" account. Must be run as root.
# Supports System: Ubuntu16.04, CentOS7
set -euo pipefail

readonly VERSION=1.0.1
readonly ARCHIVE="node_exporter-${VERSION}.linux-amd64.tar.gz"
readonly INSTALL_DIR=/opt/node_exporter
readonly UNIT_FILE=/etc/systemd/system/node_exporter.service

# Refuse to run over an existing install: `mv` would otherwise nest the new
# directory inside the old one and leave the old binary in place.
if [[ -e "${INSTALL_DIR}" ]]; then
  printf '%s already exists; remove it before re-installing\n' "${INSTALL_DIR}" >&2
  exit 1
fi

cd /opt
wget "https://github.com/prometheus/node_exporter/releases/download/v${VERSION}/${ARCHIVE}"
tar -zxvf "${ARCHIVE}"
mv "/opt/node_exporter-${VERSION}.linux-amd64" "${INSTALL_DIR}"
#rm -rf "/opt/${ARCHIVE}"

# Create the group and user only when missing, so an existing account does not abort the script.
getent group prometheus >/dev/null || groupadd prometheus
id -u prometheus >/dev/null 2>&1 || useradd -g prometheus -s /sbin/nologin -M prometheus
chown -R prometheus:prometheus "${INSTALL_DIR}"

# Write the unit straight to its final location. It stays owned by root:
# a unit file writable by the service account would let that account change
# what systemd executes.
cat > "${UNIT_FILE}" << 'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/opt/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl start node_exporter.service
systemctl enable node_exporter.service
echo "请使用curl localhost:9100命令测试是否安装成功"
cadvisor安装
# Run cAdvisor detached, published on host port 8080, with the host paths it
# inspects bind-mounted into the container.
docker run \
  --detach \
  --publish 8080:8080 \
  --name cadvisor \
  --volume /:/rootfs:ro \
  --volume /var/run:/var/run:rw \
  --volume /sys:/sys:ro \
  --volume /var/lib/docker/:/var/lib/docker:ro \
  --volume /dev/disk/:/dev/disk:ro \
  google/cadvisor:latest
blackbox_exporter 安装
# Download blackbox_exporter v0.18.0 and unpack it to /usr/local/blackbox.
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -zxvf blackbox_exporter-0.18.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/blackbox_exporter-0.18.0.linux-amd64/ /usr/local/blackbox

# Create the systemd unit. The heredoc delimiter is quoted so the trailing
# backslash in ExecStart is written literally (systemd line continuation).
cat > /etc/systemd/system/blackbox_exporter.service << 'EOF'
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
WorkingDirectory=/usr/local/blackbox
ExecStart=/usr/local/blackbox/blackbox_exporter \
  --config.file=/usr/local/blackbox/blackbox.yml

[Install]
WantedBy=multi-user.target
EOF

# Make systemd re-read unit files, then start the service and enable it at boot.
systemctl daemon-reload
systemctl start blackbox_exporter
systemctl enable blackbox_exporter
修改配置文件,实现对网站可用性和接口返回内容的监控(修改完后要重启blackbox服务)
cd /usr/local/blackbox/
vi blackbox.yml
# blackbox.yml - probe modules for blackbox_exporter.
modules:
  http_2xx:
    prober: http

  # Custom module: HTTP GET plus response-body checks.
  http_2xx_check:
    prober: http
    # Everything below in this module is the part that has to be added.
    timeout: 5s
    http:
      #valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: []
      method: GET
      #headers:
        #Host:test.kaboy.net/MessageMon.aspx
        #Accept-Language: en-US
        #Origin:test.kaboy.net
      # The probe fails (probe_success = 0) when the body matches this regexp.
      fail_if_body_matches_regexp:
        - "#fail#"
      # The probe fails (probe_success = 0) when the body does NOT match this regexp.
      # NOTE(review): the pattern is the literal text "#SUCCESS#" - confirm the
      # endpoint really returns it in this exact form and case.
      fail_if_body_not_matches_regexp:
        - "#SUCCESS#"

  http_post_2xx:
    prober: http
    http:
      method: POST

  tcp_connect:
    prober: tcp

  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false

  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"

  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"

  icmp:
    prober: icmp
进入容器创建blackbox-dis.yml、blackbox-check.yml
docker exec -it prometheus /bin/sh
vi /etc/prometheus/blackbox-dis.yml
# Sites probed by the blackbox_web job; one URL per list item.
- targets:
    - https://meeuapp.cn
    #- https://test.kaboy.net/MessageMon.aspx
    #- https://www.baidu.com
vi /etc/prometheus/blackbox-check.yml
# Endpoints whose response body is checked by the blackbox_check job.
- targets:
    #- https://meeuapp.cn
    - https://test.kaboy.net/MessageMon.aspx  # this endpoint returns "success"
    #- https://www.baidu.com
systemctl restart blackbox_exporter
创建rule规则文件
vi /data/prometheus/rules/node_exporter.yml
# node_exporter.yml - host-level alert rules.
# NOTE(review): most rules label alerts with `status`, one uses `severity`;
# confirm which label the Alertmanager routing/inhibition is meant to use.
groups:
  - name: 主机状态-监控告警
    rules:
      # Target is not answering scrapes.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          status: 非常严重
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过5分钟"

      # CPU busy percentage per instance. The expression aggregates by
      # `instance`, so that is the only label available to the templates.
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于80%(目前使用:{{$value}}%)"

      # Memory usage percentage.
      - alert: 内存使用
        expr: round(100- node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes*100) > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "当前使用率{{ $value }}%"

      # Disk IO busy percentage per instance. The previous form
      # `100 - busy% < 60` fired above 40% busy and reported the idle share,
      # contradicting the "greater than 60%" message.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}})"

      # Inbound traffic on physical interfaces. `virbr.*` replaces `virbr*`,
      # which did not match names such as virbr0.
      # NOTE(review): `/ 100 > 102400` is about 10 MB/s, while the text says
      # 100M sustained for 2 minutes with `for: 1m` - confirm the threshold.
      - alert: 网络
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "{{$labels.instance}}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"

      # Established TCP connections.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"

      # Filesystem usage percentage on ext4/xfs mounts.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.mountpoint }} 磁盘分区使用大于90%(目前使用:{{$value}}%)"
vi /data/prometheus/rules/blackbox_exporter.yml
# blackbox_exporter.yml - fires when a blackbox probe has failed for 1 minute.
groups:
  - name: 站点状态-监控告警
    rules:
      - alert: 网络检测
        expr: probe_success == 0
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 不能访问"
          description: "{{$labels.instance}} 不能访问"
vi /data/prometheus/rules/ssl.yml
# ssl.yml - warns when a probed certificate expires in fewer than 15 days.
groups:
  - name: check_ssl_status
    rules:
      - alert: "ssl证书过期警告"
        # Seconds until expiry converted to days.
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <15
        for: 1h
        labels:
          # `warning` (not `warn`) to match the severity value used by the
          # other rule files and by the Alertmanager inhibit rule.
          severity: warning
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "ssl证书过期警告"
vi /data/prometheus/rules/docker.yml
# docker.yml - container alerts (cAdvisor metrics) plus PgBouncer, Sidekiq
# and Consul alerts kept in the same group.
groups:
  - name: Docker containers monitoring
    rules:
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container killed (instance {{ $labels.instance }})"
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU usage (instance {{ $labels.instance }})"
          description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerMemoryUsage
        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Memory usage (instance {{ $labels.instance }})"
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Used-inode percentage. The parentheses make this (1 - free/total) * 100;
      # the previous form computed 1 - (free/total * 100), which is never > 80.
      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume usage (instance {{ $labels.instance }})"
          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high throttle rate (instance {{ $labels.instance }})"
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerActiveConnectinos
        expr: pgbouncer_pools_server_active_connections > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
          description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerErrors
        expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer errors (instance {{ $labels.instance }})"
          description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: PgbouncerMaxConnections
        expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PGBouncer max connections (instance {{ $labels.instance }})"
          description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: SidekiqQueueSize
        expr: sidekiq_queue_size{} > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Sidekiq queue size (instance {{ $labels.instance }})"
          description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: SidekiqSchedulingLatencyTooHigh
        expr: max(sidekiq_queue_latency) > 120
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
          description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulMissingMasterNode
        expr: consul_raft_peers < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul missing master node (instance {{ $labels.instance }})"
          description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      - alert: ConsulAgentUnhealthy
        expr: consul_health_node_status{status="critical"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
Prometheus
http://122.xxx.xxx.220:9090/
grafana
http://122.xxx.xxx.220:3000/
node exporter模板8919
blackbox exporter模板9965 7587
docker 模板 193
钉钉告警
钉钉添加机器人
钉钉机器人的webhook: https://oapi.dingtalk.com/robot/send?access_token=xxx
使用docker安装Prometheus-webhook-dingtalk
# Pull the DingTalk webhook bridge image, then run it on port 8060.
# These are two separate commands; the "webhook1" profile name is the path
# segment used in the Alertmanager webhook URL (/dingtalk/webhook1/send).
docker pull timonwong/prometheus-webhook-dingtalk
docker run -d --restart always --name dingding -p 8060:8060 \
  -v /etc/localtime:/etc/localtime \
  timonwong/prometheus-webhook-dingtalk \
  --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx"
当触发rule规则时
网站检测、接口返回内容检测
钉钉告警中展示image图片
https://www.cnblogs.com/xiaoyou2018/p/15722796.html
grafana nginx代理
grafana.ini配置
/usr/share/grafana # cat /etc/grafana/grafana.ini ##################### Grafana Configuration Example ##################### # # Everything has defaults so you only need to uncomment things you want to # change # possible values : production, development ;app_mode = production # instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty ;instance_name = ${HOSTNAME} #################################### Paths #################################### [paths] # Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) ;data = /var/lib/grafana # Temporary files in `data` directory older than given duration will be removed temp_data_lifetime = 24h # Directory where grafana can store logs ;logs = /var/log/grafana # Directory where grafana will automatically scan and look for plugins ;plugins = /var/lib/grafana/plugins # folder that contains provisioning config files that grafana will apply on startup and while running. ;provisioning = conf/provisioning #################################### Server #################################### [server] # Protocol (http, https, h2, socket) ;protocol = http # The ip address to bind to, empty will bind to all interfaces ;http_addr = # The http port to use ;enforce_domain = true # The public facing domain name used to access grafana from a browser domain = aa.midust.com serve_from_sub_path = true # Redirect to correct domain if host header does not match domain # Prevents DNS rebinding attacks ;enforce_domain = false # The full public facing url you use in browser, used for redirects and emails # If you use reverse proxy and sub path specify full url (with sub path) root_url = %(protocol)s://%(domain)s/grafana/ # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. 
# Log web requests ;router_logging = false # the path relative working path ;static_root_path = public # enable gzip ;enable_gzip = false # https certs & key file ;cert_file = ;cert_key = # Unix socket path ;socket = #################################### Database #################################### [database] # You can configure the database connection by specifying type, host, name, user and password # as separate properties or as on string using the url properties. # Either "mysql", "postgres" or "sqlite3", it's your choice ;type = sqlite3 ;host = 127.0.0.1:3306 ;name = grafana ;user = root # If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" ;password = # Use either URL or the previous fields to configure the database # Example: mysql://user:secret@host:port/database ;url = # For "postgres" only, either "disable", "require" or "verify-full" ;ssl_mode = disable ;ca_cert_path = ;client_key_path = ;client_cert_path = ;server_cert_name = # For "sqlite3" only, path relative to data_path setting ;path = grafana.db # Max idle conn setting default is 2 ;max_idle_conn = 2 # Max conn setting default is 0 (mean not set) ;max_open_conn = # Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) ;conn_max_lifetime = 14400 # Set to true to log the sql calls and execution times. ;log_queries = # For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) ;cache_mode = private #################################### Cache server ############################# [remote_cache] # Either "redis", "memcached" or "database" default is "database" ;type = database # cache connectionstring options # database: will use Grafana primary database. # redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. 
# memcache: 127.0.0.1:11211 ;connstr = #################################### Data proxy ########################### [dataproxy] # This enables data proxy logging, default is false ;logging = false # How long the data proxy waits before timing out, default is 30 seconds. # This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. ;timeout = 30 # How many seconds the data proxy waits before sending a keepalive probe request. ;keep_alive_seconds = 30 # How many seconds the data proxy waits for a successful TLS Handshake before timing out. ;tls_handshake_timeout_seconds = 10 # How many seconds the data proxy will wait for a server's first response headers after # fully writing the request headers if the request has an "Expect: 100-continue" # header. A value of 0 will result in the body being sent immediately, without # waiting for the server to approve. ;expect_continue_timeout_seconds = 1 # The maximum number of idle connections that Grafana will keep alive. ;max_idle_connections = 100 # How many seconds the data proxy keeps an idle connection open before timing out. ;idle_conn_timeout_seconds = 90 # If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. ;send_user_header = false #################################### Analytics #################################### [analytics] # Server reporting, sends usage counters to stats.grafana.org every 24 hours. # No ip addresses are being tracked, only simple counters to track # running instances, dashboard and error counts. It is very helpful to us. # Change this option to false to disable reporting. 
;reporting_enabled = true # Set to false to disable all checks to https://grafana.net # for new versions (grafana itself and plugins), check is used # in some UI views to notify that grafana or plugin update exists # This option does not cause any auto updates, nor send any information # only a GET request to http://grafana.com to get latest versions ;check_for_updates = true # Google Analytics universal tracking code, only enabled if you specify an id here ;google_analytics_ua_id = # Google Tag Manager ID, only enabled if you specify an id here ;google_tag_manager_id = #################################### Security #################################### [security] # disable creation of admin user on first start of grafana ;disable_initial_admin_creation = false # default admin user, created on startup ;admin_user = admin # default admin password, can be changed before first start of grafana, or in profile settings ;admin_password = admin # used for signing ;secret_key = SW2YcwTIb9zpOOhoPsMm # disable gravatar profile images ;disable_gravatar = false # data source proxy whitelist (ip_or_domain:port separated by spaces) ;data_source_proxy_whitelist = # disable protection against brute force login attempts ;disable_brute_force_login_protection = false # set to true if you host Grafana behind HTTPS. default is false. ;cookie_secure = false # set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" ;cookie_samesite = lax # set to true if you want to allow browsers to render Grafana in a <frame>, <iframe>, <embed> or <object>. default is false. ;allow_embedding = false # Set to true if you want to enable http strict transport security (HSTS) response header. # This is only sent when HTTPS is enabled in this configuration. # HSTS tells browsers that the site should only be accessed using HTTPS. ;strict_transport_security = false # Sets how long a browser should cache HSTS. Only applied if strict_transport_security is enabled. 
;strict_transport_security_max_age_seconds = 86400 # Set to true if to enable HSTS preloading option. Only applied if strict_transport_security is enabled. ;strict_transport_security_preload = false # Set to true if to enable the HSTS includeSubDomains option. Only applied if strict_transport_security is enabled. ;strict_transport_security_subdomains = false # Set to true to enable the X-Content-Type-Options response header. # The X-Content-Type-Options response HTTP header is a marker used by the server to indicate that the MIME types advertised # in the Content-Type headers should not be changed and be followed. ;x_content_type_options = true # Set to true to enable the X-XSS-Protection header, which tells browsers to stop pages from loading # when they detect reflected cross-site scripting (XSS) attacks. ;x_xss_protection = true #################################### Snapshots ########################### [snapshots] # snapshot sharing options ;external_enabled = true ;external_snapshot_url = https://snapshots-origin.raintank.io ;external_snapshot_name = Publish to snapshot.raintank.io # Set to true to enable this Grafana instance act as an external snapshot server and allow unauthenticated requests for # creating and deleting snapshots. ;public_mode = false # remove expired snapshot ;snapshot_remove_expired = true #################################### Dashboards History ################## [dashboards] # Number dashboard versions to keep (per dashboard). Default: 20, Minimum: 1 ;versions_to_keep = 20 # Minimum dashboard refresh interval. When set, this will restrict users to set the refresh interval of a dashboard lower than given interval. Per default this is 5 seconds. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. ;min_refresh_interval = 5s # Path to the default home dashboard. 
If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json" ;default_home_dashboard_path = #################################### Users ############################### [users] # disable user signup / registration ;allow_sign_up = true # Allow non admin users to create organizations ;allow_org_create = true # Set to true to automatically assign new users to the default organization (id 1) ;auto_assign_org = true # Set this value to automatically add new users to the provided organization (if auto_assign_org above is set to true) ;auto_assign_org_id = 1 # Default role new users will be automatically assigned (if disabled above is set to true) ;auto_assign_org_role = Viewer # Require email validation before sign up completes ;verify_email_enabled = false # Background text for the user field on the login page ;login_hint = email or username ;password_hint = password # Default UI theme ("dark" or "light") ;default_theme = dark # External user management, these options affect the organization users view ;external_manage_link_url = ;external_manage_link_name = ;external_manage_info = # Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard. ;viewers_can_edit = false # Editors can administrate dashboard, folders and teams they create ;editors_can_admin = false # The duration in time a user invitation remains valid before expiring. This setting should be expressed as a duration. Examples: 6h (hours), 2d (days), 1w (week). Default is 24h (24 hours). The minimum supported duration is 15m (15 minutes). ;user_invite_max_lifetime_duration = 24h [auth] # Login cookie name ;login_cookie_name = grafana_session # The maximum lifetime (duration) an authenticated user can be inactive before being required to login at next visit. Default is 7 days (7d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). The lifetime resets at each successful token rotation. 
;login_maximum_inactive_lifetime_duration = # The maximum lifetime (duration) an authenticated user can be logged in since login time before being required to login. Default is 30 days (30d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). ;login_maximum_lifetime_duration = # How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes. ;token_rotation_interval_minutes = 10 # Set to true to disable (hide) the login form, useful if you use OAuth, defaults to false ;disable_login_form = false # Set to true to disable the signout link in the side menu. useful if you use auth.proxy, defaults to false ;disable_signout_menu = false # URL to redirect the user to after sign out ;signout_redirect_url = # Set to true to attempt login with OAuth automatically, skipping the login screen. # This setting is ignored if multiple OAuth providers are configured. ;oauth_auto_login = false # OAuth state max age cookie duration in seconds. Defaults to 600 seconds. ;oauth_state_cookie_max_age = 600 # limit of api_key seconds to live before expiration ;api_key_max_seconds_to_live = -1 # Set to true to enable SigV4 authentication option for HTTP-based datasources. ;sigv4_auth_enabled = false #################################### Anonymous Auth ###################### [auth.anonymous] # enable anonymous access ;enabled = false # specify organization name that should be used for unauthenticated users ;org_name = Main Org. 
# specify role for unauthenticated users ;org_role = Viewer # mask the Grafana version number for unauthenticated users ;hide_version = false #################################### GitHub Auth ########################## [auth.github] ;enabled = false ;allow_sign_up = true ;client_id = some_id ;client_secret = some_secret ;scopes = user:email,read:org ;auth_url = https://github.com/login/oauth/authorize ;token_url = https://github.com/login/oauth/access_token ;api_url = https://api.github.com/user ;allowed_domains = ;team_ids = ;allowed_organizations = #################################### GitLab Auth ######################### [auth.gitlab] ;enabled = false ;allow_sign_up = true ;client_id = some_id ;client_secret = some_secret ;scopes = api ;auth_url = https://gitlab.com/oauth/authorize ;token_url = https://gitlab.com/oauth/token ;api_url = https://gitlab.com/api/v4 ;allowed_domains = ;allowed_groups = #################################### Google Auth ########################## [auth.google] ;enabled = false ;allow_sign_up = true ;client_id = some_client_id ;client_secret = some_client_secret ;scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email ;auth_url = https://accounts.google.com/o/oauth2/auth ;token_url = https://accounts.google.com/o/oauth2/token ;api_url = https://www.googleapis.com/oauth2/v1/userinfo ;allowed_domains = ;hosted_domain = #################################### Grafana.com Auth #################### [auth.grafana_com] ;enabled = false ;allow_sign_up = true ;client_id = some_id ;client_secret = some_secret ;scopes = user:email ;allowed_organizations = #################################### Azure AD OAuth ####################### [auth.azuread] ;name = Azure AD ;enabled = false ;allow_sign_up = true ;client_id = some_client_id ;client_secret = some_client_secret ;scopes = openid email profile ;auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize ;token_url = 
https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token ;allowed_domains = ;allowed_groups = #################################### Okta OAuth ####################### [auth.okta] ;name = Okta ;enabled = false ;allow_sign_up = true ;client_id = some_id ;client_secret = some_secret ;scopes = openid profile email groups ;auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize ;token_url = https://<tenant-id>.okta.com/oauth2/v1/token ;api_url = https://<tenant-id>.okta.com/oauth2/v1/userinfo ;allowed_domains = ;allowed_groups = ;role_attribute_path = #################################### Generic OAuth ########################## [auth.generic_oauth] ;enabled = false ;name = OAuth ;allow_sign_up = true ;client_id = some_id ;client_secret = some_secret ;scopes = user:email,read:org ;email_attribute_name = email:primary ;email_attribute_path = ;login_attribute_path = ;id_token_attribute_name = ;auth_url = https://foo.bar/login/oauth/authorize ;token_url = https://foo.bar/login/oauth/access_token ;api_url = https://foo.bar/user ;allowed_domains = ;team_ids = ;allowed_organizations = ;role_attribute_path = ;tls_skip_verify_insecure = false ;tls_client_cert = ;tls_client_key = ;tls_client_ca = #################################### Basic Auth ########################## [auth.basic] ;enabled = true #################################### Auth Proxy ########################## [auth.proxy] ;enabled = false ;header_name = X-WEBAUTH-USER ;header_property = username ;auto_sign_up = true ;sync_ttl = 60 ;whitelist = 192.168.1.1, 192.168.2.1 ;headers = Email:X-User-Email, Name:X-User-Name # Read the auth proxy docs for details on what the setting below enables ;enable_login_token = false #################################### Auth LDAP ########################## [auth.ldap] ;enabled = false ;config_file = /etc/grafana/ldap.toml ;allow_sign_up = true # LDAP backround sync (Enterprise only) # At 1 am every day ;sync_cron = "0 0 1 * * *" ;active_sync_enabled = true 
#################################### SMTP / Emailing ########################## [smtp] ;enabled = false ;host = localhost:25 ;user = # If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" ;password = ;cert_file = ;key_file = ;skip_verify = false ;from_address = admin@grafana.localhost ;from_name = Grafana # EHLO identity in SMTP dialog (defaults to instance_name) ;ehlo_identity = dashboard.example.com # SMTP startTLS policy (defaults to 'OpportunisticStartTLS') ;startTLS_policy = NoStartTLS [emails] ;welcome_email_on_sign_up = false ;templates_pattern = emails/*.html #################################### Logging ########################## [log] # Either "console", "file", "syslog". Default is console and file # Use space to separate multiple modes, e.g. "console file" ;mode = console file # Either "debug", "info", "warn", "error", "critical", default is "info" ;level = info # optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug ;filters = # For "console" mode only [log.console] ;level = # log line format, valid options are text, console and json ;format = console # For "file" mode only [log.file] ;level = # log line format, valid options are text, console and json ;format = text # This enables automated log rotate(switch of following options), default is true ;log_rotate = true # Max line number of single file, default is 1000000 ;max_lines = 1000000 # Max size shift of single file, default is 28 means 1 << 28, 256MB ;max_size_shift = 28 # Segment log daily, default is true ;daily_rotate = true # Expired days of log file(delete after max days), default is 7 ;max_days = 7 [log.syslog] ;level = # log line format, valid options are text, console and json ;format = text # Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used. ;network = ;address = # Syslog facility. user, daemon and local0 through local7 are valid. 
;facility = # Syslog tag. By default, the process' argv[0] is used. ;tag = #################################### Usage Quotas ######################## [quota] ; enabled = false #### set quotas to -1 to make unlimited. #### # limit number of users per Org. ; org_user = 10 # limit number of dashboards per Org. ; org_dashboard = 100 # limit number of data_sources per Org. ; org_data_source = 10 # limit number of api_keys per Org. ; org_api_key = 10 # limit number of orgs a user can create. ; user_org = 10 # Global limit of users. ; global_user = -1 # global limit of orgs. ; global_org = -1 # global limit of dashboards ; global_dashboard = -1 # global limit of api_keys ; global_api_key = -1 # global limit on number of logged in users. ; global_session = -1 #################################### Alerting ############################ [alerting] # Disable alerting engine & UI features ;enabled = true # Makes it possible to turn off alert rule execution but alerting UI is visible ;execute_alerts = true # Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state) ;error_or_timeout = alerting # Default setting for how Grafana handles nodata or null values in alerting. (alerting, no_data, keep_state, ok) ;nodata_or_nullvalues = no_data # Alert notifications can include images, but rendering many images at the same time can overload the server # This limit will protect the server from render overloading and make sure notifications are sent out quickly ;concurrent_render_limit = 5 # Default setting for alert calculation timeout. Default value is 30 ;evaluation_timeout_seconds = 30 # Default setting for alert notification timeout. Default value is 30 ;notification_timeout_seconds = 30 # Default setting for max attempts to sending alert notifications. 
Default value is 3 ;max_attempts = 3 # Makes it possible to enforce a minimal interval between evaluations, to reduce load on the backend ;min_interval_seconds = 1 # Configures for how long alert annotations are stored. Default is 0, which keeps them forever. # This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). ;max_annotation_age = # Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations. ;max_annotations_to_keep = #################################### Annotations ######################### [annotations.dashboard] # Dashboard annotations means that annotations are associated with the dashboard they are created on. # Configures how long dashboard annotations are stored. Default is 0, which keeps them forever. # This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). ;max_age = # Configures max number of dashboard annotations that Grafana stores. Default value is 0, which keeps all dashboard annotations. ;max_annotations_to_keep = [annotations.api] # API annotations means that the annotations have been created using the API without any # association with a dashboard. # Configures how long Grafana stores API annotations. Default is 0, which keeps them forever. # This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month). ;max_age = # Configures max number of API annotations that Grafana keeps. Default value is 0, which keeps all API annotations. 
;max_annotations_to_keep = #################################### Explore ############################# [explore] # Enable the Explore section ;enabled = true #################################### Internal Grafana Metrics ########################## # Metrics available at HTTP API Url /metrics [metrics] # Disable / Enable internal metrics ;enabled = true # Graphite Publish interval ;interval_seconds = 10 # Disable total stats (stat_totals_*) metrics to be generated ;disable_total_stats = false #If both are set, basic auth will be required for the metrics endpoint. ; basic_auth_username = ; basic_auth_password = # Metrics environment info adds dimensions to the `grafana_environment_info` metric, which # can expose more information about the Grafana instance. [metrics.environment_info] #exampleLabel1 = exampleValue1 #exampleLabel2 = exampleValue2 # Send internal metrics to Graphite [metrics.graphite] # Enable by setting the address setting (ex localhost:2003) ;address = ;prefix = prod.grafana.%(instance_name)s. #################################### Grafana.com integration ########################## # Url used to import dashboards directly from Grafana.com [grafana_com] ;url = https://grafana.com #################################### Distributed tracing ############ [tracing.jaeger] # Enable by setting the address sending traces to jaeger (ex localhost:6831) ;address = localhost:6831 # Tag that will always be included in when creating new spans. 
ex (tag1:value1,tag2:value2) ;always_included_tag = tag1:value1 # Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote ;sampler_type = const # jaeger samplerconfig param # for "const" sampler, 0 or 1 for always false/true respectively # for "probabilistic" sampler, a probability between 0 and 1 # for "rateLimiting" sampler, the number of spans per second # for "remote" sampler, param is the same as for "probabilistic" # and indicates the initial sampling rate before the actual one # is received from the mothership ;sampler_param = 1 # sampling_server_url is the URL of a sampling manager providing a sampling strategy. ;sampling_server_url = # Whether or not to use Zipkin propagation (x-b3- HTTP headers). ;zipkin_propagation = false # Setting this to true disables shared RPC spans. # Not disabling is the most common setting when using Zipkin elsewhere in your infrastructure. ;disable_shared_zipkin_spans = false #################################### External image storage ########################## [external_image_storage] # Used for uploading images to public servers so they can be included in slack/email messages. # you can choose between (s3, webdav, gcs, azure_blob, local) provider = local [external_image_storage.s3] ;endpoint = ;path_style_access = ;bucket = ;region = ;path = ;access_key = ;secret_key = [external_image_storage.webdav] ;url = ;public_url = ;username = ;password = [external_image_storage.gcs] ;key_file = ;bucket = ;path = [external_image_storage.azure_blob] ;account_name = ;account_key = ;container_name = [external_image_storage.local] # does not require any configuration [rendering] # Options to configure a remote HTTP image rendering service, e.g. using https://github.com/grafana/grafana-image-renderer. # URL to a remote HTTP image renderer service, e.g. http://localhost:8081/render, will enable Grafana to render panels and dashboards to PNG-images using HTTP requests to an external service. 
server_url = http://122.xxx.xxx.220:8081/render/ # If the remote HTTP image renderer service runs on a different server than the Grafana server you may have to configure this to a URL where Grafana is reachable, e.g. http://grafana.domain/. callback_url = http://aa.midust.com/grafana/ # Concurrent render request limit affects when the /render HTTP endpoint is used. Rendering many images at the same time can overload the server, # which this setting can help protect against by only allowing a certain amount of concurrent requests. ;concurrent_render_request_limit = 30 [panels] # If set to true Grafana will allow script tags in text panels. Not recommended as it enable XSS vulnerabilities. ;disable_sanitize_html = false [plugins] ;enable_alpha = false ;app_tls_skip_verify_insecure = false # Enter a comma-separated list of plugin identifiers to identify plugins that are allowed to be loaded even if they lack a valid signature. allow_loading_unsigned_plugins = aliyun_cms_grafana_datasource,aliyun-log-service-datasource,grafana-log-service-datasource ;marketplace_url = https://grafana.com/grafana/plugins/ #################################### Grafana Image Renderer Plugin ########################## [plugin.grafana-image-renderer] # Instruct headless browser instance to use a default timezone when not provided by Grafana, e.g. when rendering panel image of alert. # See ICU’s metaZones.txt (https://cs.chromium.org/chromium/src/third_party/icu/source/data/misc/metaZones.txt) for a list of supported # timezone IDs. Fallbacks to TZ environment variable if not set. ;rendering_timezone = # Instruct headless browser instance to use a default language when not provided by Grafana, e.g. when rendering panel image of alert. # Please refer to the HTTP header Accept-Language to understand how to format this value, e.g. 'fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5'. 
;rendering_language = # Instruct headless browser instance to use a default device scale factor when not provided by Grafana, e.g. when rendering panel image of alert. # Default is 1. Using a higher value will produce more detailed images (higher DPI), but will require more disk space to store an image. ;rendering_viewport_device_scale_factor = # Instruct headless browser instance whether to ignore HTTPS errors during navigation. Per default HTTPS errors are not ignored. Due to # the security risk it's not recommended to ignore HTTPS errors. ;rendering_ignore_https_errors = # Instruct headless browser instance whether to capture and log verbose information when rendering an image. Default is false and will # only capture and log error messages. When enabled, debug messages are captured and logged as well. # For the verbose information to be included in the Grafana server log you have to adjust the rendering log level to debug, configure # [log].filter = rendering:debug. ;rendering_verbose_logging = # Instruct headless browser instance whether to output its debug and error messages into running process of remote rendering service. # Default is false. This can be useful to enable (true) when troubleshooting. ;rendering_dumpio = # Additional arguments to pass to the headless browser instance. Default is --no-sandbox. The list of Chromium flags can be found # here (https://peter.sh/experiments/chromium-command-line-switches/). Multiple arguments is separated with comma-character. ;rendering_args = # You can configure the plugin to use a different browser binary instead of the pre-packaged version of Chromium. # Please note that this is not recommended, since you may encounter problems if the installed version of Chrome/Chromium is not # compatible with the plugin. ;rendering_chrome_bin = # Instruct how headless browser instances are created. Default is 'default' and will create a new browser instance on each request. 
# Mode 'clustered' will make sure that only a maximum of browsers/incognito pages can execute concurrently. # Mode 'reusable' will have one browser instance and will create a new incognito page on each request. ;rendering_mode = # When rendering_mode = clustered you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser' # and will cluster using browser instances. # Mode 'context' will cluster using incognito pages. ;rendering_clustering_mode = # When rendering_mode = clustered you can define maximum number of browser instances/incognito pages that can execute concurrently.. ;rendering_clustering_max_concurrency = # Limit the maximum viewport width, height and device scale factor that can be requested. ;rendering_viewport_max_width = ;rendering_viewport_max_height = ;rendering_viewport_max_device_scale_factor = # Change the listening host and port of the gRPC server. Default host is 127.0.0.1 and default port is 0 and will automatically assign # a port not in use. ;grpc_host = ;grpc_port = [enterprise] # Path to a valid Grafana Enterprise license.jwt file ;license_path = [feature_toggles] # enable features, separated by spaces ;enable = [date_formats] # For information on what formatting patterns that are supported https://momentjs.com/docs/#/displaying/ # Default system date format used in time range picker and other places where full time is displayed ;full_date = YYYY-MM-DD HH:mm:ss # Used by graph and other places where we only show small intervals ;interval_second = HH:mm:ss ;interval_minute = HH:mm ;interval_hour = MM/DD HH:mm ;interval_day = MM/DD ;interval_month = YYYY-MM ;interval_year = YYYY # Experimental feature ;use_browser_locale = false # Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc. ;default_timezone = browser
nginx配置文件
[root@kibana ~]# cat /usr/local/nginx/conf.d/vhost/grafana.conf
server {
    listen 80;
    server_name aa.midust.com;
    access_log /var/log/nginx/aa.midust.com.access.log;
    error_log /var/log/nginx/aa.midust.com.error.log;
    # Load configuration files for the default server block.
    #include /etc/nginx/default.d/*.conf;
    location /grafana/ {
        root html;
        index index.html index.htm;
        proxy_pass http://122.xxx.xxx.220:3000/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header x-forwarded-for $proxy_add_x_forwarded_for;
        proxy_redirect default;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
    }
    error_page 404 /404.html;
    location = /40x.html {
    }
    error_page 500 502 503 504 /50x.html;
    location = /50x.html {
    }
}
web访问
http://aa.midust.com/grafana/login
问题:
1、docker启动 cAdvisor报错
Could not configure a source for OOM detection, disabling OOM events: open /dev/kmsg: no such file or directory
Failed to start container manager: inotify_add_watch /sys/fs/cgroup/cpuacct,cpu: no such file or directory
解决:
mount -o remount,rw '/sys/fs/cgroup'
ln -s /sys/fs/cgroup/cpu,cpuacct /sys/fs/cgroup/cpuacct,cpu
docker restart cadvisor
2、blackbox exporter模板报错
Panel plugin not found: grafana-piechart-panel
解决:
grafana-cli plugins install grafana-piechart-panel
(grafana 运行在 docker 中时,需进入 grafana 容器内执行该命令;安装完成后重启 grafana,插件才会生效)
3、If you're seeing this Grafana has failed to load its application files
解决:
在 grafana.ini 的 [server] 段中开启 serve_from_sub_path = true(去掉行首的分号),然后重启 grafana 使配置生效