监控平台prometheus实施步骤简单记录

---平台暂定为单节点,如有需要,可扩展为高可用集群

  1. 环境准备

    cat /etc/redhat-release 
    	CentOS Linux release 7.6.1810 (Core)
    	
    cat >> /etc/security/limits.conf <<EOF #增大文件描述符
    	root soft nofile 65535
    	root hard nofile 65535
    	* soft nproc 65535
    	* hard nproc 65535
    	* soft nofile 65535
    	* hard nofile 65535
    	EOF
    	echo "ulimit -SHn 65535" >> /etc/rc.local
    	ulimit -SHn 65535
    curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
    
    curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
    mkdir /soft 
    mkdir /application
    

    2.安装步骤

    1.安装prometheus server
    #进入软件目录
    cd /application
    
    yum install git -y
    
    git clone https://github.com/prometheus/prometheus.git
    
    ln -s prometheus-2.18.0-rc.0.linux-amd64 prometheus
    
    cd prometheus
    
    make build
    ./prometheus --config.file=your_config.yml
    
    #配置启动文件
    vi /etc/systemd/system/prometheus.service
    [Unit]
    Description=Prometheus Monitoring System
    Documentation=Prometheus Monitoring System
    
    [Service]
    ExecStart=/application/prometheus/prometheus \
      --config.file=/application/prometheus/prometheus.yml \
      --web.listen-address=:9090 \
      --web.enable-lifecycle \
      --storage.tsdb.retention=30d \   
      --web.read-timeout=5m \      
      --web.max-connections=512 \ 
      --web.external-url=::9090 \ 
      --web.route-prefix=/application/prometheus \ 
      --web.user-assets=/application/prometheus \  
      --web.enable-lifecycle  \ 
      --web.enable-admin-api     
    
    [Install]
    WantedBy=multi-user.target
    
    systemctl daemon-reload
    
    netstat -ltnp|grep 9090
    
    2.安装mysqld_exporter
    前提:创建用户
    CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'XXXXXXXX' WITH MAX_USER_CONNECTIONS 3;
    GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';
    
    下载安装文件
    wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz
    
    cd /application
    
    tar xf mysqld_exporter-0.12.1.linux-amd64.tar.gz
    
    ln -s mysqld_exporter-0.12.1.linux-amd64 mysqld_exporter
    
    配置mysql用户密码
    
    vim .my.cnf
    [client]
    user=xxxx
    password=xxxx
    
    启动服务
    nohup ./mysqld_exporter --collect.auto_increment.columns --no-collect.auto_increment.columns --config.my-cnf=.my.cnf &
    
    安装grafana
    sudo nano /etc/yum.repos.d/grafana.repo
    
    vim /etc/yum.repos.d/grafana.repo
    [grafana]
    name=grafana
    baseurl=https://packages.grafana.com/enterprise/rpm
    repo_gpgcheck=1
    enabled=1
    gpgcheck=1
    gpgkey=https://packages.grafana.com/gpg.key
    sslverify=1
    sslcacert=/etc/pki/tls/certs/ca-bundle.crt
    #安装
    yum install grafana-enterprise -y
    
    systemctl start grafana-server
    
    netstat -ltnp|grep 3000
    
    #安装alertmanager
    wget https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
    
    cd /application
    
    tar xf alertmanager-0.20.0.linux-amd64.tar.gz
    
    ln -s alertmanager-0.20.0.linux-amd64 alertmanager #稍后启动
    
    nohup ./alertmanager --config.file=alertmanager.yml &
    
    

3.配置文件

#主配置文件
[root@prometheus prometheus]# cat prometheus.yml
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 127.0.0.1:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
   - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['10.0.0.15:9090']
  - job_name: 'zabbix-server'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['10.0.0.201:9100']
  - job_name: 'zabbix-server-mysql'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['10.0.0.201:9104']
  - job_name: 'test-mysql'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['10.0.0.15:9104']

#规则配置文件
groups:
- name: MySQLStatsAlert
  rules:
  - alert: MySQL is down
    expr: mysql_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} MySQL is down"
      description: "MySQL database is down. This requires immediate action!"
  - alert: open files high
    expr: mysql_global_status_innodb_num_open_files > (mysql_global_variables_open_files_limit) * 0.75
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} open files high"
      description: "Open files is high. Please consider increasing open_files_limit."
  - alert: Read buffer size is bigger than max. allowed packet size
    expr: mysql_global_variables_read_buffer_size > mysql_global_variables_slave_max_allowed_packet 
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Read buffer size is bigger than max. allowed packet size"
      description: "Read buffer size (read_buffer_size) is bigger than max. allowed packet size (max_allowed_packet).This can break your replication."
  - alert: Sort buffer possibly missconfigured
    expr: mysql_global_variables_innodb_sort_buffer_size <256*1024 or mysql_global_variables_read_buffer_size > 4*1024*1024 
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Sort buffer possibly missconfigured"
      description: "Sort buffer size is either too big or too small. A good value for sort_buffer_size is between 256k and 4M."
  - alert: Thread stack size is too small
    expr: mysql_global_variables_thread_stack <196608
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Thread stack size is too small"
      description: "Thread stack size is too small. This can cause problems when you use Stored Language constructs for example. A typical is 256k for thread_stack_size."
  - alert: Used more than 80% of max connections limited 
    expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections * 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Used more than 80% of max connections limited"
      description: "Used more than 80% of max connections limited"
  - alert: InnoDB Force Recovery is enabled
    expr: mysql_global_variables_innodb_force_recovery != 0 
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Force Recovery is enabled"
      description: "InnoDB Force Recovery is enabled. This mode should be used for data recovery purposes only. It prohibits writing to the data."
  - alert: InnoDB Log File size is too small
    expr: mysql_global_variables_innodb_log_file_size < 16777216 
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Log File size is too small"
      description: "The InnoDB Log File size is possibly too small. Choosing a small InnoDB Log File size can have significant performance impacts."
  - alert: InnoDB Flush Log at Transaction Commit
    expr: mysql_global_variables_innodb_flush_log_at_trx_commit != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Flush Log at Transaction Commit"
      description: "InnoDB Flush Log at Transaction Commit is set to a values != 1. This can lead to a loss of commited transactions in case of a power failure."
  - alert: Table definition cache too small
    expr: mysql_global_status_open_table_definitions > mysql_global_variables_table_definition_cache
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Table definition cache too small"
      description: "Your Table Definition Cache is possibly too small. If it is much too small this can have significant performance impacts!"
  - alert: Table open cache too small
    expr: mysql_global_status_open_tables >mysql_global_variables_table_open_cache * 99/100
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Table open cache too small"
      description: "Your Table Open Cache is possibly too small (old name Table Cache). If it is much too small this can have significant performance impacts!"
  - alert: Thread stack size is possibly too small
    expr: mysql_global_variables_thread_stack < 262144
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Thread stack size is possibly too small"
      description: "Thread stack size is possibly too small. This can cause problems when you use Stored Language constructs for example. A typical is 256k for thread_stack_size."
  - alert: InnoDB Buffer Pool Instances is too small
    expr: mysql_global_variables_innodb_buffer_pool_instances == 1
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Buffer Pool Instances is too small"
      description: "If you are using MySQL 5.5 and higher you should use several InnoDB Buffer Pool Instances for performance reasons. Some rules are: InnoDB Buffer Pool Instance should be at least 1 Gbyte in size. InnoDB Buffer Pool Instances you can set equal to the number of cores of your machine."
  - alert: InnoDB Plugin is enabled
    expr: mysql_global_variables_ignore_builtin_innodb == 1
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Plugin is enabled"
      description: "InnoDB Plugin is enabled"
  - alert: Binary Log is disabled
    expr: mysql_global_variables_log_bin != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Binary Log is disabled"
      description: "Binary Log is disabled. This prohibits you to do Point in Time Recovery (PiTR)."
  - alert: Binlog Cache size too small
    expr: mysql_global_variables_binlog_cache_size < 1048576
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Binlog Cache size too small"
      description: "Binlog Cache size is possibly to small. A value of 1 Mbyte or higher is OK."
  - alert: Binlog Statement Cache size too small
    expr: mysql_global_variables_binlog_stmt_cache_size <1048576 and mysql_global_variables_binlog_stmt_cache_size > 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Binlog Statement Cache size too small"
      description: "Binlog Statement Cache size is possibly to small. A value of 1 Mbyte or higher is typically OK."
  - alert: Binlog Transaction Cache size too small
    expr: mysql_global_variables_binlog_cache_size  <1048576
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Binlog Transaction Cache size too small"
      description: "Binlog Transaction Cache size is possibly to small. A value of 1 Mbyte or higher is typically OK."
  - alert: Sync Binlog is enabled
    expr: mysql_global_variables_sync_binlog == 1
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Sync Binlog is enabled"
      description: "Sync Binlog is enabled. This leads to higher data security but on the cost of write performance."
  - alert: IO thread stopped
    expr: mysql_slave_status_slave_io_running != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} IO thread stopped"
      description: "IO thread has stopped. This is usually because it cannot connect to the Master any more."
  - alert: SQL thread stopped 
    expr: mysql_slave_status_slave_sql_running == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} SQL thread stopped"
      description: "SQL thread has stopped. This is usually because it cannot apply a SQL statement received from the master."
  - alert: SQL thread stopped
    expr: mysql_slave_status_slave_sql_running != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} SQL thread stopped"
      description: "SQL thread has stopped. This is usually because it cannot apply a SQL statement received from the master."
  - alert: Slave lagging behind Master
    expr: rate(mysql_slave_status_seconds_behind_master[1m]) >30 
    for: 1m
    labels:
      severity: warning 
    annotations:
      summary: "Instance {{ $labels.instance }} Slave lagging behind Master"
      description: "Slave is lagging behind Master. Please check if Slave threads are running and if there are some performance issues!"
  - alert: Slave is NOT read only(Please ignore this warning indicator.)
    expr: mysql_global_variables_read_only != 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} Slave is NOT read only"
- name: example
  rules:

  # Alert for any instance that is unreachable for >1 minute.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- name: mysql
  rules:

  # Alert immediately when the replication IO thread on an instance stops.
  - alert: 主从挂了
    expr: mysql_slave_status_slave_io_running == 0
    for: 0m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} 主从"
      description: "{{ $labels.instance }} of job {{ $labels.job }} 主从挂了."

#alertmanager报警配置
global:
  resolve_timeout: 1m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: 'xxxxxxx@qq.com'
  smtp_auth_username: 'xxxxxxx@qq.com'
  smtp_auth_password: 'xxxxxxx'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  email_configs:
  - to: 'xxxxxxx@126.com'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
  4. grafana配置监控项

  5. 备用配合方案 zabbix

    1.配置初始环境
    增大文件描述符 (三台都操作)
    	cat >> /etc/security/limits.conf <<EOF
    	root soft nofile 65535
    	root hard nofile 65535
    	* soft nproc 65535
    	* hard nproc 65535
    	* soft nofile 65535
    	* hard nofile 65535
    	EOF
    	echo "ulimit -SHn 65535" >> /etc/rc.local
    	ulimit -SHn 65535
    2.配置yum环境
    curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
    curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
    rpm -Uvh https://repo.zabbix.com/zabbix/4.0/rhel/7/x86_64/zabbix-release-4.0-2.el7.noarch.rpm
    #这边要进行一个换源,要不然会被墙(最近才被墙)
    复制zabbix.repo 到其他服务器
    
    3.安装zabbix-server #只在服务端安装
    yum install zabbix-server-mysql zabbix-web-mysql zabbix-agent httpd  zabbix-get -y   #客户端只安装agent
    yum install zabbix-agent -y
    
    4.安装mysql数据库
    yum install mariadb-server -y
    systemctl start mariadb
    systemctl enable mariadb
    
    5.导入表结构
    mysql_secure_installation
    mysql -e "create database zabbix character set utf8 collate utf8_bin;"
    mysql -e "grant all privileges on zabbix.* to zabbix@localhost identified by '1qaz@WSX';"
    zcat /usr/share/doc/zabbix-server-mysql*/create.sql.gz | mysql  zabbix
    
    6.修改配置文件
    vi /etc/zabbix/zabbix_server.conf 
    DBHost=localhost 
    DBName=zabbix
    DBUser=zabbix
    DBPassword=1qaz@WSX
    
    
    vi /etc/httpd/conf.d/zabbix.conf
    php_value date.timezone Asia/Shanghai
    
    7.启动
    systemctl start zabbix-server
    systemctl enable zabbix-server
    systemctl start httpd
    systemctl enable httpd
    
    8.进入zabbix界面
    配置数据库用户密码
    账号 Admin 密码 zabbix
    
    9.导入模板
    。。。
    
    10.修改模板
    cp userparameter_percona_mysql.conf /etc/zabbix/zabbix_agentd.d/
    vim ss_get_mysql_stats.php #修改账号密码
    vim get_mysql_stats_wrapper.sh #修改账号密码  主从同步的
    
    11.解决图形字符乱码问题
    将simkai.ttf 拷贝到/usr/share/fonts/dejavu/ 下
    重新创建软链接
    ln -s /usr/share/fonts/dejavu/simkai.ttf zabbix-web-font
    
    12.安装测试用数据库(agent)
    略
    
    
    
    1.data目录755授权,zabbix授权 2>/dev/null
    
    #process select super
    #SELECT, PROCESS, SUPER
    replication slave, replication client
    

    6.zabbix客户端脚本配合

    #!/bin/bash
    # Zabbix agent bootstrap script: sets up the yum repo, installs
    # zabbix-agent, and installs the Percona MySQL monitoring templates from
    # files shipped next to this script.
    #create by dhc

    # Directory containing this script; the bundled files are read from here.
    # Quoted so a script path containing spaces is handled correctly.
    DIR=$(dirname "$0")
    # Agent configuration file that the zabbix_conf step replaces.
    zabbix_conf=/etc/zabbix/zabbix_agentd.conf
    # NOTE(review): presumably supplies the `action` helper used for status
    # messages; it is not defined anywhere in this script.
    source /etc/init.d/functions
    # Directory where yum repository definitions are installed.
    yum_dir=/etc/yum.repos.d
    # File name of the bundled Percona templates package.
    percona_name=percona-zabbix-templates-1.1.8-1.noarch.rpm
    # Locations of the Percona scripts and the userparameter template.
    scripts_dir=/var/lib/zabbix/percona/scripts
    template_file=/var/lib/zabbix/percona/templates/userparameter_percona_mysql.conf
    # Directory for zabbix-agent userparameter files.
    zabbix_keydir=/etc/zabbix/zabbix_agentd.d/
    # Host pinged before running yum install.
    yum_ip=10.0.0.50
    # Install the bundled zabbix.repo into the yum repo directory, then clear
    # and rebuild the yum metadata cache.
    # Globals: DIR (read), yum_dir (read)
    yum_repo() {
      # The public mirror repos below are left disabled.
      #curl -o ${yum_dir}/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
      #curl -o ${yum_dir}/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
      # Quoted so a script directory containing spaces stays one argument.
      cp "$DIR/zabbix.repo" "${yum_dir}/"
      yum clean all
      yum makecache
    }
    # Install zabbix-agent and the PHP packages, but only after the yum server
    # answers a ping.
    # Globals: yum_ip (read)
    # Exits the script with status 1 when the server does not answer or the
    # yum install fails; the status no longer depends on what `action` returns.
    yum_install() {
      # One ping with a 1-second deadline; test the command itself, not $?.
      if ! ping -w1 -c1 "$yum_ip" &>/dev/null; then
        action "连接不到$yum_ip....     " /bin/false
        exit 1
      fi
      if ! yum install zabbix-agent php php-mysql -y; then
        action "安装失败,请手动执行 yum install zabbix-agent php php-mysql -y     " /bin/false
        exit 1
      fi
    }
    # Replace the installed agent configuration with the bundled one and set
    # its Hostname line to this machine's hostname.
    # Globals: zabbix_conf (read), DIR (read)
    # Exits the script with status 1 when the agent config file does not exist.
    zabbix_conf() {
      if [[ ! -e "$zabbix_conf" ]]; then
        action "zabbix_agent 安装失败,请检查  " /bin/false
        exit 1
      fi
      # The original `2&>/dev/null` was parsed as an extra operand "2" plus
      # `&>/dev/null`, so a file named 2 in the current directory was removed too.
      rm -f -- "$zabbix_conf" 2>/dev/null
      cp "$DIR/zabbix_agentd.conf" /etc/zabbix/
      # Replace the whole Hostname line with this host's name.
      sed -i "/^Hostname/c Hostname=$(hostname)" "$zabbix_conf"
    }
    
    # Start the zabbix-agent service through systemctl.
    zabbix_start() {
      local unit=zabbix-agent
      systemctl start "$unit"
    }
    # Restart the zabbix-agent service through systemctl.
    zabbix_restart() {
      local unit=zabbix-agent
      systemctl restart "$unit"
    }
    # Install the bundled Percona templates rpm, then overlay the bundled
    # userparameter file and MySQL stats wrapper script.
    # Globals: DIR, percona_name, template_file, zabbix_keydir, scripts_dir (read)
    # Exits the script with status 1 when the rpm is missing from DIR or the
    # template file is absent after the rpm install.
    percona_moni() {
      # Look for the rpm next to the script, where it is installed from; the
      # original checked the current working directory instead.
      local rpm_path="$DIR/$percona_name"
      if [[ ! -e "$rpm_path" ]]; then
        action "$DIR perconna rpm包不存在  " /bin/false
        exit 1
      fi
      rpm -ivh "$rpm_path"
      # A missing template file is reported as a failed rpm install.
      if [[ ! -e "$template_file" ]]; then
        action "$DIR perconna rpm 安装失败  " /bin/false
        exit 1
      fi
      # The bundled userparameter file is used instead of the packaged template:
      #		cp $template_file $zabbix_keydir
      cp "$DIR/userparameter_percona_mysql.conf" "$zabbix_keydir"
      rm -f "$scripts_dir/get_mysql_stats_wrapper.sh" \
        && cp "$DIR/get_mysql_stats_wrapper.sh" "$scripts_dir"
      chmod +x "$scripts_dir/get_mysql_stats_wrapper.sh"
    }
    # Print the success banner and remind the operator which scripts still
    # need the MySQL connection details filled in.
    # Globals: scripts_dir (read)
    scripts_conf() {
      local rule='============================================================='
      printf '%s\n' "$rule"
      action  "install is ok " /bin/true
      printf '%s\n' "$rule"
      # Vertical spacing before the reminder.
      printf '\n \n \n \n \n  \n'
      sleep 3
      # Built with printf so the indentation of continuation lines no longer
      # leaks into the message as trailing whitespace.
      printf '%s\n %s\n %s\n' \
        "*****请在配置文件中修改数据库信息 !!!*****" \
        "$scripts_dir/ss_get_mysql_stats.php (30行)" \
        "$scripts_dir/get_mysql_stats_wrapper.sh (19行)"
    }
    # Create (or truncate) the stats file under /tmp and give it to the
    # zabbix user and group.
    chown_file() {
      local stats_file=/tmp/localhost-mysql_cacti_stats.txt
      : > "$stats_file"
      chown zabbix.zabbix "$stats_file"
    }
    # Run the Percona stats script once against localhost, then remove
    # /tmp/localhost-mysql_cacti_stats.txt.
    last() {
      local stats_script=/var/lib/zabbix/percona/scripts/ss_get_mysql_stats.php
      local stats_file=/tmp/localhost-mysql_cacti_stats.txt
      /usr/bin/php -q "$stats_script" --host localhost --items gg
      rm -rf "$stats_file"
    }
    # Run every setup step in order. Steps that hit a fatal error call exit
    # themselves, which stops the sequence.
    for step in yum_repo yum_install zabbix_conf zabbix_start percona_moni \
      scripts_conf chown_file zabbix_restart last; do
      "$step"
    done
    
    
posted @ 2020-07-07 13:56  大葱丁  阅读(195)  评论(0编辑  收藏  举报