一、open-falcon介绍
1)中文社区介绍
http://book.open-falcon.org/zh_0_2/intro/
参照文档: https://www.cnblogs.com/LAlexH/p/11161943.html
参照文档:https://www.cnblogs.com/straycats/p/7199209.html
视频链接:http://www.jikexueyuan.com/course/1651_3.html?ss=1
告警设置:https://www.cnblogs.com/python-lbl/p/10450186.html
2)falcon的优点
强大灵活的数据采集:自动发现,支持falcon-agent、snmp、支持用户主动push、用户自定义插件支持、opentsdb data model like(timestamp、endpoint、metric、key-value tags)
水平扩展能力:支持每个周期上亿次的数据采集、告警判定、历史数据存储和查询
高效率的告警策略管理:高效的portal、支持策略模板、模板继承和覆盖、多种告警方式、支持callback调用
人性化的告警设置:最大告警次数、告警级别、告警恢复通知、告警暂停、不同时段不同阈值、支持维护周期
高效率的graph组件:单机支撑200万metric的上报、归档、存储(周期为1分钟)
高效的历史数据query组件:采用rrdtool的数据归档策略,秒级返回上百个metric一年的历史数据
dashboard:多维度的数据展示,用户自定义Screen
高可用:整个系统无核心单点,易运维,易部署,可水平扩展
开发语言: 整个系统的后端,全部golang编写,portal和dashboard使用python编写
3)falcon的特性
数据采集方式多样灵活:支持agent、snmp、用户主动push、自定义插件等多种方式进行数据采集
高效率报警策略管理
人性化的告警设置
dashboard多维度数据展示
模板支持继承的同时支持覆盖策略项
server端无需做配置,只需要在client端安装agent即可自动监控
引入tag概念,通过tag多维度对数据进行查询展示
4) falcon的架构图
Open-Falcon是一个比较大的分布式系统,有十几个组件。按照功能,这十几个组件可以划分为 基础组件、作图链路组件和报警链路组件,其安装部署的架构如下图所示
二、open-falcon单机环境安装
1)安装redis
1.1)yum安装方式
wget -O /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
或 yum install epel-release
yum install redis -y systemctl start redis systemctl enable redis systemctl status redis
1.2)下载 tar包进行安装
#创建redis工作目录 mkdir /home/redis && cd /home/redis #下载redis程序包: wget http://download.redis.io/releases/redis-4.0.9.tar.gz #编译安装 tar -zxvf redis-4.0.9.tar.gz mv redis-4.0.9 redis4.0.9 && cd redis4.0.9 mkdir logs make && make install #修改配置文件 vim redis.conf bind 0.0.0.0 daemonize yes pidfile /var/run/redis_6379.pid logfile "/home/redis/redis4.0.9/logs/redis.log" #启动redis redis-server /home/redis/redis4.0.9/redis.conf #连接测试 redis-cli -h 127.0.0.1 -p 6379
2)安装mysql
wget -i -c http://dev.mysql.com/get/mysql57-community-release-el7-10.noarch.rpm yum -y install mysql57-community-release-el7-10.noarch.rpm yum -y install mysql-community-server systemctl start mysqld.service [root@node01 ~]# grep "password" /var/log/mysqld.log 2019-07-13T02:39:54.602191Z 1 [Note] A temporary password is generated for root@localhost: i?5XuEqh+aRL 第一次登陆。必须给密码 mysql -uroot -pi?5XuEqh+aRL mysql> set global validate_password_policy=0; Query OK, 0 rows affected (0.00 sec) mysql> set global validate_password_length=1; Query OK, 0 rows affected (0.00 sec) mysql> ALTER USER 'root'@'localhost' IDENTIFIED BY '123456'; Query OK, 0 rows affected (0.00 sec) systemctl stop mysqld.service systemctl start mysqld.service systemctl status mysqld.service systemctl enable mysqld.service
mysql -uroot -p123456
配套安装: yum install mysql-devel -y
2.1) 卸载mysql源。防止每次yum操作都会自动更新
yum -y remove mysql57-community-release-el7-10.noarch
2.2)不要使用root账号
GRANT ALL ON *.* TO 'falcon'@'localhost' IDENTIFIED BY 'falcon'; GRANT ALL ON *.* TO 'falcon'@'%' IDENTIFIED BY 'falcon'; flush privileges;
2.3) 授权普通用户远程连接
授权远程连接 GRANT ALL PRIVILEGES ON *.* TO 'falcon'@'%' IDENTIFIED BY 'falconpassword' WITH GRANT OPTION; flush privileges;
查看用户: SELECT DISTINCT CONCAT('User: ''',user,'''@''',host,''';') AS query FROM mysql.user;
部分版本授权不能使用 %,代表所有,可考虑使用 * 试一下
3)初始化表结构
git clone https://github.com/open-falcon/falcon-plus.git #导入表结构 cd ./falcon-plus/scripts/mysql/db_schema/ mysql -ufalcon -pfalcon < 1_uic-db-schema.sql mysql -ufalcon -pfalcon < 2_portal-db-schema.sql mysql -ufalcon -pfalcon < 3_dashboard-db-schema.sql mysql -ufalcon -pfalcon < 4_graph-db-schema.sql mysql -ufalcon -pfalcon < 5_alarms-db-schema.sql #删除目录 rm -rf falcon-plus/
4)安装golang
#下载go安装包 https://dl.google.com/go/go1.12.7.linux-amd64.tar.gz #解压至/home目录下 tar -zxvf go1.12.7.linux-amd64.tar.gz -C /home #声明PATH echo "export PATH=$PATH:/home/go/bin" >> /etc/profile source /etc/profile #查看go版本 go version
4.1)创建工作目录
export FALCON_HOME=/home export WORKSPACE=$FALCON_HOME/open-falcon mkdir -p $WORKSPACE
5)在工作目录中下载安装包
#下载安装包 wget https://github.com/open-falcon/falcon-plus/releases/download/v0.2.0/open-falcon-v0.2.0.tar.gz #解压 cd /home/open-falcon tar -zxvf open-falcon-v0.2.0.tar.gz
6)后端启动
#修改配置文件为自己设置的mysql用户和密码 grep -Ilr 3306 ./ | xargs -n1 -- sed -i 's/root:/falcon:falcon/g' #启动服务 /home/open-falcon/open-falcon start /home/open-falcon/open-falcon check #显示如下则全部启动成功 falcon-graph UP 27685 falcon-hbs UP 27697 falcon-judge UP 27707 falcon-transfer UP 27716 falcon-nodata UP 27724 falcon-aggregator UP 27732 falcon-agent UP 27743 falcon-gateway UP 27753 falcon-api UP 27761 falcon-alarm UP 28201
递归替换
grep -Ilr 3306 ./ | xargs -n1 -- sed -i 's/root:/falcon:falconpassword/g' grep -Ilr 3306 ./ | xargs -n1 -- sed -i 's/127.0.0.1/172.20.16.5/g'
三、安装前端展示界面
1)下载展示模板 dashboard
#下载dashboard项目至本地 cd $WORKSPACE git clone https://github.com/open-falcon/dashboard.git #安装所需依赖包 yum install -y python-virtualenv yum install -y python-devel yum install -y openldap-devel yum install -y mysql-devel yum groupinstall "Development tools"
2)创建依赖环境
#创建独立的虚拟环境 cd $WORKSPACE/dashboard/ virtualenv ./env #pip安装依赖 ./env/bin/pip install -r pip_requirements.txt -i https://pypi.douban.com/simple
3)在依赖环境中创建配置文件
vim rrd/config # TODO: read from api instead of db PORTAL_DB_HOST = os.environ.get("PORTAL_DB_HOST","127.0.0.1") PORTAL_DB_PORT = int(os.environ.get("PORTAL_DB_PORT",3306)) PORTAL_DB_USER = os.environ.get("PORTAL_DB_USER","falcon") PORTAL_DB_PASS = os.environ.get("PORTAL_DB_PASS","falcon") PORTAL_DB_NAME = os.environ.get("PORTAL_DB_NAME","falcon_portal") # alarm database # TODO: read from api instead of db ALARM_DB_HOST = os.environ.get("ALARM_DB_HOST","127.0.0.1") ALARM_DB_PORT = int(os.environ.get("ALARM_DB_PORT",3306)) ALARM_DB_USER = os.environ.get("ALARM_DB_USER","falcon") ALARM_DB_PASS = os.environ.get("ALARM_DB_PASS","falcon") ALARM_DB_NAME = os.environ.get("ALARM_DB_NAME","alarms")
4)启动服务查看状态
#启动
bash control start
bash control status
#查看日志
bash control tail
服务器安装完成
4.1)如果出现内部错误
[root@node01 dashboard]# cat rrd/config.py 请查看该文件连接的mysql是否正常加载了用户名和密码
四、客户端的安装
1)从服务端拷贝文件到客户端
[root@node01 open-falcon]# pwd /home/open-falcon [root@node01 open-falcon]# scp -r agent/ root@192.168.1.7:/home/open-falcon/ [root@node01 open-falcon]# scp open-falcon root@192.168.1.7:/home/open-falcon/
2)编辑配置文件
#编辑agent配置文件,修改hostname、transfer、heartbeat配置项 vim agent/config/cfg.json #启动agent,查看agent状态 ./open-falcon start agent ./open-falcon check agent tailf agent/logs/agent.log #重载配置文件 curl 127.0.0.1:1988/config/reload
稍等片刻。机器自动发现
五、查询基本使用
1)机器选择,监控指标选择
1.1)查看图像
2) Screen的功能的基本使用
归纳: 先创建demo组,再创建 相关监控的类。最后添加监控指标
再继续添加内存
3)分组功能
添加机器
4)创建模板
添加监控策略
将之前的主机组绑定模板
5)测试肯定会触发的报警值。模板里面进行修改
六、客户端的开机自启动
#!/bin/bash
# /etc/init.d/falcon-agentd
# chkconfig: 2345 20 80
# description: Starts and Stops falcon-agent
#
# SysV init script for falcon-agent. Install it as
# /etc/rc.d/init.d/falcon-agentd.
#
# Usage: falcon-agentd {start|stop|restart}
#
# start and restart both stop any running agent first, then launch a new
# one as user "envuser" from $dir.

dir=/home/envuser/falcon

#######################################
# Print the PIDs of running falcon-agent processes, one per line.
# This script is itself named falcon-agentd, so its own process line
# (and the grep helpers) must be filtered out of the match.
#######################################
agent_pids() {
  ps -ef \
    | grep falcon-agent \
    | grep -v falcon-agentd \
    | grep -v "grep" \
    | awk '{print $2}'
}

#######################################
# Stop every running falcon-agent process.
# Sends TERM first and only falls back to KILL if a process is still
# alive after ~5 seconds.
# Returns: 0 if at least one process was found, 1 if none was running.
#######################################
stop_agent() {
  local -a pids=()
  local pid attempt

  # Collect PIDs into an array: more than one process may match, and a
  # multi-value string breaks numeric tests such as [[ $pid -gt 0 ]].
  while IFS= read -r pid; do
    [[ -n "$pid" ]] && pids+=("$pid")
  done < <(agent_pids)

  if (( ${#pids[@]} == 0 )); then
    return 1
  fi

  echo "${pids[*]}"
  echo "Stopping falcon-agent ..."
  kill "${pids[@]}" 2>/dev/null

  # kill -0 succeeds while at least one of the PIDs still exists.
  for (( attempt = 0; attempt < 5; attempt++ )); do
    kill -0 "${pids[@]}" 2>/dev/null || return 0
    sleep 1
  done

  # Graceful shutdown timed out; force it.
  kill -9 "${pids[@]}" 2>/dev/null
  return 0
}

#######################################
# Launch falcon-agent in the background as user envuser.
#######################################
start_agent() {
  echo "Starting falcon-agent ..."
  su - envuser -c "cd $dir && nohup ./open-falcon start agent &"
}

case "${1:-}" in
  start)
    stop_agent
    sleep 1
    start_agent
    ;;
  stop)
    if stop_agent; then
      sleep 1
    else
      echo "Falcon-agent is stoped ..."
    fi
    ;;
  restart)
    echo "Resstarting falcon-agent ..."
    stop_agent
    sleep 1
    start_agent
    ;;
  *)
    echo "Usage: falcon-agentd {start|stop|restart}"
    exit 0
    ;;
esac

exit 0
添加至启动项
chmod +x falcon-agentd chkconfig --add falcon-agentd chkconfig falcon-agentd on
七、客户端命令验证
[envuser@nginx-mqtt0001 bin]$ ./falcon-agent --check net.if ... ok cpustat ... ok disk.io ... ok memory ... ok ss -s ... ok ss -tln ... ok kernel ... ok df.bytes ... ok loadavg ... ok netstat ... ok ps aux ... ok du -bs ... ok
八、推送监控数据
curl -X POST -d "[{\"metric\": \"test_by_test\", \"endpoint\": \"test_by_test_ep\", \"timestamp\": `date +%s`,\"step\": 60,\"value\": 1,\"counterType\": \"GAUGE\",\"tags\": \"region=test\"}]" http://127.0.0.1:1988/v1/push &> /dev/null
8.1) 官方脚本
ts=`date +%s`; curl -X POST -d "[{\"metric\": \"test-metric\", \"endpoint\": \"test-endpoint\", \"timestamp\": $ts,\"step\": 60,\"value\": 1,\"counterType\": \"GAUGE\",\"tags\": \"idc=lg,project=xx\"}]" http://127.0.0.1:1988/v1/push
官方python脚本
#!-*- coding:utf8 -*-
# Sample script: POST two GAUGE data points for endpoint "test-endpoint"
# to http://127.0.0.1:1988/v1/push and print the response body.
import json
import time

import requests

# Both data points share one timestamp taken at script start.
ts = int(time.time())

# (metric name, value) pairs; every other field is identical for both points.
samples = (
    ("test-metric", 1),
    ("test-metric2", 2),
)

payload = [
    {
        "endpoint": "test-endpoint",
        "metric": metric_name,
        "timestamp": ts,
        "step": 60,
        "value": metric_value,
        "counterType": "GAUGE",
        "tags": "idc=lg,loc=beijing",
    }
    for metric_name, metric_value in samples
]

# The push API receives the whole list as one JSON-encoded request body.
body = json.dumps(payload)
r = requests.post("http://127.0.0.1:1988/v1/push", data=body)

print(r.text)
通用python脚本改造
# -*- coding: utf-8 -*-
# Generic helper script: push a single data point to
# http://127.0.0.1:1988/v1/push.
import requests
import time
import json

# Script start time. No longer used as the default timestamp (see
# get_push_data); kept so the module still exposes the same name.
ts = int(time.time())


def get_push_data(endpoint, metric, tags, value, ts=None, counterType="GAUGE"):
    """Build one data point and POST it to the local push URL.

    Despite the name, this function performs the HTTP request itself.

    Args:
        endpoint: value for the "endpoint" field.
        metric: value for the "metric" field.
        tags: value for the "tags" field, e.g. "region=a,point=b".
        value: value for the "value" field.
        ts: Unix timestamp in seconds. When omitted (None) the current
            time is taken at call time.
        counterType: value for the "counterType" field.

    Returns:
        The response object returned by ``requests.post``.
    """
    # A default of ``int(time.time())`` in the signature is evaluated only
    # once, when the function is defined, so every later call would reuse
    # that stale timestamp. Resolve it per call instead.
    if ts is None:
        ts = int(time.time())

    data = [{
        "endpoint": endpoint,
        "metric": metric,
        "tags": tags,
        "timestamp": ts,
        "value": value,
        "step": 60,
        "counterType": counterType,
    }]
    res = requests.post("http://127.0.0.1:1988/v1/push", data=json.dumps(data))
    return res


def pull_falcon():
    """Push one sample data point and print the response body.

    The name says "pull", but the function only pushes: it sends value 1
    for metric "eniot_monitor_history_stream" on endpoint "history_stream".
    """
    endpoint = "history_stream"
    metric = "eniot_monitor_history_stream"
    tags = "region={region},point={point}".format(
        region="region",
        point="point",
    )
    value = 1
    res = get_push_data(endpoint, metric, tags, value)
    print(res.text)


if __name__ == '__main__':
    pull_falcon()
九、 es 集群监控
引用配置
[es] data_host = elk0001:9200,elk0002:9200,elk0003:9200 log_host = elk-log0001.eniot.io:9200,elk-log0002.eniot.io:9200,elk-log0003.eniot.io:9200
监控脚本
# coding: utf-8
import time
import datetime
import json
import traceback
from monitor_logger import Logger
from monitor_falcon import Falcon
from elasticsearch import Elasticsearch

log_file = u"eniot_monitor_es_status.log"


class ESstatus():
    """Check Elasticsearch cluster health and push the result via Falcon.

    For each configured cluster, two metrics are pushed under endpoint
    "eniot_monitor_es_status": the time the health check took and a 0/1
    status flag (1 only when the cluster reports "green").
    """

    def __init__(self, logger=None):
        """Use the given logger, or create one that writes to ``log_file``."""
        self.logger = logger if logger else Logger(log_file).get_logger()
        self.falcon = Falcon(self.logger)

    def get_conf(self, cf):
        """Read region and Elasticsearch host lists from the config object.

        Args:
            cf: config object exposing ``get(section, option)``
                (presumably a ConfigParser; verify against
                ``Falcon.check_conf``).

        Returns:
            A dict with keys "region", "data_host" and "log_host", or None
            when any of them is missing/empty or an error occurs (the error
            is logged).
        """
        try:
            data_info = dict()

            region = cf.get(u"region", u"region")
            if not region:
                msg = u"get region by conf error!"
                self.logger.error(msg)
                return
            data_info.update({u"region": region})

            data_host = cf.get(u"es", u"data_host")
            if not data_host:
                msg = u"get es host data by conf error!"
                self.logger.error(msg)
                return
            data_info.update({u"data_host": data_host})

            log_host = cf.get(u"es", u"log_host")
            # Validate log_host itself. This check used to re-test
            # data_host, so an empty log_host slipped through unnoticed.
            if not log_host:
                msg = u"get es log host by conf error!"
                self.logger.error(msg)
                return
            data_info.update({u"log_host": log_host})

            return data_info
        except Exception:
            self.logger.error(traceback.format_exc())

    def push_falcon(self, region, excutetime, status, clusterName):
        """Push the execution-time and status metrics for one cluster.

        Args:
            region: region name, used in the tags.
            excutetime: duration of the health check; sent as a float.
            status: 1 when the cluster is green, otherwise 0.
            clusterName: cluster name, used in the tags.
        """
        try:
            endpoint = "eniot_monitor_es_status"
            tags = "region={region},clusterName={clusterName}".format(
                region=region,
                clusterName=clusterName,
            )
            print(tags)

            metric = "eniot_monitor_es_status_excutetime"
            falcon_push_data = self.falcon.get_push_data(
                endpoint, metric, tags, float(excutetime))
            self.falcon.push_data(falcon_push_data)

            metric = "eniot_monitor_es_status"
            falcon_push_data = self.falcon.get_push_data(
                endpoint, metric, tags, status)
            self.falcon.push_data(falcon_push_data)
        except Exception:
            self.logger.error(traceback.format_exc())

    def monitor_es_client(self, region, host):
        """Query one cluster's health, time the query and push the result.

        Args:
            region: region name forwarded to ``push_falcon``.
            host: host list passed straight to ``Elasticsearch(...)``.
        """
        try:
            esclient = Elasticsearch(host)

            # Wall-clock timing. time.clock() was removed in Python 3.8 and
            # measured CPU time on Unix, which does not reflect how long
            # the remote call took.
            start_time = time.time()

            # split() without an argument tolerates padded columns and the
            # trailing newline; fields 2 and 3 are read as cluster name and
            # status.
            result = esclient.cat.health().split()
            result_v = esclient.cat.health(v=True)
            print(result_v)

            clusterName = result[2]
            if result[3] != "green":
                status = 0
            else:
                status = 1

            end_time = time.time()
            excutetime = end_time - start_time

            print("result = " + result[3])
            print("excutetime = " + str(excutetime))

            self.push_falcon(region, excutetime, status, clusterName)
        except Exception:
            self.logger.error(traceback.format_exc())

    def main(self):
        """Load the configuration and check the log and data clusters."""
        try:
            cf = self.falcon.check_conf()
            data_info = self.get_conf(cf)
            if not data_info:
                msg = u"get es info error!"
                self.logger.warn(msg)
                return

            region = data_info["region"]
            log_host = data_info["log_host"].split(",")
            data_host = data_info["data_host"].split(",")

            self.monitor_es_client(region, log_host)
            self.monitor_es_client(region, data_host)
        except Exception:
            self.logger.error(traceback.format_exc())


if __name__ == '__main__':
    app = ESstatus()
    app.main()