使用python脚本+zabbix前端监控云联网底层TCP数据流所负载的链路质量,并在丢包时联动保存MTR记录
个人博客地址
背景
目前国内各家云联网跨区域数据传输,会将数据流通过哈希运算负载到不同的底层链路上,而底层链路质量差异较大,这种情况导致的现象就是,使用传统的icmp监控线路正常,但是业务一直不稳定,所以才有了使用TCP监控的需求
使用TCP测试链路质量各RS厂商都有类似的功能,如RPM,NQA等,但缺陷是不能进行绘图,不能准确掌握线路整个周期内的质量,所以考虑使用zabbix自定义脚本来实现TCP监控
zabbix-tcpping脚本如下
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Zabbix UserParameter helper built on top of tcping.py.

Runs the tcping.py script that sits next to this file against a target
host/port and prints a single number for Zabbix to store: either the packet
loss percentage (``-i pkloss``) or the mean connect time in milliseconds
(``-i restime``).
"""
import os
import sys
import subprocess
import re
import datetime
import time
import logging
import argparse
import shlex

dir_path = os.path.dirname(os.path.abspath(__file__))
# Command template: {0} = target host, {1} = target port, 20 = probe count.
tcp_line = dir_path + '/tcping.py {0} {1} 20 '
ping_type = {'tcp': tcp_line}
log_name = dir_path + '/log/' + time.strftime('%Y-%m-%d', time.localtime()) + '.log'

# The summary line printed by tcping.py looks like "(Failed: 0%)" when nothing
# failed and "(Failed: 10.00%)" otherwise, so the fractional part is optional.
_LOSS_RE = re.compile(r'(\d+(?:\.\d+)?)%')
# Per-probe latency, e.g. the "35.50 " in "time=35.50 ms".
_RTT_RE = re.compile(r'\d+[.]\d+\s')


def _build_logger():
    """Return the root logger wired to the daily log file.

    If the log directory cannot be created or the file cannot be opened, a
    NullHandler is attached instead so the metric is still printed rather than
    the whole check failing because of logging.
    """
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    try:
        os.makedirs(os.path.dirname(log_name), exist_ok=True)
        handler = logging.FileHandler(log_name)
    except OSError:
        handler = logging.NullHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"))
    log.addHandler(handler)
    return log


logger = _build_logger()


def value(type, value_type, dip, dport):
    """Run the probe command and return its stdout as text.

    Args:
        type: command template (one of the ``ping_type`` values) with ``{0}``
            and ``{1}`` placeholders for host and port.
        value_type: requested item name; unused here, kept for callers.
        dip: target host.
        dport: target port.

    Returns:
        Decoded stdout of the probe, or ``''`` if it could not be started.
    """
    # Split the template first and substitute per token, then run without a
    # shell: host/port come from Zabbix key parameters and must never be
    # interpreted as shell syntax.
    argv = [part.format(dip, dport) for part in shlex.split(type)]
    try:
        res = subprocess.Popen(
            argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError as exc:
        logger.error('cannot run %s: %s', argv, exc)
        return ''
    return str(res[0], 'utf8', errors='replace')


def get_value(item, res):
    """Extract one metric from tcping.py output.

    Args:
        item: ``'pkloss'`` (loss percentage) or ``'restime'`` (mean latency, ms).
        res: text printed by tcping.py.

    Returns:
        The metric rounded to one decimal. With no parsable summary the loss is
        reported as 100.0; with no successful probe the latency is 0.0.

    Raises:
        KeyError: if ``item`` is not one of the two supported names.
    """
    res_value = {'pkloss': '100', 'restime': '0'}
    loss_match = _LOSS_RE.search(res)
    if loss_match:
        res_value['pkloss'] = loss_match.group(1)
    else:
        logger.warning('no loss summary found in probe output')
    if item == 'restime':
        samples = [float(x) for x in _RTT_RE.findall(res)]
        if samples:  # every probe may have failed; keep the 0 default then
            res_value['restime'] = round(sum(samples) / len(samples), 1)
    result = round(float(res_value[item]), 1)
    logger.info('%s:%s', item, result)
    return result


def main(type, value_type, dip, dport):
    """Probe ``dip:dport`` with the ``type`` method and print ``value_type``."""
    s = value(ping_type[type], value_type, dip, dport)
    print(get_value(value_type, s))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='tcp for monitor')
    parser.add_argument('-T', action='store', dest='ping_type', default='tcp',
                        choices=sorted(ping_type))
    parser.add_argument('-t', action='store', dest='tip', required=True)
    parser.add_argument('-p', action='store', dest='dport', required=True)
    parser.add_argument('-i', action='store', dest='item', required=True,
                        choices=['pkloss', 'restime'])
    args = parser.parse_args()
    # Kept as module-level names: code added to get_value() later (e.g. an
    # MTR hook) may read the target address through the global ``tip``.
    tip = args.tip
    dport = args.dport
    item = args.item
    main(args.ping_type, item, tip, dport)
脚本使用方法
$ python3 zabbixtcp.py -T tcp -t 8.8.8.8 -p 53 -i restime
35.5
$ python3 zabbixtcp.py -T tcp -t 8.8.8.8 -p 53 -i pkloss
0.0
zabbix配置
编辑zabbix-agent配置文件,允许使用自定义脚本并添加key:
vi /etc/zabbix/zabbix_agentd.conf
UnsafeUserParameters=1
UserParameter=tcp_loss[*],/etc/zabbix/monitor/tcploss.py -T tcp -t $1 -p $2 -i pkloss
UserParameter=tcp_restime[*],/etc/zabbix/monitor/tcploss.py -T tcp -t $1 -p $2 -i restime
zabbix前端页面添加item
添加graphs
等待一段时间就可以看到图像了
后端log如下
后续需求:当线路丢包率达到10%及以上(但未达到100%)时,自动执行mtr并保存记录。实现很简单,在get_value函数中加一段if语句去执行一段shell脚本即可,代码如下(只适用于linux)
# MTR hook, meant to be placed inside get_value() after the loss value has been
# parsed: on partial loss (above 9% but below 100%) start a background MTR
# capture towards the monitored target.
if item == 'pkloss' and 100 > round(float(res_value[item]), 1) > 9:
    # ``tip`` is the module-level target address set in the __main__ block; it
    # is only read here, so no ``global`` statement is needed.
    try:
        # Argument list instead of a shell string: the target comes from a
        # Zabbix key parameter and must not be interpreted by a shell.
        # Output goes to DEVNULL because nobody reads it (the helper script
        # writes its own log file); unread PIPEs would just be left dangling.
        subprocess.Popen(
            ['/etc/zabbix/bin/mtr_bash', str(tip)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as exc:
        # A missing or non-executable helper must not break the metric itself.
        logger.warning('cannot start mtr_bash: %s', exc)
shell如下
#!/usr/bin/env bash
# Append one MTR report for the given target to a per-target, per-day file
# under /etc/zabbix/bin/log/.
# Usage: mtr_bash <target-ip>
IP=$1

# Without a target the redirect below would still create a file named
# "-<date>", so bail out early.
if [ -z "$IP" ]; then
    echo "Usage: $0 <target-ip>" >&2
    exit 1
fi

# Quote the target and the log path so unusual characters in the argument
# cannot split into extra words or redirect targets.
mtr -r -n -c 30 -w -b "$IP" >> "/etc/zabbix/bin/log/${IP}-$(date +%Y-%m-%d)"
手动触发下看看效果
代码中使用的tcping.py文件(原文件名为tcpping.py,下载后需重命名为tcping.py并赋予可执行权限)下载链接
https://github.com/yantisj/tcpping/blob/master/tcpping.py
2021-04-16
基于原始代码增加指定源IP功能(源IP作为第一个参数传入),代码如下:
#!/usr/bin/env python3
"""
TCP Ping Test with a selectable source address
(defaults to port 80, 10000 packets)

Usage: ./tcpping.py sip host [port] [maxCount]
- sip is the local address each probe socket is bound to
- Ctrl-C Exits with Results
"""

import sys
import socket
import time
import signal
from timeit import default_timer as timer

USAGE = "Usage: tcpping.py sip host [port] [maxCount]"

host = None
port = 80
maxCount = 10000
count = 0

# Positional arguments: source IP and target host are mandatory.
try:
    sip = sys.argv[1]
except IndexError:
    print(USAGE)
    sys.exit(1)

try:
    host = sys.argv[2]
except IndexError:
    print(USAGE)
    sys.exit(1)

# Optional port; a missing value keeps the default.
try:
    port = int(sys.argv[3])
except ValueError:
    # Report the argument that actually failed to parse (index 3, not 4).
    print("Error: Port Must be Integer:", sys.argv[3])
    sys.exit(1)
except IndexError:
    pass

# Optional probe count; a missing value keeps the default.
try:
    maxCount = int(sys.argv[4])
except ValueError:
    print("Error: Max Count Value Must be Integer", sys.argv[4])
    sys.exit(1)
except IndexError:
    pass

passed = 0
failed = 0


def getResults():
    """ Summarize Results """

    # The rate is printed as "0" when nothing failed and with two decimals
    # otherwise; consumers that parse this line must accept both forms.
    lRate = 0
    if failed != 0:
        lRate = failed / (count) * 100
        lRate = "%.2f" % lRate

    print("\nTCP Ping Results: Connections (Total/Pass/Fail): [{:}/{:}/{:}] (Failed: {:}%)".format((count), passed, failed, str(lRate)))


def signal_handler(signum, frame):
    """ Catch Ctrl-C and Exit """
    getResults()
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)

while count < maxCount:

    count += 1
    success = False

    # The context manager closes the socket after every probe; otherwise each
    # iteration would leave a file descriptor open until the process exits.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)

        s_start = timer()

        try:
            # Bind to the requested source address (port 0 = any free port)
            # before connecting, so the probe leaves from that address.
            s.bind((sip, 0))
            s.connect((host, int(port)))
            s.shutdown(socket.SHUT_RD)
            success = True

        # socket.timeout must be listed before its parent class OSError.
        except socket.timeout:
            print("Connection timed out!")
            failed += 1
        except OSError as e:
            print("OS Error:", e)
            failed += 1

        s_stop = timer()

    s_runtime = "%.2f" % (1000 * (s_stop - s_start))

    if success:
        print("Connected to %s[%s]: tcp_seq=%s time=%s ms" % (host, port, (count-1), s_runtime))
        passed += 1

    # Pace the probes one second apart, but do not wait after the last one.
    if count < maxCount:
        time.sleep(1)

getResults()
适配以上代码zabbix脚本需要做以下变更
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Zabbix UserParameter helper built on top of the source-IP aware tcping.py.

Runs the tcping.py script that sits next to this file from a given source
address against a target host/port and prints a single number for Zabbix to
store: either the packet loss percentage (``-i pkloss``) or the mean connect
time in milliseconds (``-i restime``).
"""
import os
import sys
import subprocess
import re
import random
import datetime
import time
import logging
import argparse
import shlex

dir_path = os.path.dirname(os.path.abspath(__file__))
# Command template: {0} = source IP, {1} = target host, {2} = target port,
# 10 = probe count.
tcp_line = dir_path + '/tcping.py {0} {1} {2} 10 '
ping_type = {'tcp': tcp_line}
log_name = dir_path + '/log/' + time.strftime('%Y-%m-%d', time.localtime()) + '.log'

# The summary line printed by tcping.py looks like "(Failed: 0%)" when nothing
# failed and "(Failed: 10.00%)" otherwise, so the fractional part is optional.
_LOSS_RE = re.compile(r'(\d+(?:\.\d+)?)%')
# Per-probe latency, e.g. the "35.50 " in "time=35.50 ms".
_RTT_RE = re.compile(r'\d+[.]\d+\s')


def _build_logger():
    """Return the root logger wired to the daily log file.

    If the log directory cannot be created or the file cannot be opened, a
    NullHandler is attached instead so the metric is still printed rather than
    the whole check failing because of logging.
    """
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    try:
        os.makedirs(os.path.dirname(log_name), exist_ok=True)
        handler = logging.FileHandler(log_name)
    except OSError:
        handler = logging.NullHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"))
    log.addHandler(handler)
    return log


logger = _build_logger()


def value(type, value_type, sip, dip, dport):
    """Run the probe command and return its stdout as text.

    Args:
        type: command template (one of the ``ping_type`` values) with ``{0}``,
            ``{1}`` and ``{2}`` placeholders for source IP, host and port.
        value_type: requested item name; unused here, kept for callers.
        sip: local source address the probe binds to.
        dip: target host.
        dport: target port.

    Returns:
        Decoded stdout of the probe, or ``''`` if it could not be started.
    """
    # Split the template first and substitute per token, then run without a
    # shell: the addresses come from Zabbix key parameters and must never be
    # interpreted as shell syntax.
    argv = [part.format(sip, dip, dport) for part in shlex.split(type)]
    try:
        res = subprocess.Popen(
            argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError as exc:
        logger.error('cannot run %s: %s', argv, exc)
        return ''
    return str(res[0], 'utf8', errors='replace')


def get_value(item, res):
    """Extract one metric from tcping.py output.

    Args:
        item: ``'pkloss'`` (loss percentage) or ``'restime'`` (mean latency, ms).
        res: text printed by tcping.py.

    Returns:
        The metric rounded to one decimal. With no parsable summary the loss is
        reported as 100.0; with no successful probe the latency is 0.0.

    Raises:
        KeyError: if ``item`` is not one of the two supported names.
    """
    res_value = {'pkloss': '100', 'restime': '0'}
    loss_match = _LOSS_RE.search(res)
    if loss_match:
        res_value['pkloss'] = loss_match.group(1)
    else:
        logger.warning('no loss summary found in probe output')
    if item == 'restime':
        samples = [float(x) for x in _RTT_RE.findall(res)]
        if samples:  # every probe may have failed; keep the 0 default then
            res_value['restime'] = round(sum(samples) / len(samples), 1)
    result = round(float(res_value[item]), 1)
    logger.info('%s:%s', item, result)
    return result


def main(type, value_type, sip, dip, dport):
    """Probe ``dip:dport`` from ``sip`` using ``type`` and print ``value_type``."""
    s = value(ping_type[type], value_type, sip, dip, dport)
    print(get_value(value_type, s))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='tcp for monitor')
    parser.add_argument('-T', action='store', dest='ping_type', default='tcp',
                        choices=sorted(ping_type))
    parser.add_argument('-s', action='store', dest='sip', required=True)
    parser.add_argument('-t', action='store', dest='tip', required=True)
    parser.add_argument('-p', action='store', dest='dport', required=True)
    parser.add_argument('-i', action='store', dest='item', required=True,
                        choices=['pkloss', 'restime'])
    args = parser.parse_args()
    # Kept as module-level names: code added to get_value() later (e.g. an
    # MTR hook) may read the target address through the global ``tip``.
    sip = args.sip
    tip = args.tip
    dport = args.dport
    item = args.item
    main(args.ping_type, item, sip, tip, dport)
zabbix 定义key
UserParameter=tcp_pkloss[*],/etc/zabbix/monitor/tcploss.py -T tcp -s $1 -t $2 -p $3 -i pkloss
UserParameter=tcp_restime[*],/etc/zabbix/monitor/tcploss.py -T tcp -s $1 -t $2 -p $3 -i restime
以驱魔为理想,为生计而奔波