使用python脚本+zabbix前端监控云联网底层TCP数据流所负载的链路质量,并在丢包时联动保存MTR记录

个人博客地址

http://www.darkghost.life

背景

  目前国内各家云联网跨区域数据传输,会将数据流通过哈希运算负载到不同的底层链路上,而底层链路质量差异较大,这种情况导致的现象就是,使用传统的icmp监控线路正常,但是业务一直不稳定,所以才有了使用TCP监控的需求

  使用TCP测试链路质量各RS厂商都有类似的功能,如RPM,NQA等,但缺陷是不能进行绘图,不能准确掌握线路整个周期内的质量,所以考虑使用zabbix自定义脚本来实现TCP监控

 

zabbix-tcpping脚本如下

#!/usr/bin/env python3
#-*-coding:utf-8-*-
import os,sys,subprocess 
import re,datetime,time
import logging,argparse

dir_path = os.path.dirname(os.path.abspath(__file__))
tcp_line = dir_path + '/tcping.py {0} {1} 20 '
ping_type = {'tcp':tcp_line}

log_name = dir_path + '/log/' + time.strftime('%Y-%m-%d',time.localtime()) + '.log'
def logger():
    """Configure and return the root logger writing to the daily log file.

    Attaches one ``logging.FileHandler`` for ``log_name`` to the root logger
    and sets the root level to DEBUG.

    Returns:
        logging.Logger: the root logger.
    """
    # FileHandler opens the file immediately and raises if the directory is
    # missing, so make sure the log directory exists before attaching it.
    os.makedirs(os.path.dirname(log_name), exist_ok=True)

    root = logging.getLogger()
    handler = logging.FileHandler(log_name)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    )
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)
    return root
# Rebinds the module-level name ``logger`` from the factory function to the
# configured logger instance; the factory cannot be called again afterwards.
logger = logger()

    
def value(type,value_type,dip,dport):
    """Run the probe command and return everything it wrote to stdout.

    Args:
        type: command-line template with ``{0}`` and ``{1}`` placeholders for
            the destination address and port (the name shadows the builtin
            but is kept for existing callers).
        value_type: not used here; accepted so the signature matches ``main``.
        dip: destination address substituted for ``{0}``.
        dport: destination port substituted for ``{1}``.

    Returns:
        str: the command's stdout decoded as UTF-8 (Python 3 only). stderr is
        captured and discarded.
    """
    import shlex  # local import: the script's import header does not include shlex

    # The command is run through a shell, so quote every substituted value;
    # otherwise a parameter such as "1.2.3.4; rm -rf x" would be executed.
    cmd_line = type.format(shlex.quote(str(dip)), shlex.quote(str(dport)))
    completed = subprocess.run(
        cmd_line, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return str(completed.stdout, 'utf8')
def get_value(item,res):
    """Extract one metric from the probe's text output.

    Args:
        item: ``'pkloss'`` for the failure percentage, or ``'restime'`` for
            the mean of the decimal timings found in the output.
        res: text printed by the probe command.

    Returns:
        float: the requested metric rounded to one decimal place. When the
        output contains no percentage, ``pkloss`` is 100.0; when it contains
        no timings, ``restime`` is 0.0.

    Raises:
        KeyError: if ``item`` is neither ``'pkloss'`` nor ``'restime'``.
    """
    res_value = {'pkloss': '100', 'restime': '0'}

    # Capture the whole number including its fraction: with a bare "\d+%"
    # a value such as "12.50%" would be read as 50 and "100.00%" as 0.
    loss_match = re.search(r'(\d+(?:\.\d+)?)%', res)
    if loss_match:
        res_value['pkloss'] = loss_match.group(1)

    if item == 'restime':
        samples = [float(x) for x in re.findall(r'\d+[.]\d+\s', res)]
        # No successful probe means no timings; keep the default instead of
        # dividing by zero.
        if samples:
            res_value['restime'] = sum(samples) / len(samples)

    result = round(float(res_value[item]), 1)
    # The script configures its file handler on the root logger, so log there.
    logging.getLogger().info('%s:%s', item, result)
    return result
    

def main(type,value_type,dip,dport):
    """Run one probe and print the requested metric on stdout.

    Args:
        type: key into ``ping_type`` selecting the command template.
        value_type: metric name handed to ``get_value``.
        dip: destination address.
        dport: destination port.
    """
    command_template = ping_type[type]
    probe_output = value(command_template, value_type, dip, dport)
    metric = get_value(value_type, probe_output)
    print(metric)
        
if __name__ == "__main__":
    # Command-line entry point: parse the probe target and the metric to
    # report, then print that metric.
    parser = argparse.ArgumentParser(description='tcp for monitor')
    parser.add_argument('-T', action='store', dest='ping_type', default='tcp',
                        help='probe type (a key of ping_type)')
    # Without a target, port or metric the probe cannot produce a usable
    # value, so let argparse reject the call with a clear message instead of
    # failing later with a traceback.
    parser.add_argument('-t', action='store', dest='tip', required=True,
                        help='destination address')
    parser.add_argument('-p', action='store', dest='dport', required=True,
                        help='destination port')
    parser.add_argument('-i', action='store', dest='item', required=True,
                        choices=('pkloss', 'restime'),
                        help='metric to print')
    args = parser.parse_args()

    # These remain module-level names so other code in this script can read
    # them as globals.
    type = args.ping_type
    tip = args.tip
    dport = args.dport
    item = args.item

    main(type, item, tip, dport)

 脚本使用方法

$ python3 zabbixtcp.py -T tcp -t 8.8.8.8 -p 53 -i restime
35.5
$ python3 zabbixtcp.py -T tcp -t 8.8.8.8 -p 53 -i pkloss
0.0

 

 

zabbix配置

编辑zabbix-agent配置文件,允许使用自定义脚本并添加key
vi /etc/zabbix/zabbix_agentd.conf

UnsafeUserParameters=1


UserParameter=tcp_loss[*],/etc/zabbix/monitor/tcploss.py -T tcp  -t $1  -p $2  -i pkloss
UserParameter=tcp_restime[*],/etc/zabbix/monitor/tcploss.py -T tcp  -t $1  -p $2  -i restime

zabbix前端页面添加item

 

 

 

 

 

 

添加graphs

 

 

 

 

 

等待一段时间就可以看到图像了

 

 

 

后端log如下

后续需求:当线路出现明显丢包(代码中的判断条件为丢包率大于9%且小于100%,即约10%及以上但尚未完全中断)时,自动执行mtr并保存记录。实现很简单,在get_value函数的return之前加一段if语句去执行一段shell即可,代码如下(只适用于linux)

    # Save an MTR trace only when loss is noticeable (> 9 %) but the path is
    # not completely down (< 100 %). The launch has to stay inside this `if`;
    # dedented, mtr would be started on every single poll.
    if item == 'pkloss' and 100 > round(float(res_value[item]), 1) > 9:
        # `tip` is read from module scope (the target parsed from the command
        # line). Pass it as an argument list so no shell interprets it, and
        # do not wait for mtr to finish. The output pipes are never read, so
        # send them to DEVNULL.
        subprocess.Popen(
            ['/etc/zabbix/bin/mtr_bash', str(tip)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

shell如下

#!/usr/bin/env bash
# Append an mtr report for the given target to a per-target, per-day file.
# Usage: mtr_bash <target-ip>
IP=${1:?Usage: mtr_bash <target-ip>}
# Quote every expansion so an unusual argument is not word-split or globbed.
mtr -r -n -c 30 -w -b "$IP" >> "/etc/zabbix/bin/log/${IP}-$(date +%Y-%m-%d)"

手动触发下看看效果

 

 

 

 

代码中使用的tcping.py文件下载链接(原文件名为tcpping.py,下载后需重命名为tcping.py,放到与zabbix脚本相同的目录下并赋予可执行权限)

https://github.com/yantisj/tcpping/blob/master/tcpping.py

 

2021-04-16

基于原始代码增加指定源IP的功能,修改后的tcping.py如下(第一个参数为源IP,第二个参数为目标地址):

#!/usr/bin/env python3
"""
TCP Ping Test with a selectable source address (defaults to port 80, 10000 packets)
Usage: ./tcpping.py source_ip host [port] [maxCount]
- source_ip is the local address each probe socket is bound to before connecting
- Ctrl-C Exits with Results
"""

import sys
import socket
import time
import signal
from timeit import default_timer as timer

# Probe parameters; port and maxCount keep these defaults unless overridden
# on the command line.
host = None
port = 80

maxCount = 10000
count = 0

_USAGE = "Usage: tcpping.py source_ip host [port] [maxCount]"

# argv[1] is the source address to bind to, argv[2] the destination host;
# both are mandatory.
try:
    sip = sys.argv[1]
    host = sys.argv[2]
except IndexError:
    print(_USAGE)
    sys.exit(1)

# argv[3]: optional destination port.
try:
    port = int(sys.argv[3])
except ValueError:
    # Report the offending argument itself; argv[4] may not even exist here.
    print("Error: Port Must be Integer:", sys.argv[3])
    sys.exit(1)
except IndexError:
    pass

# argv[4]: optional number of probes.
try:
    maxCount = int(sys.argv[4])
except ValueError:
    print("Error: Max Count Value Must be Integer", sys.argv[4])
    sys.exit(1)
except IndexError:
    pass

# Running totals updated by the probe loop and read by getResults().
passed = 0
failed = 0


def getResults():
    """Print the summary line with total, passed and failed connection counts.

    The failure rate is shown as ``0`` when nothing failed and with two
    decimals otherwise. Reads the module-level ``count``, ``passed`` and
    ``failed`` counters.
    """
    if failed == 0:
        loss_text = str(0)
    else:
        loss_text = "%.2f" % (failed / count * 100)

    summary = "\nTCP Ping Results: Connections (Total/Pass/Fail): [{:}/{:}/{:}] (Failed: {:}%)".format(count, passed, failed, loss_text)
    print(summary)

def signal_handler(signal, frame):
    """SIGINT handler: print the results gathered so far and exit with status 0.

    Both arguments are supplied by the signal machinery and are not used.
    """
    getResults()
    raise SystemExit(0)


# Install the handler so Ctrl-C ends the run with a summary.
signal.signal(signal.SIGINT, signal_handler)


# Probe loop: one TCP connection attempt per iteration, at most maxCount.
while count < maxCount:

    count += 1

    success = False

    # The context manager closes the socket after each attempt; previously
    # every probe left its socket open.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

        # Give up on an attempt after one second.
        s.settimeout(1)

        s_start = timer()

        try:
            # Bind to the requested source address (port 0: any free port)
            # before connecting to the destination.
            s.bind((sip, 0))
            s.connect((host, int(port)))
            s.shutdown(socket.SHUT_RD)
            success = True

        except socket.timeout:
            print("Connection timed out!")
            failed += 1
        except OSError as e:
            print("OS Error:", e)
            failed += 1

        s_stop = timer()

    # Elapsed time of the attempt in milliseconds, two decimals.
    s_runtime = "%.2f" % (1000 * (s_stop - s_start))

    if success:
        print("Connected to %s[%s]: tcp_seq=%s time=%s ms" % (host, port, (count-1), s_runtime))
        passed += 1

    # Pause one second between probes, but not after the last one.
    if count < maxCount:
        time.sleep(1)


getResults()

适配以上代码zabbix脚本需要做以下变更

#!/usr/bin/env python3
#-*-coding:utf-8-*-
import os,sys,subprocess 
import re,random,datetime,time
import logging,argparse

dir_path = os.path.dirname(os.path.abspath(__file__))
tcp_line = dir_path + '/tcping.py {0} {1} {2} 10 '
ping_type = {'tcp':tcp_line}

log_name = dir_path + '/log/' + time.strftime('%Y-%m-%d',time.localtime()) + '.log'
def logger():
    """Configure and return the root logger writing to the daily log file.

    Attaches one ``logging.FileHandler`` for ``log_name`` to the root logger
    and sets the root level to DEBUG.

    Returns:
        logging.Logger: the root logger.
    """
    # FileHandler opens the file immediately and raises if the directory is
    # missing, so make sure the log directory exists before attaching it.
    os.makedirs(os.path.dirname(log_name), exist_ok=True)

    root = logging.getLogger()
    handler = logging.FileHandler(log_name)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    )
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)
    return root
# Rebinds the module-level name ``logger`` from the factory function to the
# configured logger instance; the factory cannot be called again afterwards.
logger = logger()

    
def value(type,value_type,sip,dip,dport):
    """Run the probe command and return everything it wrote to stdout.

    Args:
        type: command-line template with ``{0}``, ``{1}`` and ``{2}``
            placeholders for the source address, destination address and
            port (the name shadows the builtin but is kept for callers).
        value_type: not used here; accepted so the signature matches ``main``.
        sip: source address substituted for ``{0}``.
        dip: destination address substituted for ``{1}``.
        dport: destination port substituted for ``{2}``.

    Returns:
        str: the command's stdout decoded as UTF-8 (Python 3 only). stderr is
        captured and discarded.
    """
    import shlex  # local import: the script's import header does not include shlex

    # The command is run through a shell, so quote every substituted value;
    # otherwise a parameter such as "1.2.3.4; rm -rf x" would be executed.
    cmd_line = type.format(
        shlex.quote(str(sip)), shlex.quote(str(dip)), shlex.quote(str(dport))
    )
    completed = subprocess.run(
        cmd_line, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return str(completed.stdout, 'utf8')
def get_value(item,res):
    """Extract one metric from the probe's text output.

    Args:
        item: ``'pkloss'`` for the failure percentage, or ``'restime'`` for
            the mean of the decimal timings found in the output.
        res: text printed by the probe command.

    Returns:
        float: the requested metric rounded to one decimal place. When the
        output contains no percentage, ``pkloss`` is 100.0; when it contains
        no timings, ``restime`` is 0.0.

    Raises:
        KeyError: if ``item`` is neither ``'pkloss'`` nor ``'restime'``.
    """
    res_value = {'pkloss': '100', 'restime': '0'}

    # Capture the whole number including its fraction: with a bare "\d+%"
    # a value such as "12.50%" would be read as 50 and "100.00%" as 0.
    loss_match = re.search(r'(\d+(?:\.\d+)?)%', res)
    if loss_match:
        res_value['pkloss'] = loss_match.group(1)

    if item == 'restime':
        samples = [float(x) for x in re.findall(r'\d+[.]\d+\s', res)]
        # No successful probe means no timings; keep the default instead of
        # dividing by zero.
        if samples:
            res_value['restime'] = sum(samples) / len(samples)

    result = round(float(res_value[item]), 1)
    # The script configures its file handler on the root logger, so log there.
    logging.getLogger().info('%s:%s', item, result)
    return result
    

def main(type,value_type,sip,dip,dport):
    """Run one probe from a given source address and print the requested metric.

    Args:
        type: key into ``ping_type`` selecting the command template.
        value_type: metric name handed to ``get_value``.
        sip: source address.
        dip: destination address.
        dport: destination port.
    """
    command_template = ping_type[type]
    probe_output = value(command_template, value_type, sip, dip, dport)
    metric = get_value(value_type, probe_output)
    print(metric)
        
if __name__ == "__main__":
    # Command-line entry point: parse the source, the probe target and the
    # metric to report, then print that metric.
    parser = argparse.ArgumentParser(description='tcp for monitor')
    parser.add_argument('-T', action='store', dest='ping_type', default='tcp',
                        help='probe type (a key of ping_type)')
    # Without source, target, port or metric the probe cannot produce a
    # usable value, so let argparse reject the call with a clear message
    # instead of failing later with a traceback.
    parser.add_argument('-s', action='store', dest='sip', required=True,
                        help='source address')
    parser.add_argument('-t', action='store', dest='tip', required=True,
                        help='destination address')
    parser.add_argument('-p', action='store', dest='dport', required=True,
                        help='destination port')
    parser.add_argument('-i', action='store', dest='item', required=True,
                        choices=('pkloss', 'restime'),
                        help='metric to print')
    args = parser.parse_args()

    # These remain module-level names so other code in this script can read
    # them as globals.
    type = args.ping_type
    sip = args.sip
    tip = args.tip
    dport = args.dport
    item = args.item

    main(type, item, sip, tip, dport)

  zabbix 定义key

UserParameter=tcp_pkloss[*],/etc/zabbix/monitor/tcploss.py -T tcp  -s $1 -t $2  -p $3  -i pkloss
UserParameter=tcp_restime[*],/etc/zabbix/monitor/tcploss.py -T tcp  -s $1 -t $2  -p $3  -i restime

 

posted @ 2021-04-12 07:21  无限's-blog  阅读(949)  评论(0编辑  收藏  举报