zabbix 源码分析 another/first network error, wait for 15 seconds 出现原因及调优建议
在监控设备的时候,在server端的日志中有时会见到类似 another network error, wait for 15 seconds 的异常。今天我们来看一下这个问题的出现原因和解决方案:
问题定位到poller.c,看下下面两份代码:
这个get_values的部分代码:
/* Excerpt from get_values(): for every polled item, switch the host's */
/* availability according to the result code of the check. The host is only */
/* (de)activated when the state differs from the one recorded in */
/* last_available, so consecutive items with the same outcome do not repeat */
/* the call. */
for (i = 0; i < num; i++)
{
	switch (errcodes[i])
	{
		case SUCCEED:
		case NOTSUPPORTED:
		case AGENT_ERROR:
			if (HOST_AVAILABLE_TRUE != last_available)
			{
				zbx_activate_item_host(&items[i], &timespec);
				last_available = HOST_AVAILABLE_TRUE;
			}
			break;
		case NETWORK_ERROR:
		case GATEWAY_ERROR:
		case TIMEOUT_ERROR:
			/* these three codes are the only ones that lead to zbx_deactivate_item_host() */
			if (HOST_AVAILABLE_FALSE != last_available)
			{
				zbx_deactivate_item_host(&items[i], &timespec, results[i].msg);
				last_available = HOST_AVAILABLE_FALSE;
			}
			break;
		case CONFIG_ERROR:
			/* nothing to do */
			break;
		default:
			zbx_error("unknown response code returned: %d", errcodes[i]);
			THIS_SHOULD_NEVER_HAPPEN;
	}
	/* NOTE: the enclosing for loop is not closed in this excerpt */
这里是zbx_deactivate_item_host的代码:
void zbx_deactivate_item_host(DC_ITEM *item, zbx_timespec_t *ts, const char *error) // #0 { const char *__function_name = "zbx_deactivate_item_host"; zbx_host_availability_t in, out; // #1 unsigned char agent_type; // #2 zabbix_log(LOG_LEVEL_DEBUG, "In %s() hostid:" ZBX_FS_UI64 " itemid:" ZBX_FS_UI64 " type:%d", // #3 __function_name, item->host.hostid, item->itemid, (int)item->type); zbx_host_availability_init(&in, item->host.hostid); // #4 zbx_host_availability_init(&out,item->host.hostid); // #5 if (ZBX_AGENT_UNKNOWN == (agent_type = host_availability_agent_by_item_type(item->type))) // #6 goto out; if (FAIL == host_get_availability(&item->host, agent_type, &in)) // #7 goto out; if (FAIL == DChost_deactivate(item->host.hostid, agent_type, ts, &in.agents[agent_type], // #8 &out.agents[agent_type], error)) { goto out; } if (FAIL == db_host_update_availability(&out)) // #9 goto out; host_set_availability(&item->host, agent_type, &out); // #10 if (0 == in.agents[agent_type].errors_from) // #11 { zabbix_log(LOG_LEVEL_WARNING, "%s item \"%s\" on host \"%s\" failed:" // #12 " first network error, wait for %d seconds", zbx_agent_type_string(item->type), item->key_orig, item->host.host, out.agents[agent_type].disable_until - ts->sec); } else { if (HOST_AVAILABLE_FALSE != in.agents[agent_type].available) // #13 { if (HOST_AVAILABLE_FALSE != out.agents[agent_type].available) // #14 { zabbix_log(LOG_LEVEL_WARNING, "%s item \"%s\" on host \"%s\" failed:" // #15 " another network error, wait for %d seconds", zbx_agent_type_string(item->type), item->key_orig, item->host.host, out.agents[agent_type].disable_until - ts->sec); } else { zabbix_log(LOG_LEVEL_WARNING, "temporarily disabling %s checks on host \"%s\":" // #16 " host unavailable", zbx_agent_type_string(item->type), item->host.host); } } } zabbix_log(LOG_LEVEL_DEBUG, "%s() errors_from:%d available:%d", __function_name, out.agents[agent_type].errors_from, out.agents[agent_type].available); out: 
zbx_host_availability_clean(&out); zbx_host_availability_clean(&in); zabbix_log(LOG_LEVEL_DEBUG, "End of %s()", __function_name); }
下面看下这里是zbx_deactivate_item_host的代码的逻辑:
#0 zbx_deactivate_item_host函数接收三个参数
1 结构体指针,主机的一些综合参数
/* dbcache.h */
/* Item as handed to the pollers; embeds the host (DC_HOST) it belongs to. */
typedef struct
{
	DC_HOST		host;
	DC_INTERFACE	interface;
	zbx_uint64_t	itemid;
	zbx_uint64_t	lastlogsize;
	zbx_uint64_t	valuemapid;
	unsigned char	type;	/* passed to host_availability_agent_by_item_type() by zbx_deactivate_item_host() */
	unsigned char	value_type;
	unsigned char	state;
	unsigned char	snmpv3_securitylevel;
	unsigned char	authtype;
	unsigned char	flags;
	unsigned char	snmpv3_authprotocol;
	unsigned char	snmpv3_privprotocol;
	unsigned char	inventory_link;
	unsigned char	status;
	unsigned char	history;
	unsigned char	trends;
	unsigned char	follow_redirects;
	unsigned char	post_type;
	unsigned char	retrieve_mode;
	unsigned char	request_method;
	unsigned char	output_format;
	unsigned char	verify_peer;
	unsigned char	verify_host;
	unsigned char	allow_traps;
	char		key_orig[ITEM_KEY_LEN * ZBX_MAX_BYTES_IN_UTF8_CHAR + 1], *key;
	char		*units;
	char		*delay;
	int		history_sec;
	int		nextcheck;
	int		lastclock;
	int		mtime;
	char		trapper_hosts[ITEM_TRAPPER_HOSTS_LEN_MAX];
	char		logtimefmt[ITEM_LOGTIMEFMT_LEN_MAX];
	char		snmp_community_orig[ITEM_SNMP_COMMUNITY_LEN_MAX], *snmp_community;
	char		snmp_oid_orig[ITEM_SNMP_OID_LEN_MAX], *snmp_oid;
	char		snmpv3_securityname_orig[ITEM_SNMPV3_SECURITYNAME_LEN_MAX], *snmpv3_securityname;
	char		snmpv3_authpassphrase_orig[ITEM_SNMPV3_AUTHPASSPHRASE_LEN_MAX], *snmpv3_authpassphrase;
	char		snmpv3_privpassphrase_orig[ITEM_SNMPV3_PRIVPASSPHRASE_LEN_MAX], *snmpv3_privpassphrase;
	char		ipmi_sensor[ITEM_IPMI_SENSOR_LEN_MAX];
	char		*params;
	char		username_orig[ITEM_USERNAME_LEN_MAX], *username;
	char		publickey_orig[ITEM_PUBLICKEY_LEN_MAX], *publickey;
	char		privatekey_orig[ITEM_PRIVATEKEY_LEN_MAX], *privatekey;
	char		password_orig[ITEM_PASSWORD_LEN_MAX], *password;
	char		snmpv3_contextname_orig[ITEM_SNMPV3_CONTEXTNAME_LEN_MAX], *snmpv3_contextname;
	char		jmx_endpoint_orig[ITEM_JMX_ENDPOINT_LEN_MAX], *jmx_endpoint;
	char		timeout_orig[ITEM_TIMEOUT_LEN_MAX], *timeout;
	char		url_orig[ITEM_URL_LEN_MAX], *url;
	char		query_fields_orig[ITEM_QUERY_FIELDS_LEN_MAX], *query_fields;
	char		*posts;
	char		status_codes_orig[ITEM_STATUS_CODES_LEN_MAX], *status_codes;
	char		http_proxy_orig[ITEM_HTTP_PROXY_LEN_MAX], *http_proxy;
	char		*headers;
	char		ssl_cert_file_orig[ITEM_SSL_CERT_FILE_LEN_MAX], *ssl_cert_file;
	char		ssl_key_file_orig[ITEM_SSL_KEY_FILE_LEN_MAX], *ssl_key_file;
	char		ssl_key_password_orig[ITEM_SSL_KEY_PASSWORD_LEN_MAX], *ssl_key_password;
	char		*error;
}
DC_ITEM;
2 结构体指针
/* common.h */
/* Timestamp with nanosecond part; zbx_deactivate_item_host() uses ts->sec as "now". */
typedef struct
{
	int	sec;	/* seconds */
	int	ns;	/* nanoseconds */
}
zbx_timespec_t;
3 错误信息
#1 定义了两个结构体变量 in 和 out(类型为 zbx_host_availability_t,其中的 agents 成员才是数组)
//db.h typedef struct { /* flags specifying which fields are set, see ZBX_FLAGS_AGENT_STATUS_* defines */ unsigned char flags; /* agent availability fields */ unsigned char available; char *error; int errors_from; int disable_until; } zbx_agent_availability_t; typedef struct { zbx_uint64_t hostid; zbx_agent_availability_t agents[ZBX_AGENT_MAX]; //这里的ZBX_AGENT_MAX 为4 ,分别代表ZABBIX, SNMP, IPMI, JMX4种类型 } zbx_host_availability_t;
#2 声明unsigned char agent_type。unsigned char 的取值范围是0-255,而signed char是-128-127(普通char是否带符号由编译器实现决定);后面会遇到取值为255的常量,所以这里需要能表示到255的unsigned char
#3 记录DEBUG级别的日志。如果需要显示这条日志,需要将server端配置文件中的DebugLevel调到4(debug)或更高,不过我不建议你这么做
#4 初始化主机IN可用性数据
//dbconfig.c void zbx_host_availability_init(zbx_host_availability_t *availability, zbx_uint64_t hostid) { memset(availability, 0, sizeof(zbx_host_availability_t)); availability->hostid = hostid; }
#5 同#4一样,只不过是OUT
#6 为agent_type赋值,如果agent_type不属于#1中的四种,跳至out处
1、host_availability_agent_by_item_type 位于poller.c,接收item的type字段,用来判断监控类型 //poller.c static unsigned char host_availability_agent_by_item_type(unsigned char type) { switch (type) { case ITEM_TYPE_ZABBIX: return ZBX_AGENT_ZABBIX; break; case ITEM_TYPE_SNMPv1: case ITEM_TYPE_SNMPv2c: case ITEM_TYPE_SNMPv3: return ZBX_AGENT_SNMP; break; case ITEM_TYPE_IPMI: return ZBX_AGENT_IPMI; break; case ITEM_TYPE_JMX: return ZBX_AGENT_JMX; break; default: return ZBX_AGENT_UNKNOWN; } } 2、ZBX_AGENT_UNKNOWN 常量 为 255 对应之前的 #2
#7 根据agent_type来判断主机的可用性,网络设备会匹配到ZBX_AGENT_SNMP,四个值分别代表的意思是
//poller.c static int host_get_availability(const DC_HOST *dc_host, unsigned char agent, zbx_host_availability_t *ha) { zbx_agent_availability_t *availability = &ha->agents[agent]; availability->flags = ZBX_FLAGS_AGENT_STATUS; switch (agent) { case ZBX_AGENT_ZABBIX: availability->available = dc_host->available; availability->error = zbx_strdup(NULL, dc_host->error); availability->errors_from = dc_host->errors_from; availability->disable_until = dc_host->disable_until; break; case ZBX_AGENT_SNMP: availability->available = dc_host->snmp_available; //主机的snmp可用状态 availability->error = zbx_strdup(NULL, dc_host->snmp_error); //错误信息 availability->errors_from = dc_host->snmp_errors_from; //错误发生时间 availability->disable_until = dc_host->snmp_disable_until; //下次延迟检测时间 break; case ZBX_AGENT_IPMI: availability->available = dc_host->ipmi_available; availability->error = zbx_strdup(NULL, dc_host->ipmi_error); availability->errors_from = dc_host->ipmi_errors_from; availability->disable_until = dc_host->ipmi_disable_until; break; case ZBX_AGENT_JMX: availability->available = dc_host->jmx_available; availability->error = zbx_strdup(NULL, dc_host->jmx_error); availability->disable_until = dc_host->jmx_disable_until; availability->errors_from = dc_host->jmx_errors_from; break; default: return FAIL; } ha->hostid = dc_host->hostid; return SUCCEED; } //dbcache.h typedef struct { zbx_uint64_t hostid; zbx_uint64_t proxy_hostid; char host[HOST_HOST_LEN_MAX]; char name[HOST_NAME_LEN * ZBX_MAX_BYTES_IN_UTF8_CHAR + 1]; unsigned char maintenance_status; unsigned char maintenance_type; int maintenance_from; int errors_from; unsigned char available; int disable_until; int snmp_errors_from; unsigned char snmp_available; int snmp_disable_until; int ipmi_errors_from; unsigned char ipmi_available; int ipmi_disable_until; signed char ipmi_authtype; unsigned char ipmi_privilege; char ipmi_username[HOST_IPMI_USERNAME_LEN_MAX]; char ipmi_password[HOST_IPMI_PASSWORD_LEN_MAX]; int jmx_errors_from; unsigned 
char jmx_available; int jmx_disable_until; char inventory_mode; unsigned char status; unsigned char tls_connect; unsigned char tls_accept; #if defined(HAVE_POLARSSL) || defined(HAVE_GNUTLS) || defined(HAVE_OPENSSL) char tls_issuer[HOST_TLS_ISSUER_LEN_MAX]; char tls_subject[HOST_TLS_SUBJECT_LEN_MAX]; char tls_psk_identity[HOST_TLS_PSK_IDENTITY_LEN_MAX]; char tls_psk[HOST_TLS_PSK_LEN_MAX]; #endif char error[HOST_ERROR_LEN_MAX]; char snmp_error[HOST_ERROR_LEN_MAX]; char ipmi_error[HOST_ERROR_LEN_MAX]; char jmx_error[HOST_ERROR_LEN_MAX]; } DC_HOST; //db.h #define ZBX_FLAGS_AGENT_STATUS_AVAILABLE 0x00000001 #define ZBX_FLAGS_AGENT_STATUS_ERROR 0x00000002 #define ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM 0x00000004 #define ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL 0x00000008 #define ZBX_FLAGS_AGENT_STATUS (ZBX_FLAGS_AGENT_STATUS_AVAILABLE | \ ZBX_FLAGS_AGENT_STATUS_ERROR | \ ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM | \ ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL) //common.h #define FAIL -1
#8 根据agent_type 设置主机状态
//dbconfig.c int DChost_deactivate(zbx_uint64_t hostid, unsigned char agent_type, const zbx_timespec_t *ts, zbx_agent_availability_t *in, zbx_agent_availability_t *out, const char *error_msg) { int ret = FAIL, errors_from,disable_until; const char *error; unsigned char available; ZBX_DC_HOST *dc_host; /* don't try deactivating host if the unreachable delay has not passed since the first error */ if (CONFIG_UNREACHABLE_DELAY > ts->sec - in->errors_from) goto out; WRLOCK_CACHE; if (NULL == (dc_host = (ZBX_DC_HOST *)zbx_hashset_search(&config->hosts, &hostid))) goto unlock; /* Don't try deactivating host if: */ /* - (server, proxy) it's not monitored any more; */ /* - (server) it's monitored by proxy. */ if ((0 != (program_type & ZBX_PROGRAM_TYPE_SERVER) && 0 != dc_host->proxy_hostid) || HOST_STATUS_MONITORED != dc_host->status) { goto unlock; } DChost_get_agent_availability(dc_host, agent_type, in); available = in->available; error = in->error; if (0 == in->errors_from) { /* first error, schedule next unreachable check */ errors_from = ts->sec; disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { errors_from = in->errors_from; disable_until = in->disable_until; /* Check if other pollers haven't already attempted deactivating host. */ /* In that case should wait the initial unreachable delay before */ /* trying to make it unavailable. 
*/ if (CONFIG_UNREACHABLE_DELAY <= ts->sec - errors_from) { /* repeating error */ if (CONFIG_UNREACHABLE_PERIOD > ts->sec - errors_from) { /* leave host available, schedule next unreachable check */ disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { /* make host unavailable, schedule next unavailable check */ disable_until = ts->sec + CONFIG_UNAVAILABLE_DELAY; available = HOST_AVAILABLE_FALSE; error = error_msg; } } } zbx_agent_availability_init(out, available, error, errors_from, disable_until); DChost_set_agent_availability(dc_host, ts->sec, agent_type, out); if (ZBX_FLAGS_AGENT_STATUS_NONE != out->flags) ret = SUCCEED; unlock: UNLOCK_CACHE; out: return ret; }
主要看下这段:
/* Excerpt from DChost_deactivate(): choose errors_from, disable_until and, */
/* once the unreachable period is exceeded, the unavailable state. */
if (0 == in->errors_from)
{
	/* first error, schedule next unreachable check */
	errors_from = ts->sec;
	disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY;
}
else
{
	errors_from = in->errors_from;
	disable_until = in->disable_until;

	/* Check if other pollers haven't already attempted deactivating host. */
	/* In that case should wait the initial unreachable delay before */
	/* trying to make it unavailable. */
	if (CONFIG_UNREACHABLE_DELAY <= ts->sec - errors_from)
	{
		/* repeating error */
		if (CONFIG_UNREACHABLE_PERIOD > ts->sec - errors_from)
		{
			/* leave host available, schedule next unreachable check */
			disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY;
		}
		else
		{
			/* make host unavailable, schedule next unavailable check */
			/* (note the different delay constant: CONFIG_UNAVAILABLE_DELAY) */
			disable_until = ts->sec + CONFIG_UNAVAILABLE_DELAY;
			available = HOST_AVAILABLE_FALSE;
			error = error_msg;
		}
	}
}
如果错误第一次出现:
错误发生时间=检查的时间戳
下次的检查时间 = 时间戳+15s
否则:
错误发生时间 = in->errors_from
下次检查时间 = in->disable_until
检查的时间戳-错误发生时间>=15s:
检查的时间戳-错误发生时间< 45s:
下次的检查时间 = 检查的时间戳+15s
否则:
下一次检查时间 = 检查的时间戳+60s(代码中是CONFIG_UNAVAILABLE_DELAY,对应配置项UnavailableDelay,默认60s)
主机可用性为不可用
用配置文件来解释就是:如果由于网络等原因监控项没有采集成功,第一次出错后会等待UnreachableDelay(默认15s)再检查;如果再次失败,且从第一次失败到本次检查的时间还在UnreachablePeriod(默认45s)之内,则继续在UnreachableDelay之后再次检查;一旦超过UnreachablePeriod,主机会被置为不可用,之后每隔UnavailableDelay(默认60s)检查一次。
#9 更新数据库中的主机可用性信息
// poller.c static int db_host_update_availability(const zbx_host_availability_t *ha) { char *sql = NULL; size_t sql_alloc = 0, sql_offset = 0; if (SUCCEED == zbx_sql_add_host_availability(&sql, &sql_alloc, &sql_offset, ha)) { DBbegin(); DBexecute("%s", sql); DBcommit(); zbx_free(sql); return SUCCEED; } return FAIL; }
#10 根据agent_type设置主机可用性信息
//poller.c static int host_set_availability(DC_HOST *dc_host, unsigned char agent, const zbx_host_availability_t *ha) { const zbx_agent_availability_t *availability = &ha->agents[agent]; unsigned char *pavailable; int *perrors_from, *pdisable_until; char *perror; switch (agent) { case ZBX_AGENT_ZABBIX: pavailable = &dc_host->available; perror = dc_host->error; perrors_from = &dc_host->errors_from; pdisable_until = &dc_host->disable_until; break; case ZBX_AGENT_SNMP: pavailable = &dc_host->snmp_available; perror = dc_host->snmp_error; perrors_from = &dc_host->snmp_errors_from; pdisable_until = &dc_host->snmp_disable_until; break; case ZBX_AGENT_IPMI: pavailable = &dc_host->ipmi_available; perror = dc_host->ipmi_error; perrors_from = &dc_host->ipmi_errors_from; pdisable_until = &dc_host->ipmi_disable_until; break; case ZBX_AGENT_JMX: pavailable = &dc_host->jmx_available; perror = dc_host->jmx_error; pdisable_until = &dc_host->jmx_disable_until; perrors_from = &dc_host->jmx_errors_from; break; default: return FAIL; } if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_AVAILABLE)) *pavailable = availability->available; if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_ERROR)) zbx_strlcpy(perror, availability->error, HOST_ERROR_LEN_MAX); if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM)) *perrors_from = availability->errors_from; if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL)) *pdisable_until = availability->disable_until; return SUCCEED; }
#11-16
如果是第一次出现错误(本次检查前 errors_from 为 0):
记录日志first network error, wait for 15 seconds
否则:
如果检查前主机不是不可用状态,且本次更新后仍未被置为不可用:
记录日志another network error, wait for 15 seconds
否则,如果主机是在本次更新中才被置为不可用(此前已经不可用的主机不会再记录日志):
记录日志temporarily disabling(这时前端页面的绿色图标会变为红色)
从上面的代码可以看出,在三种情况下会产生network error, wait for 15 seconds的日志,分别是poller过程中产生的网络错误(NETWORK_ERROR)、网关问题(GATEWAY_ERROR)和检查超时(TIMEOUT_ERROR)。总结下来就是:zabbix server 与zabbix agentd的连接或数据收发不成功,或者取得数据的一系列处理花费的时间超过了zabbix server 的Timeout参数时,就会出现这类日志。
从正常取值到出现异常的处理过程是这样的:
正常取值 UnreachableDelay UnreachableDelay UnreachableDelay UnavailableDelay 恢复
| | |
| | |
-----------------------UnreachablePeriod------------
1 2 3 4 5
过程 日志
- 1 获取正常监控数据
2 发生错误 ------------>first network
3 再次发生错误 ------------>another network
4 置为不可用 ------------>temporarily disabling
5 恢复 ------------>resuming
日志中的15s在配置文件中对应的配置UnreachableDelay,默认为15s,在源码中的位置是server.c中的CONFIG_UNREACHABLE_DELAY,
但注意这个配置不会解决任何network error的问题,只是为计算下一个检查时间提供时间依据。还有大家应该注意到了UnreachableDelay参数和UnreachablePeriod是倍数关系。我们在调优的时候需要注意下。
从zabbix 1.8版使用至今,根据我这几年的经验,此类日志基本出现在网络设备上,服务器很少出现,这与SNMP使用UDP协议有关系,但主要原因还是以下几个方面:
- 1、网络不稳定
- 2、设备端问题
- 3、poller排队了
- 4、Timeout超时了
这四种中的Timeout和poller又是有相互联系的,关于服务器如何设置poller,我后面的文章再介绍,先暂时分别来看下这四种情况:
网络不稳定多出现于几种情况:
- 1、使用公网实现和IDC互连,也就是被检查设备和server不在一个IDC,这种情况建议在另一端增加proxy,使对端设备的检测都在内网进行
- 2、使用云端网络,使用云端的网络互连方式打通云端设备和IDC的互连,这种情况的网络对于用户来说就是一种黑盒,基本无法排障,如果你使用大厂的服务,会偶尔出现日志报错,但不会影响到使用体验
网络设备端问题的情况:
- 1、设备性能:如何判断是不是网络设备端的问题呢?可以在网络设备上debug snmp信息,看每个请求包是都正常回复了还是报错了;如果是设备性能不足,可以将snmp的采集间隔加大。
- 2、对端和server连接的端口带宽打满了
poller排队处理:
poller数量由zabbix_server配置文件中的StartPollers指定,poller.c主要做几件事:1、从队列中取出待检查的item 2、采集这些item的监控数据 3、把数据放入缓存
poller只会处理被动状态的监控项:
如果是服务器出现此类日志:解决方法一种是增大poller的数量,一种是把被动模式改为主动模式。
如果你是网络设备:改用脚本实现,或者增大poller数量
关于Timeout,这里有同学可能会说把server的超时时间调长到30s。这种设置在被检查设备少的时候没关系,数量比较多时我不建议这样调整;耗时超过2s的检测项都改为在agentd端用脚本实现吧。
以上,是我使用zabbix过程中对日志 wait for 15 seconds 的一些理解和心得,如果文章内容对你有所帮助,请点个赞吧。如果你发现文中有错误的地方,也请留言给我,谢谢!