DPVS Study Notes: Connection Tracking
1. Basic Concepts
Connection tracking is a mechanism for recording and managing the state of network connections.
In a network, a connection is usually uniquely identified by five elements: source IP address, destination IP address, source port, destination port, and transport-layer protocol (TCP or UDP), collectively called the five-tuple.
DPVS's connection tracking module tracks every connection that passes through it and records its state information, so that connections can be managed and controlled effectively.
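To make the five-tuple concrete, here is a minimal, purely illustrative key structure; DPVS itself keeps this information in struct conn_tuple_hash, which appears in section 4.1.2:
#include <stdint.h>

/* Illustrative only: the five elements that uniquely identify a connection.
 * DPVS does not define this struct; it stores the same information in
 * struct conn_tuple_hash. */
struct five_tuple {
    uint8_t  proto;  /* transport protocol, e.g. IPPROTO_TCP or IPPROTO_UDP */
    uint32_t saddr;  /* source IPv4 address */
    uint32_t daddr;  /* destination IPv4 address */
    uint16_t sport;  /* source port */
    uint16_t dport;  /* destination port */
};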
2. Main Functions
① State management: the connection tracking module records the state of every connection (for TCP, states such as SYN_SENT, SYN_RECV, and ESTABLISHED) and handles packets according to that state.
② NAT support: connection tracking is the foundation of NAT. DPVS records each connection's address and port translations so that packets in both directions are rewritten and forwarded correctly.
③ Session timeout management: DPVS sets a timeout for every connection; when a connection stays idle beyond it, the connection state is cleaned up automatically and its resources are released.
④ Load balancing: in load-balancing scenarios, connection tracking ensures that packets of the same client connection keep going to the same backend server, preserving session consistency.
3. How It Works
① Packet capture: DPVS receives packets through DPDK and hands them to the connection tracking module for processing.
② Connection state identification: the module parses the packet's protocol (TCP, UDP, ICMP, etc.) and identifies the connection's state accordingly.
③ State table update: DPVS maintains a connection state table that records the state information of every connection.
④ Packet processing: based on the state table, DPVS processes the packet accordingly (NAT translation, load-balanced forwarding, and so on).
⑤ Timeout cleanup: DPVS periodically checks the state table and removes timed-out connections to release resources. A simplified sketch of this flow follows the list.
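A simplified sketch of that flow, with placeholder helper names (the real entry point is __dp_vs_in(), analyzed in section 4.2.2):
/* Sketch only: all helper names here are placeholders, not DPVS APIs. */
static int conn_track_packet(struct rte_mbuf *mbuf)
{
    struct five_tuple key;
    struct conn_entry *conn;

    parse_five_tuple(mbuf, &key);              /* 1. parse protocol, addresses, ports */
    conn = conn_table_lookup(&key);            /* 2. look up the connection state table */
    if (conn == NULL)
        conn = conn_create_and_schedule(&key); /* 3. new flow: pick an RS, create state */
    conn_update_state(conn, mbuf);             /* 4. drive the protocol state machine */
    conn_refresh_timer(conn);                  /*    and push back the expire timer */
    return conn_forward(conn, mbuf);           /* 5. NAT / forward according to the state */
}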
4. Implementation
4.1 Core Data Structures
4.1.1 Connection Tracking Table
/* helpers */
#define this_conn_tbl (RTE_PER_LCORE(dp_vs_conn_tbl)) //per-lcore connection tracking table
#ifdef CONFIG_DPVS_IPVS_CONN_LOCK
#define this_conn_lock (RTE_PER_LCORE(dp_vs_conn_lock)) //per-lcore lock for hash table operations
#endif
#define this_conn_count (RTE_PER_LCORE(dp_vs_conn_count)) //per-lcore connection counter
#define this_conn_cache (dp_vs_conn_cache[rte_socket_id()]) //per-NUMA-socket mempool cache for connection entries
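These this_* helpers wrap DPDK per-lcore variables: every forwarding lcore owns a private connection table and counter, so the fast path normally needs no locking. A sketch of the underlying definitions (simplified; the per-NUMA array size macro is assumed here):
#include <rte_per_lcore.h>
#include <rte_mempool.h>

/* Sketch of the definitions behind the helpers above (simplified). */
static RTE_DEFINE_PER_LCORE(struct list_head *, dp_vs_conn_tbl); /* per-lcore hash buckets */
static RTE_DEFINE_PER_LCORE(uint32_t, dp_vs_conn_count);         /* per-lcore connection counter */
static struct rte_mempool *dp_vs_conn_cache[DPVS_MAX_SOCKET];    /* one mempool per NUMA node (size macro assumed) */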
4.1.2 Connection Structure
struct dp_vs_conn {
int af; //address family (IP version)
uint8_t proto; //protocol type
union inet_addr caddr; /* Client address */
union inet_addr vaddr; /* Virtual address */
union inet_addr laddr; /* director Local address */
union inet_addr daddr; /* Destination (RS) address */
uint16_t cport; //client port
uint16_t vport; //virtual (service) port
uint16_t lport; //director local port
uint16_t dport; //real server port
struct rte_mempool *connpool; //connection mempool
struct conn_tuple_hash tuplehash[DPVS_CONN_DIR_MAX]; //tuple hash entries for both directions
rte_atomic32_t refcnt; //connection reference count
struct dpvs_timer timer; //expire timer
struct timeval timeout; //timeout value
lcoreid_t lcore; //owning lcore
struct dp_vs_dest *dest; /* real server */
void *prot_data; /* protocol specific data */
/* for FNAT */
struct dp_vs_laddr *local; /* local address */
struct dp_vs_seq fnat_seq; //FNAT sequence-number info
/* save last SEQ/ACK from RS for RST when conn expire */
uint32_t rs_end_seq; //last sequence number from the real server
uint32_t rs_end_ack; //last ack number from the real server
int (*packet_xmit)(struct dp_vs_proto *prot,
struct dp_vs_conn *conn,
struct rte_mbuf *mbuf); //transmit inbound traffic
int (*packet_out_xmit)(struct dp_vs_proto *prot,
struct dp_vs_conn *conn,
struct rte_mbuf *mbuf); //transmit outbound traffic
/* L2 fast xmit */
struct rte_ether_addr in_smac; //inbound source MAC address
struct rte_ether_addr in_dmac; //inbound destination MAC address
struct rte_ether_addr out_smac; //outbound source MAC address
struct rte_ether_addr out_dmac; //outbound destination MAC address
/* route for neighbour */
struct netif_port *in_dev; /* inside to rs */ //interface towards the real server
struct netif_port *out_dev; /* outside to client */ //interface towards the client
union inet_addr in_nexthop; /* to rs */ //next hop towards the real server
union inet_addr out_nexthop; /* to client */ //next hop towards the client
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
/* statistics */
struct dp_vs_conn_stats stats; //per-connection statistics
#endif
/* synproxy related members */
struct dp_vs_seq syn_proxy_seq; /* seq used in synproxy */
struct list_head ack_mbuf; /* ack mbuf saved in step2 */ //list of ACK mbufs saved in synproxy step 2
uint16_t ack_num; /* ack mbuf number stored */
uint8_t wscale_vs; /* outbound wscale factor to client */
uint8_t wscale_rs; /* outbound wscale factor from rs */
struct rte_mbuf *syn_mbuf; /* saved rs syn packet for retransmission */ //saved SYN packet for retransmission to the RS
rte_atomic32_t syn_retry_max; /* max number of SYN retransmissions */
/* add for stopping ack storm */
uint32_t last_seq; /* seq of the last ack packet */
uint32_t last_ack_seq; /* ack seq of the last ack packet */
rte_atomic32_t dup_ack_cnt; /* count of repeated ack packets */
uint8_t pp_version; /* proxy protocol version */
uint8_t pp_sent; /* whether proxy protocol data has been sent */
/* flags and state transition */
volatile uint16_t flags; //connection flags
volatile uint16_t state; //current state
volatile uint16_t old_state; /* old state, to be used for state transition
triggered synchronization */
/* control members */
struct dp_vs_conn *control; /* master connection that controls me */
rte_atomic32_t n_control; /* number of connections controlled by me */
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
uint64_t ctime; /* create time */ //connection creation time
#endif
/* connection redirect in fnat/snat/nat modes */
struct dp_vs_redirect *redirect; //pointer to connection redirect info
} __rte_cache_aligned;
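The two tuplehash[] entries (inbound and outbound) are what actually get linked into the per-lcore hash table, so a packet seen from either direction hashes to one of them and the owning connection is recovered with container_of(). A sketch of that helper, consistent with the structure above:
/* Sketch: recover the owning connection from either tuple hash entry;
 * thash->direct records which element of tuplehash[] this entry is. */
static inline struct dp_vs_conn *
tuplehash_to_conn(const struct conn_tuple_hash *thash)
{
    return container_of(thash, struct dp_vs_conn,
                        tuplehash[thash->direct]);
}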
4.1.3 Connection States
TCP:
enum {
DPVS_TCP_S_NONE = 0,
DPVS_TCP_S_ESTABLISHED,
DPVS_TCP_S_SYN_SENT,
DPVS_TCP_S_SYN_RECV,
DPVS_TCP_S_FIN_WAIT,
DPVS_TCP_S_TIME_WAIT,
DPVS_TCP_S_CLOSE,
DPVS_TCP_S_CLOSE_WAIT,
DPVS_TCP_S_LAST_ACK,
DPVS_TCP_S_LISTEN,
DPVS_TCP_S_SYNACK,
DPVS_TCP_S_LAST
};
UDP:
enum {
DPVS_UDP_S_NONE = 0,
DPVS_UDP_S_ONEWAY,
DPVS_UDP_S_NORMAL,
DPVS_UDP_S_LAST
};
ICMP:
enum {
DPVS_ICMP_S_NORMAL = 0,
DPVS_ICMP_S_LAST
};
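Each protocol pairs its state enum with a timeout table indexed by state; dp_vs_conn_set_timeout() in section 4.2.3 reads pp->timeout_table[conn->state] from it. A sketch for TCP, with purely illustrative values (the real defaults live in the TCP protocol module and can be tuned in dpvs.conf):
/* Sketch: per-state timeout table (seconds), indexed by the TCP enum above.
 * The values here are illustrative, not DPVS's shipped defaults. */
static int tcp_timeouts[DPVS_TCP_S_LAST + 1] = {
    [DPVS_TCP_S_NONE]        = 2,
    [DPVS_TCP_S_ESTABLISHED] = 90,
    [DPVS_TCP_S_SYN_SENT]    = 3,
    [DPVS_TCP_S_SYN_RECV]    = 30,
    [DPVS_TCP_S_FIN_WAIT]    = 7,
    [DPVS_TCP_S_TIME_WAIT]   = 7,
    [DPVS_TCP_S_CLOSE]       = 3,
    [DPVS_TCP_S_CLOSE_WAIT]  = 7,
    [DPVS_TCP_S_LAST_ACK]    = 7,
    [DPVS_TCP_S_LISTEN]      = 120,
    [DPVS_TCP_S_SYNACK]      = 30,
    [DPVS_TCP_S_LAST]        = 2,
};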
4.2 Core Flow Analysis
4.2.1 Connection Tracking Initialization
int dp_vs_conn_init(void)
{
int i, err;
lcoreid_t lcore;
char poolname[32];
/* init connection template table */
//allocate the connection template table
dp_vs_ct_tbl = rte_malloc(NULL, sizeof(struct list_head) * DPVS_CONN_TBL_SIZE,
RTE_CACHE_LINE_SIZE);
if (!dp_vs_ct_tbl) {
err = EDPVS_NOMEM;
RTE_LOG(WARNING, IPVS, "%s: %s.\n",
__func__, dpvs_strerror(err));
return err;
}
//initialize the connection template table
for (i = 0; i < DPVS_CONN_TBL_SIZE; i++)
INIT_LIST_HEAD(&dp_vs_ct_tbl[i]);
rte_spinlock_init(&dp_vs_ct_lock);
/*
* unlike linux per_cpu() which can assign CPU number,
* RTE_PER_LCORE() can only access own instances.
* it make codes looks strange.
*/
//initialize each worker lcore's local connection table
rte_eal_mp_remote_launch(conn_init_lcore, NULL, SKIP_MAIN);
RTE_LCORE_FOREACH_WORKER(lcore) {
if ((err = rte_eal_wait_lcore(lcore)) < 0) {
RTE_LOG(WARNING, IPVS, "%s: lcore %d: %s.\n",
__func__, lcore, dpvs_strerror(err));
}
}
//initialize the connection control module
conn_ctrl_init();
//create the connection cache (mempool) on each NUMA node
/* connection cache on each NUMA socket */
for (i = 0; i < get_numa_nodes(); i++) {
snprintf(poolname, sizeof(poolname), "dp_vs_conn_%d", i);
dp_vs_conn_cache[i] = rte_mempool_create(poolname,
conn_pool_size,
sizeof(struct dp_vs_conn),
conn_pool_cache,
0, NULL, NULL, NULL, NULL,
i, 0);
if (!dp_vs_conn_cache[i]) {
err = EDPVS_NOMEM;
goto cleanup;
}
}
//generate a random number (used as the connection hash seed)
dp_vs_conn_rnd = (uint32_t)random();
return EDPVS_OK;
cleanup:
dp_vs_conn_term();
return err;
}
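The per-lcore tables themselves are created by conn_init_lcore(), which dp_vs_conn_init() launches on every worker lcore above. A simplified sketch of what each worker does (the real function also skips lcores that do not forward traffic):
/* Simplified sketch of the per-lcore initialization launched above. */
static int conn_init_lcore(void *arg)
{
    int i;

    /* allocate this lcore's private hash table */
    this_conn_tbl = rte_malloc(NULL,
            sizeof(struct list_head) * DPVS_CONN_TBL_SIZE,
            RTE_CACHE_LINE_SIZE);
    if (!this_conn_tbl)
        return EDPVS_NOMEM;

    /* initialize every bucket's list head */
    for (i = 0; i < DPVS_CONN_TBL_SIZE; i++)
        INIT_LIST_HEAD(&this_conn_tbl[i]);

    this_conn_count = 0;
    return EDPVS_OK;
}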
4.2.2 Packet Processing Flow
① Look up the connection table
static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state, int af)
{
struct dp_vs_iphdr iph;
struct dp_vs_proto *prot;
struct dp_vs_conn *conn;
int dir, verdict, err, related;
bool drop = false;
lcoreid_t cid, peer_cid;
eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
assert(mbuf && state);
cid = peer_cid = rte_lcore_id();
//the packet is not addressed to this host, let it pass
if (unlikely(etype != ETH_PKT_HOST))
return INET_ACCEPT;
if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK)
return INET_ACCEPT;
//handle ICMP messages
if (unlikely(iph.proto == IPPROTO_ICMP ||
iph.proto == IPPROTO_ICMPV6)) {
/* handle related ICMP error to existing conn */
verdict = dp_vs_in_icmp(af, mbuf, &related);
if (related || verdict != INET_ACCEPT)
return verdict;
/* let unrelated and valid ICMP goes down,
* may implement ICMP fwd in the futher. */
}
//look up the L4 protocol handler; currently TCP, UDP and ICMP are implemented
prot = dp_vs_proto_lookup(iph.proto);
if (unlikely(!prot))
return INET_ACCEPT;
/*
* Defrag ipvs-forwarding TCP/UDP is not supported for some reasons,
*
* - RSS/flow-director do not support TCP/UDP fragments, means it's
* not able to direct frags to same lcore as original TCP/UDP packets.
* - per-lcore conn table will miss if frags reachs wrong lcore.
*
* If we redirect frags to "correct" lcore, it may cause performance
* issue. Also it need to understand RSS algorithm. Moreover, for the
* case frags in same flow are not occur in same lcore, a global lock is
* needed, which is not a good idea.
*/
if (af == AF_INET && ip4_is_frag(ip4_hdr(mbuf))) {
RTE_LOG(DEBUG, IPVS, "%s: frag not support.\n", __func__);
return INET_DROP;
}
/* packet belongs to existing connection ? */
//look up the connection in the flow (connection) table
conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid);
...
}
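conn_lookup is protocol specific, but the idea is the same for TCP and UDP: hash the packet's five-tuple into this lcore's table and compare it against the tuple hash entries in that bucket, which cover both directions of every connection. A sketch of that idea, IPv4 only and with a hash computation that differs from DPVS's real one (tuplehash_to_conn() is the helper sketched in section 4.1.2):
/* Sketch of the lookup idea (IPv4 only; the real hash differs). */
static struct dp_vs_conn *
conn_lookup_sketch(uint8_t proto,
                   const union inet_addr *saddr, uint16_t sport,
                   const union inet_addr *daddr, uint16_t dport)
{
    struct conn_tuple_hash *tuphash;
    uint32_t hash;

    hash = rte_jhash_3words(saddr->in.s_addr, daddr->in.s_addr,
                            ((uint32_t)sport << 16) | dport,
                            dp_vs_conn_rnd) % DPVS_CONN_TBL_SIZE;

    list_for_each_entry(tuphash, &this_conn_tbl[hash], list) {
        if (tuphash->proto == proto &&
            tuphash->sport == sport && tuphash->dport == dport &&
            inet_addr_equal(AF_INET, &tuphash->saddr, saddr) &&
            inet_addr_equal(AF_INET, &tuphash->daddr, daddr))
            return tuplehash_to_conn(tuphash); /* hit from either direction */
    }
    return NULL;
}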
② Create a new connection
static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state, int af)
{
...
if (unlikely(!conn)) {
/* try schedule RS and create new connection */
//if no session is found, conn_sched picks a backend RS for the request and creates a connection
if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
/* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
return verdict;
}
/* only SNAT triggers connection by inside-outside traffic. */
//in SNAT mode the traffic is an internal server accessing an external service (internal server --> dpvs --> external server), so set dir = DPVS_CONN_DIR_OUTBOUND
if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
dir = DPVS_CONN_DIR_OUTBOUND;
else
dir = DPVS_CONN_DIR_INBOUND;
} else {
/* assert(conn->dest != NULL); */
if (prot->conn_expire_quiescent && (conn->flags & DPVS_CONN_F_EXPIRE_QUIESCENT) &&
conn->dest && (!dp_vs_dest_is_avail(conn->dest) ||
rte_atomic16_read(&conn->dest->weight) == 0)) {
RTE_LOG(INFO, IPVS, "%s: the conn is quiescent, expire it right now,"
" and drop the packet!\n", __func__);
prot->conn_expire_quiescent(conn);
dp_vs_conn_put(conn);
return INET_DROP;
}
}
...
}
Taking TCP as an example:
tcp_conn_sched
/* set @verdict if failed to schedule */
static int tcp_conn_sched(struct dp_vs_proto *proto,
const struct dp_vs_iphdr *iph,
struct rte_mbuf *mbuf,
struct dp_vs_conn **conn,
int *verdict)
{
struct tcphdr *th, _tcph;
struct dp_vs_service *svc;
assert(proto && iph && mbuf && conn && verdict);
//get the TCP header; normally just a pointer operation, no data copy
th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
if (unlikely(!th)) {
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
/* Syn-proxy step 2 logic: receive client's 3-handshake ack packet */
/* When synproxy disabled, only SYN packets can arrive here.
* So don't judge SYNPROXY flag here! If SYNPROXY flag judged, and syn_proxy
* got disbled and keepalived reloaded, SYN packets for RS may never be sent. */
if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0) {
/* Attention: First ACK packet is also stored in conn->ack_mbuf */
return EDPVS_PKTSTOLEN;
}
/* only TCP-SYN without other flag can be scheduled */
//only SYN packets may create a new connection; anything else is rejected
if (!th->syn || th->ack || th->fin || th->rst) {
#ifdef CONFIG_DPVS_IPVS_DEBUG
char dbuf[64], sbuf[64];
const char *daddr, *saddr;
daddr = inet_ntop(iph->af, &iph->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::";
saddr = inet_ntop(iph->af, &iph->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::";
RTE_LOG(DEBUG, IPVS,
"%s: [%d] try sched non-SYN packet: [%c%c%c%c] %s/%d->%s/%d\n",
__func__, rte_lcore_id(),
th->syn ? 'S' : '.', th->fin ? 'F' : '.',
th->ack ? 'A' : '.', th->rst ? 'R' : '.',
saddr, ntohs(th->source), daddr, ntohs(th->dest));
#endif
/* Drop tcp packet which is send to vip and !vport */
if (g_defence_tcp_drop &&
(svc = dp_vs_vip_lookup(iph->af, iph->proto,
&iph->daddr, rte_lcore_id()))) {
dp_vs_estats_inc(DEFENCE_TCP_DROP);
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
*verdict = INET_ACCEPT;
return EDPVS_INVAL;
}
//look up the service by the request's destination address and port; without a matching service the packet is not scheduled
svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, th->dest,
0, mbuf, NULL, rte_lcore_id());
if (!svc) {
/* Drop tcp packet which is send to vip and !vport */
if (g_defence_tcp_drop &&
(svc = dp_vs_vip_lookup(iph->af, iph->proto,
&iph->daddr, rte_lcore_id()))) {
dp_vs_estats_inc(DEFENCE_TCP_DROP);
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
*verdict = INET_ACCEPT;
return EDPVS_NOSERV;
}
//pick an RS for this service and create the connection
*conn = dp_vs_schedule(svc, iph, mbuf, false);
if (!*conn) {
*verdict = INET_DROP;
return EDPVS_RESOURCE;
}
return EDPVS_OK;
}
dp_vs_schedule
/* select an RS by service's scheduler and create a connection */
struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc,
const struct dp_vs_iphdr *iph,
struct rte_mbuf *mbuf,
bool is_synproxy_on)
{
uint16_t _ports[2], *ports; /* sport, dport */
struct dp_vs_dest *dest;
struct dp_vs_conn *conn;
struct dp_vs_conn_param param;
uint32_t flags = 0;
assert(svc && iph && mbuf);
//extract the source and destination ports from the mbuf
ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
if (!ports)
return NULL;
/* persistent service */
//if the service is marked persistent
if (svc->flags & DP_VS_SVC_F_PERSISTENT)
return dp_vs_sched_persist(svc, iph, mbuf, is_synproxy_on);
//pick a real server with the configured scheduling algorithm (commonly wrr, rr, wlc); the returned dest is the backend RS
dest = svc->scheduler->schedule(svc, mbuf, iph);
if (!dest) {
RTE_LOG(INFO, IPVS, "%s: no dest found.\n", __func__);
#ifdef CONFIG_DPVS_MBUF_DEBUG
dp_vs_mbuf_dump("found dest failed.", iph->af, mbuf);
#endif
return NULL;
}
if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
return dp_vs_snat_schedule(dest, iph, ports, mbuf);
//handle the ICMP protocol
if (unlikely(iph->proto == IPPROTO_ICMP)) {
struct icmphdr *ich, _icmph;
ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
if (!ich)
return NULL;
ports = _ports;
_ports[0] = icmp4_id(ich);
_ports[1] = ich->type << 8 | ich->code;
//fill in the parameters used to create the new connection
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->saddr, &iph->daddr,
ports[0], ports[1], 0, &param);
} else if (unlikely(iph->proto == IPPROTO_ICMPV6)) {
struct icmp6_hdr *ic6h, _ic6hp;
ic6h = mbuf_header_pointer(mbuf, iph->len, sizeof(_ic6hp), &_ic6hp);
if (!ic6h)
return NULL;
ports = _ports;
_ports[0] = icmp6h_id(ic6h);
_ports[1] = ic6h->icmp6_type << 8 | ic6h->icmp6_code;
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->daddr, &dest->addr,
ports[1], ports[0],
0, &param);
} else {
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->saddr, &iph->daddr,
ports[0], ports[1], 0, &param);
}
if (is_synproxy_on)
flags |= DPVS_CONN_F_SYNPROXY;
if (svc->flags & DP_VS_SVC_F_ONEPACKET && iph->proto == IPPROTO_UDP)
flags |= DPVS_CONN_F_ONE_PACKET;
if (svc->flags & DP_VS_SVC_F_EXPIRE_QUIESCENT)
flags |= DPVS_CONN_F_EXPIRE_QUIESCENT;
//create the new connection
conn = dp_vs_conn_new(mbuf, iph, &param, dest, flags);
if (!conn)
return NULL;
//update connection statistics
dp_vs_stats_conn(conn);
return conn;
}
dp_vs_conn_new
struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
const struct dp_vs_iphdr *iph,
struct dp_vs_conn_param *param,
struct dp_vs_dest *dest, uint32_t flags)
{
struct dp_vs_conn *new;
struct conn_tuple_hash *t;
uint16_t rport;
__be16 _ports[2], *ports;
int err;
assert(mbuf && param && dest);
//allocate memory for the new connection
new = dp_vs_conn_alloc(dest->fwdmode, flags);
if (unlikely(!new))
return NULL;
new->flags = flags;
/* set proper RS port */
//template connection, or the template destination port is non-zero
if (dp_vs_conn_is_template(new) || param->ct_dport != 0)
rport = param->ct_dport;
else if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
if (unlikely(param->proto == IPPROTO_ICMP ||
param->proto == IPPROTO_ICMPV6)) {
rport = param->vport;
} else {
//extract the ports from the mbuf
ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
if (unlikely(!ports)) {
RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__);
goto errout;
}
rport = ports[0];
}
} else {
rport = dest->port;
}
/* init inbound conn tuple hash */
t = &tuplehash_in(new);
t->direct = DPVS_CONN_DIR_INBOUND; //inbound traffic
t->af = param->af;
t->proto = param->proto;
t->saddr = *param->caddr; //source address is the external client address
t->sport = param->cport;
t->daddr = *param->vaddr; //destination address is the service virtual IP (VIP)
t->dport = param->vport;
INIT_LIST_HEAD(&t->list);
/* init outbound conn tuple hash */
t = &tuplehash_out(new);
t->direct = DPVS_CONN_DIR_OUTBOUND; //outbound traffic
t->af = dest->af;
t->proto = param->proto;
if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
t->saddr = iph->saddr;
} else {
t->saddr = dest->addr;
}
t->sport = rport;
t->daddr = *param->caddr; /* non-FNAT */
t->dport = param->cport; /* non-FNAT */
INIT_LIST_HEAD(&t->list);
/* init connection */
new->af = param->af;
new->proto = param->proto;
new->caddr = *param->caddr;
new->cport = param->cport;
new->vaddr = *param->vaddr;
new->vport = param->vport;
new->laddr = *param->caddr; /* non-FNAT */
new->lport = param->cport; /* non-FNAT */
if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
new->daddr = iph->saddr;
else
new->daddr = dest->addr;
new->dport = rport;
if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
new->pp_version = dest->svc->proxy_protocol;
new->pp_sent = 0;
}
/* neighbour confirm cache */
if (AF_INET == tuplehash_in(new).af) {
new->in_nexthop.in.s_addr = htonl(INADDR_ANY);
} else {
new->in_nexthop.in6 = in6addr_any;
}
if (AF_INET == tuplehash_out(new).af) {
new->out_nexthop.in.s_addr = htonl(INADDR_ANY);
} else {
new->out_nexthop.in6 = in6addr_any;
}
new->in_dev = NULL;
new->out_dev = NULL;
/* Controll member */
new->control = NULL;
rte_atomic32_clear(&new->n_control);
/* caller will use it right after created,
* just like dp_vs_conn_get(). */
rte_atomic32_set(&new->refcnt, 1);
new->state = 0;
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
new->ctime = rte_rdtsc();
#endif
/* bind destination and corresponding trasmitter */
//set up the transmit handlers that match the forwarding mode
err = dp_vs_conn_bind_dest(new, dest);
if (err != EDPVS_OK) {
RTE_LOG(WARNING, IPVS, "%s: fail to bind dest: %s\n",
__func__, dpvs_strerror(err));
goto errout;
}
/* FNAT only: select and bind local address/port */
if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
//bind the load balancer's local address/port
if ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK)
goto unbind_dest;
}
/* init redirect if it exists */
//initialize the redirect info
dp_vs_redirect_init(new);
/* add to hash table (dual dir for each bucket) */
//add the connection to the hash table
if ((err = dp_vs_conn_hash(new)) != EDPVS_OK)
goto unbind_laddr;
/* timer */
//set the connection's initial timeout
new->timeout.tv_sec = conn_init_timeout;
new->timeout.tv_usec = 0;
/* synproxy */
INIT_LIST_HEAD(&new->ack_mbuf);
rte_atomic32_set(&new->syn_retry_max, 0);
rte_atomic32_set(&new->dup_ack_cnt, 0);
if ((flags & DPVS_CONN_F_SYNPROXY) && !dp_vs_conn_is_template(new)) {
struct tcphdr _tcph, *th = NULL;
struct dp_vs_synproxy_ack_pakcet *ack_mbuf;
struct dp_vs_proto *pp;
th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
if (!th) {
RTE_LOG(ERR, IPVS, "%s: get tcphdr failed\n", __func__);
goto unbind_laddr;
}
/* save ack packet */
if (unlikely(rte_mempool_get(this_ack_mbufpool, (void **)&ack_mbuf) != 0)) {
RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__);
goto unbind_laddr;
}
ack_mbuf->mbuf = mbuf;
list_add_tail(&ack_mbuf->list, &new->ack_mbuf);
new->ack_num++;
sp_dbg_stats32_inc(sp_ack_saved);
/* save ack_seq - 1 */
new->syn_proxy_seq.isn =
htonl((uint32_t) ((ntohl(th->ack_seq) - 1)));
/* save ack_seq */
new->fnat_seq.fdata_seq = ntohl(th->ack_seq);
/* FIXME: use DP_VS_TCP_S_SYN_SENT for syn */
pp = dp_vs_proto_lookup(param->proto);
new->timeout.tv_sec = pp->timeout_table[new->state = DPVS_TCP_S_SYN_SENT];
}
/* schedule conn timer */
#ifdef CONFIG_TIMER_DEBUG
snprintf(new->timer.name, sizeof(new->timer.name), "%s", "conn");
#endif
//apply a random delay to the timeout
dpvs_time_rand_delay(&new->timeout, 1000000);
//attach the connection to a timer so its timeout can be managed
dp_vs_conn_attach_timer(new, true);
#ifdef CONFIG_DPVS_IPVS_DEBUG
conn_dump("new conn: ", new);
#endif
return new;
unbind_laddr:
dp_vs_laddr_unbind(new);
unbind_dest:
dp_vs_conn_unbind_dest(new, true);
errout:
dp_vs_conn_free(new);
return NULL;
}
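dp_vs_conn_hash() called above links both tuple hash entries of the new connection into this lcore's table, so later packets from either direction find the same conn. A conceptual sketch; conn_tuple_hashkey() is a placeholder name for the bucket-index computation, and the real code also guards against hashing the same connection twice:
/* Conceptual sketch of what dp_vs_conn_hash() does; conn_tuple_hashkey()
 * is a placeholder for the real bucket-index computation. */
static int conn_hash_sketch(struct dp_vs_conn *conn)
{
    uint32_t ihash = conn_tuple_hashkey(&tuplehash_in(conn));
    uint32_t ohash = conn_tuple_hashkey(&tuplehash_out(conn));

    /* both directions are linked into the same per-lcore table */
    list_add(&tuplehash_in(conn).list,  &this_conn_tbl[ihash]);
    list_add(&tuplehash_out(conn).list, &this_conn_tbl[ohash]);

    rte_atomic32_inc(&conn->refcnt);   /* the table holds a reference */
    return EDPVS_OK;
}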
③ Update the connection state
static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state, int af)
{
...
//TCP state transition
if (prot->state_trans) {
err = prot->state_trans(prot, conn, mbuf, dir);
if (err != EDPVS_OK)
RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
}
conn->old_state = conn->state;
...
}
Taking TCP as an example:
tcp_state_trans
static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn,
struct rte_mbuf *mbuf, int dir)
{
struct tcphdr *th, _tcph;
int idx, off;
int new_state = DPVS_TCP_S_CLOSE;
assert(proto && conn && mbuf);
struct dp_vs_dest *dest = conn->dest;
int af = conn->af;
#ifdef CONFIG_DPVS_IPVS_DEBUG
char dbuf[64], cbuf[64];
const char *daddr, *caddr;
#endif
//determine the address family
if (dir == DPVS_CONN_DIR_INBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT)
af = tuplehash_in(conn).af;
else if (dir == DPVS_CONN_DIR_OUTBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT)
af = tuplehash_out(conn).af;
int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf));
//get the TCP header from the packet buffer
th = mbuf_header_pointer(mbuf, iphdrlen, sizeof(_tcph), &_tcph);
if (unlikely(!th))
return EDPVS_INVPKT;
//determine the state transition table offset from the forwarding mode and packet direction
if (dest->fwdmode == DPVS_FWD_MODE_DR || dest->fwdmode == DPVS_FWD_MODE_TUNNEL)
off = 8;
else if (dir == DPVS_CONN_DIR_INBOUND)
off = 0;
else if (dir == DPVS_CONN_DIR_OUTBOUND)
off = 4;
else
return EDPVS_NOTSUPP; /* do not support INPUT_ONLY now */
//get the state transition index from the TCP flags
if ((idx = tcp_state_idx(th)) < 0) {
RTE_LOG(DEBUG, IPVS, "tcp_state_idx=%d !\n", idx);
goto tcp_state_out;
}
//compute the new TCP state
new_state = tcp_states[off + idx].next_state[conn->state];
tcp_state_out:
//if the state did not change, return
if (new_state == conn->state)
return EDPVS_OK;
/* state changed */
//log the state transition (debug builds only)
#ifdef CONFIG_DPVS_IPVS_DEBUG
daddr = inet_ntop(tuplehash_out(conn).af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::";
caddr = inet_ntop(tuplehash_in(conn).af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::";
RTE_LOG(DEBUG, IPVS, "state trans: %s %s [%c%c%c%c] %s:%u->%s:%u "
" state %s->%s conn.refcnt %d\n",
proto->name, dir == DPVS_CONN_DIR_OUTBOUND ? "out" : "in",
th->syn ? 'S' : '.', th->fin ? 'F' : '.',
th->ack ? 'A' : '.', th->rst ? 'R' : '.',
caddr, ntohs(conn->cport),
daddr, ntohs(conn->dport),
tcp_state_name(conn->state),
tcp_state_name(new_state),
rte_atomic32_read(&conn->refcnt));
#endif
//record the old state
conn->old_state = conn->state; // old_state called when connection reused
//switch to the new state
conn->state = new_state;
//set the connection timeout according to the new state
dp_vs_conn_set_timeout(conn, proto);
//track backend server health
if (new_state == DPVS_TCP_S_CLOSE && conn->old_state == DPVS_TCP_S_SYN_RECV)
dp_vs_dest_detected_dead(conn->dest); // connection reset by dest
else if (new_state == DPVS_TCP_S_ESTABLISHED)
dp_vs_dest_detected_alive(conn->dest);
//update the server's active/inactive connection counters
if (dest) {
if (!(conn->flags & DPVS_CONN_F_INACTIVE)
&& (new_state != DPVS_TCP_S_ESTABLISHED)) {
rte_atomic32_dec(&dest->actconns);
rte_atomic32_inc(&dest->inactconns);
conn->flags |= DPVS_CONN_F_INACTIVE;
} else if ((conn->flags & DPVS_CONN_F_INACTIVE)
&& (new_state == DPVS_TCP_S_ESTABLISHED)) {
rte_atomic32_inc(&dest->actconns);
rte_atomic32_dec(&dest->inactconns);
conn->flags &= ~DPVS_CONN_F_INACTIVE;
}
}
return EDPVS_OK;
}
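tcp_state_idx() maps the packet's TCP flags to a row selector; combined with the direction offset off it indexes the transition table tcp_states[], whose entries give the next state for every current state. The flag-to-row helper looks essentially like this (RST has the highest priority, then SYN, FIN, ACK):
/* Flag-to-row mapping used to index the TCP state transition table
 * (sketch; consistent with the tcp_states[off + idx] lookup above). */
static inline int tcp_state_idx(struct tcphdr *th)
{
    if (th->rst)
        return 3;   /* RST row */
    if (th->syn)
        return 0;   /* SYN row */
    if (th->fin)
        return 1;   /* FIN row */
    if (th->ack)
        return 2;   /* ACK row */
    return -1;      /* no interesting flag: keep the current state */
}
With off = 0 for inbound, 4 for outbound, and 8 for DR/TUNNEL (input only), the new state is then simply tcp_states[off + idx].next_state[conn->state], exactly as in the listing above.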
4.2.3 Connection Timeout Management
4.2.3.1 DPVS's connection timeout management is built on a timer mechanism. The steps are:
● Set the timeout: when a connection is created, an initial timeout is chosen according to its type and protocol.
● Start the timer: the connection is attached to a timer; when the timer fires, the expiry handler runs.
● Refresh the timer: each time a new packet arrives for the connection, its timeout is refreshed to extend the connection's lifetime.
● Timeout handling: when the timer fires and the connection has seen no new packets, the expiry handler releases the connection's resources, updates the related statistics, and removes the connection from the tracking table.
4.2.3.2 In DPVS, a connection's timeout is determined mainly by the protocol type, the connection's life-cycle state, and user-configured timeout values.
4.2.3.3 Implementation
① Set the timeout
void dp_vs_conn_set_timeout(struct dp_vs_conn *conn, struct dp_vs_proto *pp)
{
unsigned conn_timeout = 0;
/* set proper timeout */
if ((conn->proto == IPPROTO_TCP && conn->state == DPVS_TCP_S_ESTABLISHED)
|| conn->proto == IPPROTO_UDP) {
conn_timeout = dp_vs_conn_get_timeout(conn);
if (conn_timeout > 0) {
conn->timeout.tv_sec = conn_timeout;
return;
}
}
if (pp && pp->timeout_table)
conn->timeout.tv_sec = pp->timeout_table[conn->state];
else
conn->timeout.tv_sec = 60;
}
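dp_vs_conn_get_timeout(), called above for established TCP connections and for UDP, returns the timeout configured on the service (if any); only when it returns 0 does the per-state table apply. A sketch, under the assumption that the configured value is kept on the service as conn_timeout:
/* Sketch: return the user-configured per-service timeout, or 0 if none.
 * The field name svc->conn_timeout is an assumption for illustration. */
static unsigned dp_vs_conn_get_timeout(struct dp_vs_conn *conn)
{
    if (conn && conn->dest && conn->dest->svc)
        return conn->dest->svc->conn_timeout;  /* user-configured value, may be 0 */
    return 0;                                  /* 0: fall back to the state table */
}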
② Start the timer
static void dp_vs_conn_attach_timer(struct dp_vs_conn *conn, bool lock)
{
int rc;
//check whether the connection is already attached to a timer
if (dp_vs_conn_is_in_timer(conn))
return;
//check whether the connection is a one-packet connection
if (conn->flags & DPVS_CONN_F_ONE_PACKET) {
return;
}
//check whether the connection is a template connection
if (dp_vs_conn_is_template(conn)) {
if (lock)
rc = dpvs_timer_sched(&conn->timer, &conn->timeout,
dp_vs_conn_expire, conn, true);
else
rc = dpvs_timer_sched_nolock(&conn->timer, &conn->timeout,
dp_vs_conn_expire, conn, true);
} else {
if (lock)
rc = dpvs_timer_sched(&conn->timer, &conn->timeout,
dp_vs_conn_expire, conn, false);
else
rc = dpvs_timer_sched_nolock(&conn->timer, &conn->timeout,
dp_vs_conn_expire, conn, false);
}
//mark the connection as attached to a timer
if (rc == EDPVS_OK)
dp_vs_conn_set_in_timer(conn);
}
③ Refresh the timer
Each time a new packet arrives for a connection, dp_vs_conn_set_timeout (listed in ① above) is called to update the connection's timeout and extend its lifetime.
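The refresh itself happens in the packet forwarding path rather than in this listing; conceptually, once a valid packet has been processed for a connection, the timeout for its current state is recomputed and the timer is re-armed, roughly as below. dp_vs_conn_refresh_timer() is the re-arming helper that also appears in dp_vs_conn_expire() in ④; wiring it this way is an assumption of this sketch:
/* Conceptual sketch (assumption): refresh a connection's timer after a
 * valid packet has been processed for it. */
static inline void conn_refresh_on_packet(struct dp_vs_conn *conn,
                                          struct dp_vs_proto *pp)
{
    dp_vs_conn_set_timeout(conn, pp);        /* timeout for the current state */
    dp_vs_conn_refresh_timer(conn, false);   /* push the expire timer back */
}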
④ Timeout handling
/* timeout hanlder */
static int dp_vs_conn_expire(void *priv)
{
struct dp_vs_conn *conn = priv;
struct dp_vs_proto *pp;
assert(conn);
assert(conn->af == AF_INET || conn->af == AF_INET6);
assert(rte_atomic32_read(&conn->refcnt) > 0);
pp = dp_vs_proto_lookup(conn->proto);
//reset the connection's timeout
dp_vs_conn_set_timeout(conn, pp);
//add a random delay so that many connections do not expire at the same moment
dpvs_time_rand_delay(&conn->timeout, 1000000);
//for non-one-packet connections, take a reference so the conn cannot be destroyed while the expire handler runs
if (!(conn->flags & DPVS_CONN_F_ONE_PACKET)) {
rte_atomic32_inc(&conn->refcnt);
}
//try to resend saved packets; if that succeeds, drop the reference and expire later
if (dp_vs_conn_resend_packets(conn, pp) == EDPVS_OK) {
/* expire later */
dp_vs_conn_put_nolock(conn);
return DTIMER_OK;
}
/* somebody is controlled by me, expire later */
//if this connection still controls other connections, drop the reference and expire later
if (rte_atomic32_read(&conn->n_control)) {
dp_vs_conn_put_nolock(conn);
return DTIMER_OK;
}
/* unhash it then no further user can get it,
* even we cannot del it now. */
//remove the connection from the hash table
dp_vs_conn_unhash(conn);
/* refcnt == 1 means we are the only referer.
* no one is using the conn and it's timed out. */
//refcnt == 1 means only the timer handler still references the connection
if (rte_atomic32_read(&conn->refcnt) == 1) {
//detach the connection from the timer
dp_vs_conn_detach_timer(conn, false);
/* I was controlled by someone */
if (conn->control)
dp_vs_control_del(conn);
if (pp && pp->conn_expire)
//protocol-specific cleanup
pp->conn_expire(pp, conn);
//release the sa_pool (local address/port) resource
dp_vs_conn_sa_release(conn);
//unbind from the destination real server
dp_vs_conn_unbind_dest(conn, false);
//unbind the local address
dp_vs_laddr_unbind(conn);
//free packets held by the connection
dp_vs_conn_free_packets(conn);
//drop the reference
rte_atomic32_dec(&conn->refcnt);
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
conn_stats_dump("del conn", conn);
#endif
#ifdef CONFIG_DPVS_IPVS_DEBUG
conn_dump("del conn: ", conn);
#endif
//free the connection object
dp_vs_conn_free(conn);
return DTIMER_STOP;
}
//if we get here the connection is still in use, so add it back to the flow table
dp_vs_conn_hash(conn);
/* some one is using it when expire,
* try del it again later */
//refresh the timeout and re-arm the timer
dp_vs_conn_refresh_timer(conn, false);
rte_atomic32_dec(&conn->refcnt);
return DTIMER_OK;
}