DPVS Study Notes: Connection Tracking


1. Basic Concepts

Connection tracking is a mechanism for recording and managing the state of network connections.

On a network, a connection is usually uniquely identified by five elements: source IP address, destination IP address, source port, destination port, and transport-layer protocol (TCP or UDP). Together these are called the five-tuple.
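For illustration, a five-tuple key can be modeled as a small struct like the one below. This is a hypothetical sketch, not DPVS's actual definition; DPVS keys its table with struct conn_tuple_hash, shown in section 4.1.2.

/* Hypothetical five-tuple key -- for illustration only. */
#include <stdint.h>
#include <netinet/in.h>

struct five_tuple {
    struct in_addr saddr;  /* source IP address */
    struct in_addr daddr;  /* destination IP address */
    uint16_t       sport;  /* source port (network byte order) */
    uint16_t       dport;  /* destination port (network byte order) */
    uint8_t        proto;  /* IPPROTO_TCP or IPPROTO_UDP */
};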

DPVS's connection tracking module tracks every connection that passes through it and records its state information, so that connections can be managed and controlled effectively.


2. Main Functions

① State management: DPVS's connection tracking module records the state of every network connection (for TCP, states such as SYN_SENT, SYN_RECV, and ESTABLISHED) and handles packets according to that state.

② NAT support: connection tracking is the foundation of NAT. Through the connection tracking module, DPVS records each connection's source address and port translation so that packets can be forwarded correctly (see the sketch after this list).

③ Session timeout management: DPVS sets a timeout for every connection; when a connection stays inactive for too long, its state is cleaned up automatically and its resources are released.

④ Load balancing: in load-balancing scenarios, connection tracking ensures that requests from the same client are forwarded to the same backend server, keeping the session consistent.
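To make ② concrete: once a connection entry exists, NAT forwarding is essentially a field rewrite driven by the tracked tuple. The following is a schematic sketch, not DPVS's actual xmit code; it uses the dp_vs_conn fields introduced in section 4.1.2 and DPDK's rte_ipv4_hdr/rte_tcp_hdr types.

/* Schematic NAT rewrite driven by tracked state -- not DPVS's actual code. */
static void nat_rewrite_inbound_sketch(struct dp_vs_conn *conn,
                                       struct rte_ipv4_hdr *iph,
                                       struct rte_tcp_hdr *th)
{
    /* client -> VIP becomes client -> RS: only the destination changes */
    iph->dst_addr = conn->daddr.in.s_addr; /* RS address from the conn entry */
    th->dst_port  = conn->dport;           /* RS port from the conn entry */
    /* IP and TCP checksums must be recomputed afterwards */
}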


3. How It Works

① Packet capture: DPVS captures network packets through DPDK and hands them to the connection tracking module for processing.

② Connection state identification: the connection tracking module parses the packet's protocol (TCP, UDP, ICMP, etc.) and identifies the connection's state according to the protocol type.

③ State table updates: DPVS maintains a connection state table that records the state information of every connection.

④ Packet handling: based on the information in the connection state table, DPVS processes each packet accordingly (NAT translation, load-balanced forwarding, etc.).

⑤ Timeout cleanup: DPVS periodically checks the connection state table, cleans up timed-out connections, and releases their resources. The overall flow is sketched below.
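Putting the five steps together, the per-packet control flow looks roughly like this. It is a condensed sketch with hypothetical helper names; the real logic lives in __dp_vs_in and is analyzed in section 4.2.2. Step ⑤, timeout cleanup, runs asynchronously from timers (see section 4.2.3).

/* Condensed per-packet flow -- a sketch, not DPVS's actual code. */
static int conn_track_sketch(struct rte_mbuf *mbuf)
{
    struct dp_vs_conn *conn;
    int dir, verdict;

    /* steps 1-2: packet already captured; look its tuple up in the table */
    conn = conn_lookup_sketch(mbuf, &dir);           /* hypothetical helper */

    if (!conn) {
        /* no entry yet: schedule an RS and create a new connection */
        conn = conn_schedule_sketch(mbuf, &verdict); /* hypothetical helper */
        if (!conn)
            return verdict;
    }

    /* step 3: drive the protocol state machine, refresh the timer */
    conn_state_trans_sketch(conn, mbuf, dir);        /* hypothetical helper */

    /* step 4: rewrite and forward according to the conn (NAT, FNAT, ...) */
    return conn_xmit_sketch(conn, mbuf, dir);        /* hypothetical helper */
}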


4. Implementation

4.1 Core Data Structures

4.1.1 Connection Tracking Table

/* helpers */
#define this_conn_tbl               (RTE_PER_LCORE(dp_vs_conn_tbl)) // per-lcore connection tracking table
#ifdef CONFIG_DPVS_IPVS_CONN_LOCK
#define this_conn_lock              (RTE_PER_LCORE(dp_vs_conn_lock)) // per-lcore hash-table lock
#endif
#define this_conn_count             (RTE_PER_LCORE(dp_vs_conn_count)) // per-lcore connection counter
#define this_conn_cache             (dp_vs_conn_cache[rte_socket_id()]) // per-NUMA-socket mempool for connection entries

4.1.2 Connection Structure

struct dp_vs_conn {
    int                     af;     /* address family (IP version) */
    uint8_t                 proto;  /* protocol type */
    union inet_addr         caddr;  /* client address */
    union inet_addr         vaddr;  /* virtual (VIP) address */
    union inet_addr         laddr;  /* director local address */
    union inet_addr         daddr;  /* destination (RS) address */
    uint16_t                cport;  /* client port */
    uint16_t                vport;  /* virtual port */
    uint16_t                lport;  /* director local port */
    uint16_t                dport;  /* destination (RS) port */

    struct rte_mempool      *connpool;  /* connection mempool */
    struct conn_tuple_hash  tuplehash[DPVS_CONN_DIR_MAX]; /* one tuple-hash entry per direction */
    rte_atomic32_t          refcnt;     /* reference count */
    struct dpvs_timer       timer;      /* expiration timer */
    struct timeval          timeout;    /* timeout value */
    lcoreid_t               lcore;      /* owning lcore */
    struct dp_vs_dest       *dest;      /* real server */
    void                    *prot_data; /* protocol-specific data */

    /* for FNAT */
    struct dp_vs_laddr      *local;     /* local address */
    struct dp_vs_seq        fnat_seq;   /* FNAT sequence-number info */

    /* last SEQ/ACK from the RS, kept to send RST when the conn expires */
    uint32_t                rs_end_seq; /* last sequence number from the RS */
    uint32_t                rs_end_ack; /* last acknowledgment number from the RS */

    int (*packet_xmit)(struct dp_vs_proto *prot,
                        struct dp_vs_conn *conn,
                        struct rte_mbuf *mbuf); /* transmit inbound traffic */
    int (*packet_out_xmit)(struct dp_vs_proto *prot,
                        struct dp_vs_conn *conn,
                        struct rte_mbuf *mbuf); /* transmit outbound traffic */

    /* L2 fast xmit */
    struct rte_ether_addr   in_smac;    /* inbound source MAC */
    struct rte_ether_addr   in_dmac;    /* inbound destination MAC */
    struct rte_ether_addr   out_smac;   /* outbound source MAC */
    struct rte_ether_addr   out_dmac;   /* outbound destination MAC */

    /* route for neighbour */
    struct netif_port       *in_dev;     /* inside, towards the RS */
    struct netif_port       *out_dev;    /* outside, towards the client */
    union inet_addr         in_nexthop;  /* next hop towards the RS */
    union inet_addr         out_nexthop; /* next hop towards the client */

#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
    /* statistics */
    struct dp_vs_conn_stats stats;      /* per-connection statistics */
#endif

    /* synproxy related members */
    struct dp_vs_seq syn_proxy_seq;     /* seq used in synproxy */
    struct list_head ack_mbuf;          /* list of ack mbufs saved in step 2 */
    uint16_t ack_num;                   /* number of ack mbufs stored */
    uint8_t wscale_vs;                  /* outbound wscale factor to client */
    uint8_t wscale_rs;                  /* outbound wscale factor from rs */
    struct rte_mbuf *syn_mbuf;          /* saved syn packet, kept for retransmission to the RS */
    rte_atomic32_t syn_retry_max;       /* max number of syn retransmissions */

    /* added to stop ack storms */
    uint32_t last_seq;                  /* seq of the last ack packet */
    uint32_t last_ack_seq;              /* ack seq of the last ack packet */
    rte_atomic32_t dup_ack_cnt;         /* count of repeated ack packets */

    uint8_t pp_version;                 /* proxy protocol version */
    uint8_t pp_sent;                    /* proxy protocol data has been sent */

    /* flags and state transition */
    volatile uint16_t       flags;      /* flag bits */
    volatile uint16_t       state;      /* current state */
    volatile uint16_t       old_state;  /* previous state, used for state-transition
                                           triggered synchronization */
    /* control members */
    struct dp_vs_conn *control;         /* master conn that controls this one */
    rte_atomic32_t n_control;           /* number of conns controlled by this one */
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
    uint64_t ctime;                     /* creation time */
#endif

    /* connection redirect in fnat/snat/nat modes */
    struct dp_vs_redirect  *redirect;   /* connection redirect info */

} __rte_cache_aligned;
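The tuplehash array gives each connection one lookup key per direction, so both client-side and RS-side packets hash to the same dp_vs_conn. Below is an abridged sketch of the per-direction entry and its accessor macros (field order and widths may differ from the actual header):

/* Abridged sketch of the per-direction hash entry embedded in dp_vs_conn. */
struct conn_tuple_hash {
    struct list_head list;    /* linkage into a this_conn_tbl bucket */
    int              af;      /* address family of this direction */
    uint8_t          proto;   /* transport protocol */
    union inet_addr  saddr;   /* source address as seen in this direction */
    union inet_addr  daddr;   /* destination address in this direction */
    uint16_t         sport;
    uint16_t         dport;
    uint8_t          direct;  /* DPVS_CONN_DIR_INBOUND or _OUTBOUND */
};

/* accessors used throughout the code, e.g. in dp_vs_conn_new() below */
#define tuplehash_in(c)   ((c)->tuplehash[DPVS_CONN_DIR_INBOUND])
#define tuplehash_out(c)  ((c)->tuplehash[DPVS_CONN_DIR_OUTBOUND])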

4.1.3 Connection States

TCP:

enum {
    DPVS_TCP_S_NONE         = 0,
    DPVS_TCP_S_ESTABLISHED,
    DPVS_TCP_S_SYN_SENT,
    DPVS_TCP_S_SYN_RECV,
    DPVS_TCP_S_FIN_WAIT,
    DPVS_TCP_S_TIME_WAIT,
    DPVS_TCP_S_CLOSE,
    DPVS_TCP_S_CLOSE_WAIT,
    DPVS_TCP_S_LAST_ACK,
    DPVS_TCP_S_LISTEN,
    DPVS_TCP_S_SYNACK,
    DPVS_TCP_S_LAST
};

UDP:

enum {
    DPVS_UDP_S_NONE     = 0,
    DPVS_UDP_S_ONEWAY,
    DPVS_UDP_S_NORMAL,
    DPVS_UDP_S_LAST
};

ICMP:

enum {
    DPVS_ICMP_S_NORMAL      = 0,
    DPVS_ICMP_S_LAST
};
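Each state enum is paired with a per-protocol timeout table indexed by state; this is the pp->timeout_table consulted by dp_vs_conn_set_timeout in section 4.2.3. An illustrative sketch for TCP follows (the values are examples, not necessarily DPVS's shipped defaults, which are tunable via configuration):

/* Illustrative per-state timeouts in seconds, indexed by the TCP state enum.
 * Values are examples only; the real table lives in tcp.c. */
static int tcp_timeouts[DPVS_TCP_S_LAST + 1] = {
    [DPVS_TCP_S_NONE]        = 2,
    [DPVS_TCP_S_ESTABLISHED] = 90,
    [DPVS_TCP_S_SYN_SENT]    = 3,
    [DPVS_TCP_S_SYN_RECV]    = 30,
    [DPVS_TCP_S_FIN_WAIT]    = 7,
    [DPVS_TCP_S_TIME_WAIT]   = 7,
    [DPVS_TCP_S_CLOSE]       = 3,
    [DPVS_TCP_S_CLOSE_WAIT]  = 7,
    [DPVS_TCP_S_LAST_ACK]    = 7,
    [DPVS_TCP_S_LISTEN]      = 120,
    [DPVS_TCP_S_SYNACK]      = 30,
    [DPVS_TCP_S_LAST]        = 2,
};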

4.2 Core Flow Analysis

4.2.1 Connection Tracking Initialization

int dp_vs_conn_init(void)
{
    int i, err;
    lcoreid_t lcore;
    char poolname[32];

    /* init connection template table */
    // allocate the connection template table
    dp_vs_ct_tbl = rte_malloc(NULL, sizeof(struct list_head) * DPVS_CONN_TBL_SIZE,
            RTE_CACHE_LINE_SIZE);
    if (!dp_vs_ct_tbl) {
        err = EDPVS_NOMEM;
        RTE_LOG(WARNING, IPVS, "%s: %s.\n",
            __func__, dpvs_strerror(err));
        return err;
    }

    // initialize the connection template table
    for (i = 0; i < DPVS_CONN_TBL_SIZE; i++)
        INIT_LIST_HEAD(&dp_vs_ct_tbl[i]);
    rte_spinlock_init(&dp_vs_ct_lock);

    /*
     * unlike linux per_cpu() which can take a CPU number,
     * RTE_PER_LCORE() can only access the lcore's own instance,
     * which makes the code look a bit strange.
     */
    // initialize each worker lcore's local connection table
    rte_eal_mp_remote_launch(conn_init_lcore, NULL, SKIP_MAIN);
    RTE_LCORE_FOREACH_WORKER(lcore) {
        if ((err = rte_eal_wait_lcore(lcore)) < 0) {
            RTE_LOG(WARNING, IPVS, "%s: lcore %d: %s.\n",
                    __func__, lcore, dpvs_strerror(err));
        }
    }

    // initialize the connection control module
    conn_ctrl_init();

    /* connection cache (mempool) on each NUMA socket */
    for (i = 0; i < get_numa_nodes(); i++) {
        snprintf(poolname, sizeof(poolname), "dp_vs_conn_%d", i);
        dp_vs_conn_cache[i] = rte_mempool_create(poolname,
                                    conn_pool_size,
                                    sizeof(struct dp_vs_conn),
                                    conn_pool_cache,
                                    0, NULL, NULL, NULL, NULL,
                                    i, 0);
        if (!dp_vs_conn_cache[i]) {
            err = EDPVS_NOMEM;
            goto cleanup;
        }
    }

    // generate the random seed used by the connection hash
    dp_vs_conn_rnd = (uint32_t)random();

    return EDPVS_OK;

cleanup:
    dp_vs_conn_term();
    return err;
}
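The per-lcore half of the initialization, launched through rte_eal_mp_remote_launch above, allocates each worker's private table. An abridged sketch of what conn_init_lcore does (error paths and idle-lcore checks trimmed):

/* Abridged sketch of the per-lcore init run on every worker lcore. */
static int conn_init_lcore(void *arg)
{
    int i;

    if (!rte_lcore_is_enabled(rte_lcore_id()))
        return EDPVS_DISABLED;

    /* allocate this worker's private bucket array */
    this_conn_tbl = rte_malloc(NULL,
            sizeof(struct list_head) * DPVS_CONN_TBL_SIZE,
            RTE_CACHE_LINE_SIZE);
    if (!this_conn_tbl)
        return EDPVS_NOMEM;

    for (i = 0; i < DPVS_CONN_TBL_SIZE; i++)
        INIT_LIST_HEAD(&this_conn_tbl[i]);

#ifdef CONFIG_DPVS_IPVS_CONN_LOCK
    rte_spinlock_init(&this_conn_lock);
#endif
    this_conn_count = 0;

    return EDPVS_OK;
}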

4.2.2 Packet Processing Flow

① Looking up the connection table

static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    struct dp_vs_iphdr iph;
    struct dp_vs_proto *prot;
    struct dp_vs_conn *conn;
    int dir, verdict, err, related;
    bool drop = false;
    lcoreid_t cid, peer_cid;
    eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
    assert(mbuf && state);

    cid = peer_cid = rte_lcore_id();

    // the packet is not addressed to this host, let it pass
    if (unlikely(etype != ETH_PKT_HOST))
        return INET_ACCEPT;

    if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK)
        return INET_ACCEPT;

    // handle ICMP messages
    if (unlikely(iph.proto == IPPROTO_ICMP ||
                 iph.proto == IPPROTO_ICMPV6)) {
        /* handle related ICMP error to existing conn */
        verdict = dp_vs_in_icmp(af, mbuf, &related);
        if (related || verdict != INET_ACCEPT)
            return verdict;
        /* let unrelated and valid ICMP go down;
         * ICMP fwd may be implemented in the future. */
    }

    // look up the L4 protocol handler; TCP, UDP and ICMP are implemented so far
    prot = dp_vs_proto_lookup(iph.proto);
    if (unlikely(!prot))
        return INET_ACCEPT;

    /*
     * Defragmentation of ipvs-forwarded TCP/UDP is not supported, because:
     *
     * - RSS/flow-director do not support TCP/UDP fragments, so fragments
     *   cannot be directed to the same lcore as the original TCP/UDP packets.
     * - the per-lcore conn table will miss if frags reach the wrong lcore.
     *
     * If we redirected frags to the "correct" lcore it could hurt performance,
     * and it would require understanding the RSS algorithm. Moreover, when
     * frags of the same flow do not land on the same lcore, a global lock
     * would be needed, which is not a good idea.
     */
    if (af == AF_INET && ip4_is_frag(ip4_hdr(mbuf))) {
        RTE_LOG(DEBUG, IPVS, "%s: frag not support.\n", __func__);
        return INET_DROP;
    }

    /* packet belongs to existing connection ? */
    // look up the connection in the per-lcore flow table
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid);

    ...
}
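Behind prot->conn_lookup, the TCP/UDP implementations hash the packet's tuple and walk the matching bucket of this_conn_tbl. Below is a simplified sketch (conn_hashkey_sketch is a hypothetical helper; the real code additionally consults the redirect table and may report a peer lcore):

/* Simplified tuple lookup -- a sketch, not DPVS's exact code. */
static struct dp_vs_conn *conn_lookup_sketch(int af, uint8_t proto,
        const union inet_addr *saddr, uint16_t sport,
        const union inet_addr *daddr, uint16_t dport, int *dir)
{
    struct conn_tuple_hash *t;
    uint32_t hash = conn_hashkey_sketch(af, saddr, sport, daddr, dport); /* hypothetical */

    list_for_each_entry(t, &this_conn_tbl[hash], list) {
        if (t->af == af && t->proto == proto &&
            inet_addr_equal(af, &t->saddr, saddr) && t->sport == sport &&
            inet_addr_equal(af, &t->daddr, daddr) && t->dport == dport) {
            /* container_of back to the owning dp_vs_conn */
            struct dp_vs_conn *conn = tuplehash_to_conn(t);
            *dir = t->direct;   /* the direction that matched is the flow dir */
            rte_atomic32_inc(&conn->refcnt);
            return conn;
        }
    }
    return NULL;
}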

② Creating a new connection

static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    ...
    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        // no session found: conn_sched picks a backend RS for the request and creates a connection
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }

        /* only SNAT triggers connection by inside-outside traffic. */
        // SNAT means an internal server accessing an external service (internal server --> dpvs --> external server), so set dir = DPVS_CONN_DIR_OUTBOUND
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else
            dir = DPVS_CONN_DIR_INBOUND;
    } else {
        /* assert(conn->dest != NULL); */
        if (prot->conn_expire_quiescent && (conn->flags & DPVS_CONN_F_EXPIRE_QUIESCENT) &&
                conn->dest && (!dp_vs_dest_is_avail(conn->dest) ||
                    rte_atomic16_read(&conn->dest->weight) == 0)) {
            RTE_LOG(INFO, IPVS, "%s: the conn is quiescent, expire it right now,"
                    " and drop the packet!\n", __func__);
            prot->conn_expire_quiescent(conn);
            dp_vs_conn_put(conn);
            return INET_DROP;
        }
    }

   ...
}

Taking TCP as an example:

tcp_conn_sched

/* set @verdict if failed to schedule */
static int tcp_conn_sched(struct dp_vs_proto *proto,
                          const struct dp_vs_iphdr *iph,
                          struct rte_mbuf *mbuf,
                          struct dp_vs_conn **conn,
                          int *verdict)
{
    struct tcphdr *th, _tcph;
    struct dp_vs_service *svc;

    assert(proto && iph && mbuf && conn && verdict);

    // get the TCP header; a pointer operation only, no data is copied
    th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
    if (unlikely(!th)) {
        *verdict = INET_DROP;
        return EDPVS_INVPKT;
    }

    /* Syn-proxy step 2 logic: receive client's 3-handshake ack packet */
    /* When synproxy is disabled, only SYN packets can arrive here.
     * So don't judge the SYNPROXY flag here! If the SYNPROXY flag were judged, and
     * syn_proxy got disabled and keepalived reloaded, SYN packets for the RS might never be sent. */
    if (dp_vs_synproxy_ack_rcv(iph->af, mbuf, th, proto, conn, iph, verdict) == 0) {
        /* Attention: First ACK packet is also stored in conn->ack_mbuf */
        return EDPVS_PKTSTOLEN;
    }

    /* only TCP-SYN without other flag can be scheduled */
    // a new connection may only be created by a pure SYN; anything else is rejected
    if (!th->syn || th->ack || th->fin || th->rst) {
#ifdef CONFIG_DPVS_IPVS_DEBUG
        char dbuf[64], sbuf[64];
        const char *daddr, *saddr;

        daddr = inet_ntop(iph->af, &iph->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::";
        saddr = inet_ntop(iph->af, &iph->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::";
        RTE_LOG(DEBUG, IPVS,
                "%s: [%d] try sched non-SYN packet: [%c%c%c%c] %s/%d->%s/%d\n",
                __func__, rte_lcore_id(),
                th->syn ? 'S' : '.', th->fin ? 'F' : '.',
                th->ack ? 'A' : '.', th->rst ? 'R' : '.',
                saddr, ntohs(th->source), daddr, ntohs(th->dest));
#endif

        /* Drop tcp packets which are sent to vip and !vport */
        if (g_defence_tcp_drop &&
                (svc = dp_vs_vip_lookup(iph->af, iph->proto,
                                    &iph->daddr, rte_lcore_id()))) {
            dp_vs_estats_inc(DEFENCE_TCP_DROP);
            *verdict = INET_DROP;
            return EDPVS_INVPKT;
        }

        *verdict = INET_ACCEPT;
        return EDPVS_INVAL;
    }

    // look up the service by the request's destination address and port; drop if not found
    svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, th->dest,
                               0, mbuf, NULL, rte_lcore_id());
    if (!svc) {
        /* Drop tcp packets which are sent to vip and !vport */
        if (g_defence_tcp_drop &&
                (svc = dp_vs_vip_lookup(iph->af, iph->proto,
                                   &iph->daddr, rte_lcore_id()))) {
            dp_vs_estats_inc(DEFENCE_TCP_DROP);
            *verdict = INET_DROP;
            return EDPVS_INVPKT;
        }
        *verdict = INET_ACCEPT;
        return EDPVS_NOSERV;
    }

    // pick an RS for this service and create the connection
    *conn = dp_vs_schedule(svc, iph, mbuf, false);
    if (!*conn) {
        *verdict = INET_DROP;
        return EDPVS_RESOURCE;
    }

    return EDPVS_OK;
}

dp_vs_schedule

/* select an RS by service's scheduler and create a connection */
struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc,
                                  const struct dp_vs_iphdr *iph,
                                  struct rte_mbuf *mbuf,
                                  bool is_synproxy_on)
{
    uint16_t _ports[2], *ports; /* sport, dport */
    struct dp_vs_dest *dest;
    struct dp_vs_conn *conn;
    struct dp_vs_conn_param param;
    uint32_t flags = 0;

    assert(svc && iph && mbuf);

    // extract the source and destination ports from the mbuf
    ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
    if (!ports)
        return NULL;

    /* persistent service */
    // the service is marked persistent
    if (svc->flags & DP_VS_SVC_F_PERSISTENT)
        return dp_vs_sched_persist(svc, iph,  mbuf, is_synproxy_on);

    // pick a real server with the configured algorithm (commonly wrr, rr, wlc); the returned dest is the backend RS
    dest = svc->scheduler->schedule(svc, mbuf, iph);
    if (!dest) {
        RTE_LOG(INFO, IPVS, "%s: no dest found.\n", __func__);
#ifdef CONFIG_DPVS_MBUF_DEBUG
        dp_vs_mbuf_dump("found dest failed.", iph->af, mbuf);
#endif
        return NULL;
    }

    if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
        return dp_vs_snat_schedule(dest, iph, ports, mbuf);

    // handle ICMP
    if (unlikely(iph->proto == IPPROTO_ICMP)) {
        struct icmphdr *ich, _icmph;
        ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
        if (!ich)
            return NULL;

        ports = _ports;
        _ports[0] = icmp4_id(ich);
        _ports[1] = ich->type << 8 | ich->code;

        // fill in the parameters used to create the new connection
        dp_vs_conn_fill_param(iph->af, iph->proto,
                              &iph->saddr, &iph->daddr,
                              ports[0], ports[1], 0, &param);
    } else if (unlikely(iph->proto == IPPROTO_ICMPV6)) {
        struct icmp6_hdr *ic6h, _ic6hp;
        ic6h = mbuf_header_pointer(mbuf, iph->len, sizeof(_ic6hp), &_ic6hp);
        if (!ic6h)
            return NULL;

        ports = _ports;
        _ports[0] = icmp6h_id(ic6h);
        _ports[1] = ic6h->icmp6_type << 8 | ic6h->icmp6_code;

        dp_vs_conn_fill_param(iph->af, iph->proto,
                              &iph->daddr, &dest->addr,
                              ports[1], ports[0],
                              0, &param);
    } else {
        dp_vs_conn_fill_param(iph->af, iph->proto,
                              &iph->saddr, &iph->daddr,
                              ports[0], ports[1], 0, &param);
    }

    if (is_synproxy_on)
        flags |= DPVS_CONN_F_SYNPROXY;
    if (svc->flags & DP_VS_SVC_F_ONEPACKET && iph->proto == IPPROTO_UDP)
        flags |= DPVS_CONN_F_ONE_PACKET;
    if (svc->flags & DP_VS_SVC_F_EXPIRE_QUIESCENT)
        flags |= DPVS_CONN_F_EXPIRE_QUIESCENT;

    // create the new connection
    conn = dp_vs_conn_new(mbuf, iph, &param, dest, flags);
    if (!conn)
        return NULL;

    // update connection statistics
    dp_vs_stats_conn(conn);
    return conn;
}
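svc->scheduler->schedule is a pluggable hook; wrr, rr and wlc each register their own implementation. To give a feel for the contract, here is a minimal round-robin sketch. It is illustrative only: DPVS's real rr scheduler differs in details such as locking, and field names like svc->dests and svc->sched_data follow my reading of the service structure.

/* Minimal round-robin scheduler sketch -- illustrative, not DPVS's rr code.
 * It resumes scanning after the previously chosen dest (cursor kept in
 * svc->sched_data) and returns the next available RS. */
static struct dp_vs_dest *rr_schedule_sketch(struct dp_vs_service *svc,
                                             const struct rte_mbuf *mbuf,
                                             const struct dp_vs_iphdr *iph)
{
    struct list_head *last = svc->sched_data ? (struct list_head *)svc->sched_data
                                             : &svc->dests;
    struct list_head *p = last;
    struct dp_vs_dest *dest;

    do {
        p = p->next;
        if (p == &svc->dests)
            continue;                   /* skip the list-head sentinel */
        dest = list_entry(p, struct dp_vs_dest, n_list);
        /* skip dests that are down or drained (weight 0) */
        if (dp_vs_dest_is_avail(dest) &&
                rte_atomic16_read(&dest->weight) > 0) {
            svc->sched_data = p;        /* remember the cursor */
            return dest;
        }
    } while (p != last);

    return NULL;                        /* no usable RS */
}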

dp_vs_conn_new

struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
                                  const struct dp_vs_iphdr *iph,
                                  struct dp_vs_conn_param *param,
                                  struct dp_vs_dest *dest, uint32_t flags)
{
    struct dp_vs_conn *new;
    struct conn_tuple_hash *t;
    uint16_t rport;
    __be16 _ports[2], *ports;
    int err;

    assert(mbuf && param && dest);

    // allocate memory for the new connection
    new = dp_vs_conn_alloc(dest->fwdmode, flags);
    if (unlikely(!new))
        return NULL;

    new->flags = flags;

    /* set proper RS port */
    // template connection, or an explicit destination port was given
    if (dp_vs_conn_is_template(new) || param->ct_dport != 0)
        rport = param->ct_dport;
    else if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
        if (unlikely(param->proto == IPPROTO_ICMP ||
                    param->proto == IPPROTO_ICMPV6)) {
            rport = param->vport;
        } else {
            // extract the ports from the mbuf
            ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
            if (unlikely(!ports)) {
                RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__);
                goto errout;
            }
            rport = ports[0];
        }
    } else {
        rport = dest->port;
    }

    /* init inbound conn tuple hash */
    t = &tuplehash_in(new);
    t->direct   = DPVS_CONN_DIR_INBOUND; // inbound traffic
    t->af       = param->af;
    t->proto    = param->proto;
    t->saddr    = *param->caddr; // source is the external client address
    t->sport    = param->cport;
    t->daddr    = *param->vaddr; // destination is the service VIP
    t->dport    = param->vport;
    INIT_LIST_HEAD(&t->list);

    /* init outbound conn tuple hash */
    t = &tuplehash_out(new);
    t->direct   = DPVS_CONN_DIR_OUTBOUND; // outbound traffic
    t->af       = dest->af;
    t->proto    = param->proto;
    if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
        t->saddr = iph->saddr;
    } else {
        t->saddr = dest->addr;
    }
    t->sport    = rport;
    t->daddr    = *param->caddr;    /* non-FNAT */
    t->dport    = param->cport;     /* non-FNAT */
    INIT_LIST_HEAD(&t->list);

    /* init connection */
    new->af     = param->af;
    new->proto  = param->proto;
    new->caddr  = *param->caddr;
    new->cport  = param->cport;
    new->vaddr  = *param->vaddr;
    new->vport  = param->vport;
    new->laddr  = *param->caddr;    /* non-FNAT */
    new->lport  = param->cport;     /* non-FNAT */
    if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
        new->daddr  = iph->saddr;
    else
        new->daddr  = dest->addr;
    new->dport  = rport;

    if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
        new->pp_version = dest->svc->proxy_protocol;
        new->pp_sent = 0;
    }

    /* neighbour confirm cache */
    if (AF_INET == tuplehash_in(new).af) {
        new->in_nexthop.in.s_addr = htonl(INADDR_ANY);
    } else {
        new->in_nexthop.in6 = in6addr_any;
    }

    if (AF_INET == tuplehash_out(new).af) {
        new->out_nexthop.in.s_addr = htonl(INADDR_ANY);
    } else {
        new->out_nexthop.in6 = in6addr_any;
    }

    new->in_dev = NULL;
    new->out_dev = NULL;

    /* control members */
    new->control = NULL;
    rte_atomic32_clear(&new->n_control);

    /* caller will use it right after created,
     * just like dp_vs_conn_get(). */
    rte_atomic32_set(&new->refcnt, 1);
    new->state  = 0;
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
    new->ctime = rte_rdtsc();
#endif

    /* bind destination and corresponding transmitter */
    // set the xmit handlers that match the forwarding mode
    err = dp_vs_conn_bind_dest(new, dest);
    if (err != EDPVS_OK) {
        RTE_LOG(WARNING, IPVS, "%s: fail to bind dest: %s\n",
                __func__, dpvs_strerror(err));
        goto errout;
    }

    /* FNAT only: select and bind local address/port */
    if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
        // bind the LB's local address/port (FNAT)
        if ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK)
            goto unbind_dest;
    }

    /* init redirect if it exists */
    // initialize redirect info if applicable
    dp_vs_redirect_init(new);

    /* add to hash table (dual dir for each bucket) */
    // add the connection to the hash table
    if ((err = dp_vs_conn_hash(new)) != EDPVS_OK)
        goto unbind_laddr;

    /* timer */
    // set the connection's initial timeout
    new->timeout.tv_sec = conn_init_timeout;
    new->timeout.tv_usec = 0;

    /* synproxy */
    INIT_LIST_HEAD(&new->ack_mbuf);
    rte_atomic32_set(&new->syn_retry_max, 0);
    rte_atomic32_set(&new->dup_ack_cnt, 0);

    if ((flags & DPVS_CONN_F_SYNPROXY) && !dp_vs_conn_is_template(new)) {
        struct tcphdr _tcph, *th = NULL;
        struct dp_vs_synproxy_ack_pakcet *ack_mbuf;
        struct dp_vs_proto *pp;

        th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
        if (!th) {
            RTE_LOG(ERR, IPVS, "%s: get tcphdr failed\n", __func__);
            goto unbind_laddr;
        }

        /* save ack packet */
        if (unlikely(rte_mempool_get(this_ack_mbufpool, (void **)&ack_mbuf) != 0)) {
            RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__);
            goto unbind_laddr;
        }
        ack_mbuf->mbuf = mbuf;
        list_add_tail(&ack_mbuf->list, &new->ack_mbuf);
        new->ack_num++;
        sp_dbg_stats32_inc(sp_ack_saved);

        /* save ack_seq - 1 */
        new->syn_proxy_seq.isn =
            htonl((uint32_t) ((ntohl(th->ack_seq) - 1)));

        /* save ack_seq */
        new->fnat_seq.fdata_seq = ntohl(th->ack_seq);

        /* FIXME: use DP_VS_TCP_S_SYN_SENT for syn */
        pp = dp_vs_proto_lookup(param->proto);
        new->timeout.tv_sec = pp->timeout_table[new->state = DPVS_TCP_S_SYN_SENT];
    }

    /* schedule conn timer */
#ifdef CONFIG_TIMER_DEBUG
    snprintf(new->timer.name, sizeof(new->timer.name), "%s", "conn");
#endif
    // add a random delay to the timeout so connections do not expire in sync
    dpvs_time_rand_delay(&new->timeout, 1000000);
    // attach the connection to a timer to manage its expiration
    dp_vs_conn_attach_timer(new, true);

#ifdef CONFIG_DPVS_IPVS_DEBUG
    conn_dump("new conn: ", new);
#endif
    return new;

unbind_laddr:
    dp_vs_laddr_unbind(new);
unbind_dest:
    dp_vs_conn_unbind_dest(new, true);
errout:
    dp_vs_conn_free(new);
    return NULL;
}
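dp_vs_conn_hash, called above, links both tuplehash entries into this lcore's table so that packets from either direction find the same connection. Below is a simplified sketch (locking and the redirect path omitted; tuple_hashkey is a hypothetical helper standing in for the real hash function seeded with dp_vs_conn_rnd):

/* Simplified sketch of hashing a connection into the per-lcore table. */
static int dp_vs_conn_hash_sketch(struct dp_vs_conn *conn)
{
    uint32_t ihash, ohash;

    /* hash each direction by its own tuple (hypothetical helper) */
    ihash = tuple_hashkey(&tuplehash_in(conn));
    ohash = tuple_hashkey(&tuplehash_out(conn));

    /* one bucket insertion per direction: inbound packets match
     * tuplehash_in, outbound replies match tuplehash_out */
    list_add(&tuplehash_in(conn).list, &this_conn_tbl[ihash]);
    list_add(&tuplehash_out(conn).list, &this_conn_tbl[ohash]);

    this_conn_count++;
    return EDPVS_OK;
}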

③ Updating the connection state

static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    ...

    // protocol state transition
    if (prot->state_trans) {
        err = prot->state_trans(prot, conn, mbuf, dir);
        if (err != EDPVS_OK)
            RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
    }
    conn->old_state = conn->state;

	...
}

Taking TCP as an example:

tcp_state_trans

static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn,
                           struct rte_mbuf *mbuf, int dir)
{
    struct tcphdr *th, _tcph;
    int idx, off;
    int new_state = DPVS_TCP_S_CLOSE;
    assert(proto && conn && mbuf);
    struct dp_vs_dest *dest = conn->dest;
    int af = conn->af;
#ifdef CONFIG_DPVS_IPVS_DEBUG
    char dbuf[64], cbuf[64];
    const char *daddr, *caddr;
#endif

    // determine the address family
    if (dir == DPVS_CONN_DIR_INBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT)
        af = tuplehash_in(conn).af;
    else if (dir == DPVS_CONN_DIR_OUTBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT)
        af = tuplehash_out(conn).af;

    int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf));
    // get the TCP header from the packet buffer
    th = mbuf_header_pointer(mbuf, iphdrlen, sizeof(_tcph), &_tcph);
    if (unlikely(!th))
        return EDPVS_INVPKT;
    // pick the state-transition table offset from the forwarding mode and packet direction
    if (dest->fwdmode == DPVS_FWD_MODE_DR || dest->fwdmode == DPVS_FWD_MODE_TUNNEL)
        off = 8;
    else if (dir == DPVS_CONN_DIR_INBOUND)
        off = 0;
    else if (dir == DPVS_CONN_DIR_OUTBOUND)
        off = 4;
    else
        return EDPVS_NOTSUPP; /* do not support INPUT_ONLY now */

    // map the TCP flags to a state-transition index
    if ((idx = tcp_state_idx(th)) < 0) {
        RTE_LOG(DEBUG, IPVS, "tcp_state_idx=%d !\n", idx);
        goto tcp_state_out;
    }

    // look up the new TCP state
    new_state = tcp_states[off + idx].next_state[conn->state];

tcp_state_out:
    // nothing to do if the state did not change
    if (new_state == conn->state)
        return EDPVS_OK;

    /* state changed */

// log the state transition (debug builds)
#ifdef CONFIG_DPVS_IPVS_DEBUG
    daddr = inet_ntop(tuplehash_out(conn).af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::";
    caddr = inet_ntop(tuplehash_in(conn).af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::";

    RTE_LOG(DEBUG, IPVS, "state trans: %s %s [%c%c%c%c] %s:%u->%s:%u "
            " state %s->%s conn.refcnt %d\n",
            proto->name, dir == DPVS_CONN_DIR_OUTBOUND ? "out" : "in",
            th->syn ? 'S' : '.', th->fin ? 'F' : '.',
            th->ack ? 'A' : '.', th->rst ? 'R' : '.',
            caddr, ntohs(conn->cport),
            daddr, ntohs(conn->dport),
            tcp_state_name(conn->state),
            tcp_state_name(new_state),
            rte_atomic32_read(&conn->refcnt));
#endif

    // remember the old state
    conn->old_state = conn->state; // old_state is consulted when the connection is reused
    // store the new state
    conn->state = new_state;

    // set the connection timeout according to the new state
    dp_vs_conn_set_timeout(conn, proto);

    // infer backend health from the transition
    if (new_state == DPVS_TCP_S_CLOSE && conn->old_state == DPVS_TCP_S_SYN_RECV)
        dp_vs_dest_detected_dead(conn->dest); // connection reset by dest
    else if (new_state == DPVS_TCP_S_ESTABLISHED)
        dp_vs_dest_detected_alive(conn->dest);

    // update the dest's active/inactive connection counters
    if (dest) {
        if (!(conn->flags & DPVS_CONN_F_INACTIVE)
                && (new_state != DPVS_TCP_S_ESTABLISHED)) {
            rte_atomic32_dec(&dest->actconns);
            rte_atomic32_inc(&dest->inactconns);
            conn->flags |= DPVS_CONN_F_INACTIVE;
        } else if ((conn->flags & DPVS_CONN_F_INACTIVE)
                && (new_state == DPVS_TCP_S_ESTABLISHED)) {
            rte_atomic32_inc(&dest->actconns);
            rte_atomic32_dec(&dest->inactconns);
            conn->flags &= ~DPVS_CONN_F_INACTIVE;
        }
    }

    return EDPVS_OK;
}
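The idx returned by tcp_state_idx classifies the packet by its TCP flags, and off selects the table block for the direction/forwarding mode, so tcp_states[off + idx].next_state[conn->state] is a pure table lookup. A sketch of the flag classification (my reading of the logic: RST takes precedence, then SYN, FIN, ACK):

/* Flag-to-index mapping for the TCP state-transition table (sketch).
 * Order matters: a packet carrying RST is classified as RST even if
 * other flags are also set. */
static int tcp_state_idx_sketch(struct tcphdr *th)
{
    if (th->rst)
        return 3;   /* RST column */
    if (th->syn)
        return 0;   /* SYN column */
    if (th->fin)
        return 1;   /* FIN column */
    if (th->ack)
        return 2;   /* ACK column */
    return -1;      /* no classifiable flag */
}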

4.2.3 Connection Timeout Management

4.2.3.1 DPVS's connection timeout management is based mainly on a timer mechanism, implemented in the following steps:

● Setting the timeout: when a connection is created, an initial timeout is chosen for it according to its type and protocol.

● Starting the timer: the connection is associated with a timer; when the timer fires, the expiration handler runs.

● Refreshing the timer: whenever a new packet arrives on the connection, its timeout is refreshed to extend the connection's lifetime.

● Expiration handling: when the timer fires and the connection has seen no new packets, the expiration handler releases the connection's resources, updates the relevant statistics, and removes the connection from the tracking table.

4.2.3.2 In DPVS, a connection's timeout is determined by several factors: the protocol type, the connection's lifecycle (its current state), and user-configured timeout values.

4.2.3.3 Implementation

① Setting the timeout

void dp_vs_conn_set_timeout(struct dp_vs_conn *conn, struct dp_vs_proto *pp)
{
    unsigned conn_timeout = 0;

    /* set proper timeout */
    if ((conn->proto == IPPROTO_TCP && conn->state == DPVS_TCP_S_ESTABLISHED)
            || conn->proto == IPPROTO_UDP) {
        conn_timeout = dp_vs_conn_get_timeout(conn);

        if (conn_timeout > 0) {
            conn->timeout.tv_sec = conn_timeout;
            return;
        }
    }

    if (pp && pp->timeout_table)
        conn->timeout.tv_sec = pp->timeout_table[conn->state];
    else
        conn->timeout.tv_sec = 60;
}
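For established TCP connections and for UDP, the timeout can be a per-service value carried on the connection's dest. A sketch of where dp_vs_conn_get_timeout likely takes it from (simplified, and the conn_timeout field name is my assumption; returning 0 makes the caller fall back to the protocol's timeout_table):

/* Simplified sketch: per-service timeout attached to the conn's dest. */
unsigned dp_vs_conn_get_timeout_sketch(struct dp_vs_conn *conn)
{
    if (conn && conn->dest)
        return conn->dest->conn_timeout;  /* 0 when not configured */
    return 0;
}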

② Starting the timer

static void dp_vs_conn_attach_timer(struct dp_vs_conn *conn, bool lock)
{
    int rc;

    // already attached to a timer?
    if (dp_vs_conn_is_in_timer(conn))
        return;

    // one-packet connections are never timed
    if (conn->flags & DPVS_CONN_F_ONE_PACKET) {
        return;
    }
    // template connections are scheduled on the global timer (last argument true)
    if (dp_vs_conn_is_template(conn)) {
        if (lock)
            rc = dpvs_timer_sched(&conn->timer, &conn->timeout,
                                  dp_vs_conn_expire, conn, true);
        else
            rc = dpvs_timer_sched_nolock(&conn->timer, &conn->timeout,
                                  dp_vs_conn_expire, conn, true);
    } else {
        if (lock)
            rc = dpvs_timer_sched(&conn->timer, &conn->timeout,
                                  dp_vs_conn_expire, conn, false);
        else
            rc = dpvs_timer_sched_nolock(&conn->timer, &conn->timeout,
                                  dp_vs_conn_expire, conn, false);
    }

    // mark the connection as attached to a timer
    if (rc == EDPVS_OK)
        dp_vs_conn_set_in_timer(conn);
}

③ Refreshing the timer

Whenever a new packet arrives on a connection, dp_vs_conn_set_timeout (shown in ① above) is called to refresh the connection's timeout and extend its lifetime.

④ Expiration handling

/* timeout handler */
static int dp_vs_conn_expire(void *priv)
{
    struct dp_vs_conn *conn = priv;
    struct dp_vs_proto *pp;

    assert(conn);
    assert(conn->af == AF_INET || conn->af == AF_INET6);
    assert(rte_atomic32_read(&conn->refcnt) > 0);

    pp = dp_vs_proto_lookup(conn->proto);
    // reset the connection's timeout
    dp_vs_conn_set_timeout(conn, pp);
    // add a random delay so many connections do not expire at the same time
    dpvs_time_rand_delay(&conn->timeout, 1000000);

    // for non-one-packet connections, take a reference so the conn cannot be destroyed during expiration handling
    if (!(conn->flags & DPVS_CONN_F_ONE_PACKET)) {
        rte_atomic32_inc(&conn->refcnt);
    }

    // try to resend saved packets; on success, drop the reference and expire later
    if (dp_vs_conn_resend_packets(conn, pp) == EDPVS_OK) {
        /* expire later */
        dp_vs_conn_put_nolock(conn);
        return DTIMER_OK;
    }

    /* somebody is controlled by me, expire later */
    // this conn still controls other connections: drop the reference and expire later
    if (rte_atomic32_read(&conn->n_control)) {
        dp_vs_conn_put_nolock(conn);
        return DTIMER_OK;
    }

    /* unhash it so no further user can get it,
     * even though we cannot delete it yet. */
    // remove the connection from the hash table
    dp_vs_conn_unhash(conn);

    /* refcnt == 1 means we are the only referer.
     * no one is using the conn and it's timed out. */
    // refcnt == 1: only this expiration handler still references the conn
    if (rte_atomic32_read(&conn->refcnt) == 1) {
        // detach the connection from the timer
        dp_vs_conn_detach_timer(conn, false);

        /* I was controlled by someone */
        if (conn->control)
            dp_vs_control_del(conn);

        if (pp && pp->conn_expire)
            // protocol-specific cleanup
            pp->conn_expire(pp, conn);

        // release sa_pool (local address/port) resources
        dp_vs_conn_sa_release(conn);
        // unbind from the dest (real server)
        dp_vs_conn_unbind_dest(conn, false);
        // unbind the local address
        dp_vs_laddr_unbind(conn);
        // free any packets still held by the connection
        dp_vs_conn_free_packets(conn);

        // drop our reference
        rte_atomic32_dec(&conn->refcnt);

#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
        conn_stats_dump("del conn", conn);
#endif
#ifdef CONFIG_DPVS_IPVS_DEBUG
        conn_dump("del conn: ", conn);
#endif

        // free the connection object's memory
        dp_vs_conn_free(conn);

        return DTIMER_STOP;
    }

    // reaching here means the conn is still in use: hash it back into the flow table
    dp_vs_conn_hash(conn);

    /* someone is using it at expire time,
     * try to delete it again later */
    // refresh the timeout and re-arm the timer
    dp_vs_conn_refresh_timer(conn, false);

    rte_atomic32_dec(&conn->refcnt);
    return DTIMER_OK;
}