dpdk l2fwd (2)

回到l2fwd的main函数中

/*
 * l2fwd application entry point.
 *
 * Initialises the EAL, parses the application arguments, creates the mbuf
 * pool, pairs the enabled ports, assigns RX ports to lcores, configures and
 * starts every enabled port, then launches l2fwd_launch_one_lcore on every
 * lcore and waits for the slave lcores.
 *
 * argc/argv: command line; the arguments consumed by rte_eal_init() are
 *            skipped before l2fwd_parse_args() sees the remainder.
 *
 * Returns 0 when rte_eal_wait_lcore() reported a non-negative value for
 * every slave lcore, -1 otherwise.  Set-up failures are reported through
 * rte_exit(EXIT_FAILURE, ...).
 */
int
MAIN(int argc, char **argv)
{
    struct lcore_queue_conf *qconf;
    struct rte_eth_dev_info dev_info;
    int ret;
    uint8_t nb_ports;
    uint8_t nb_ports_available;
    uint8_t portid, last_port;
    unsigned lcore_id, rx_lcore_id;
    unsigned nb_ports_in_mask = 0;

    /* init EAL */
    ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
    argc -= ret;
    argv += ret;

    /* parse application arguments (after the EAL ones) */
    ret = l2fwd_parse_args(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n");

    /* create the mbuf pool */
    l2fwd_pktmbuf_pool =
        rte_mempool_create("mbuf_pool", NB_MBUF,
                   MBUF_SIZE, 32,
                   sizeof(struct rte_pktmbuf_pool_private),
                   rte_pktmbuf_pool_init, NULL,
                   rte_pktmbuf_init, NULL,
                   rte_socket_id(), 0);
    if (l2fwd_pktmbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");

    /* init driver(s) */
    if (rte_pmd_init_all() < 0)
        rte_exit(EXIT_FAILURE, "Cannot init pmd\n");

    if (rte_eal_pci_probe() < 0)
        rte_exit(EXIT_FAILURE, "Cannot probe PCI\n");

    nb_ports = rte_eth_dev_count();
    if (nb_ports == 0)
        rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");

    if (nb_ports > RTE_MAX_ETHPORTS)
        nb_ports = RTE_MAX_ETHPORTS;

    /* reset l2fwd_dst_ports */
    for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++)
        l2fwd_dst_ports[portid] = 0;
    last_port = 0;

    /*
     * Pair the enabled ports two by two: the first port of a pair forwards
     * to the second and vice versa (e.g. port0 <-> port1).
     *
     * The mask is tested with an unsigned constant (1U) because shifting a
     * signed 1 into bit 31 is undefined behaviour.
     */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1U << portid)) == 0)
            continue;

        if (nb_ports_in_mask % 2) {
            /* second port of a pair: link it with the remembered one */
            l2fwd_dst_ports[portid] = last_port;
            l2fwd_dst_ports[last_port] = portid;
        }
        else
            /* first port of a pair: remember it until its peer shows up */
            last_port = portid;

        nb_ports_in_mask++;

        rte_eth_dev_info_get(portid, &dev_info);
    }
    if (nb_ports_in_mask % 2) {
        /* the unpaired last port forwards back to itself */
        printf("Notice: odd number of ports in portmask.\n");
        l2fwd_dst_ports[last_port] = last_port;
    }

    rx_lcore_id = 0;
    qconf = NULL;

    /*
     * Initialize the port/queue configuration of each logical core.
     * Every lcore receives from at most l2fwd_rx_queue_per_lcore ports and
     * each port is listed in exactly one lcore's rx_port_list (each port is
     * configured with a single RX queue below).
     */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1U << portid)) == 0)
            continue;

        /* advance to the next enabled lcore that still has a free RX slot */
        while (rte_lcore_is_enabled(rx_lcore_id) == 0 ||
               lcore_queue_conf[rx_lcore_id].n_rx_port ==
               l2fwd_rx_queue_per_lcore) {
            rx_lcore_id++;
            if (rx_lcore_id >= RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Not enough cores\n");
        }

        qconf = &lcore_queue_conf[rx_lcore_id];

        qconf->rx_port_list[qconf->n_rx_port] = portid;
        qconf->n_rx_port++;
        printf("Lcore %u: RX port %u\n", rx_lcore_id, (unsigned) portid);
    }

    nb_ports_available = nb_ports;

    /* Initialise each port: one RX queue and one TX queue per port */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1U << portid)) == 0) {
            printf("Skipping disabled port %u\n", (unsigned) portid);
            nb_ports_available--;
            continue;
        }
        /* init port */
        printf("Initializing port %u... ", (unsigned) portid);
        fflush(stdout);
        ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u\n",
                  ret, (unsigned) portid);

        rte_eth_macaddr_get(portid,&l2fwd_ports_eth_addr[portid]);

        /* init one RX queue */
        fflush(stdout);
        ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
                         rte_eth_dev_socket_id(portid), &rx_conf,
                         l2fwd_pktmbuf_pool);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:err=%d, port=%u\n",
                  ret, (unsigned) portid);

        /* init one TX queue on each port */
        fflush(stdout);
        ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
                rte_eth_dev_socket_id(portid), &tx_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup:err=%d, port=%u\n",
                ret, (unsigned) portid);

        /* Start device */
        ret = rte_eth_dev_start(portid);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
                  ret, (unsigned) portid);

        printf("done: \n");

        rte_eth_promiscuous_enable(portid);

        printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
                (unsigned) portid,
                l2fwd_ports_eth_addr[portid].addr_bytes[0],
                l2fwd_ports_eth_addr[portid].addr_bytes[1],
                l2fwd_ports_eth_addr[portid].addr_bytes[2],
                l2fwd_ports_eth_addr[portid].addr_bytes[3],
                l2fwd_ports_eth_addr[portid].addr_bytes[4],
                l2fwd_ports_eth_addr[portid].addr_bytes[5]);

        /* initialize port stats (clears the whole port_statistics object) */
        memset(&port_statistics, 0, sizeof(port_statistics));
    }

    if (!nb_ports_available) {
        rte_exit(EXIT_FAILURE,
            "All available ports are disabled. Please set portmask.\n");
    }

    check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

    /* launch the l2fwd per-lcore function on every lcore */
    rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
    RTE_LCORE_FOREACH_SLAVE(lcore_id) {
        if (rte_eal_wait_lcore(lcore_id) < 0)
            return -1;
    }

    return 0;
}

 

以下详细分析端口初始化过程; 对于每个port, 首先调用rte_eth_dev_configure配置端口的收发包队列个数,并初始化收发包队列控制块;

/*
 * Configure an Ethernet port before its queues are set up.
 *
 * Validates the requested RX/TX queue counts against the limits reported by
 * the driver, stores the configuration in dev->data->dev_conf, (re)sizes the
 * RX and TX queue arrays and finally calls the driver's dev_configure hook.
 *
 * port_id:  index into rte_eth_devices[]; must be below nb_ports and
 *           RTE_MAX_ETHPORTS.
 * nb_rx_q:  number of RX queues, non-zero and <= dev_info.max_rx_queues.
 * nb_tx_q:  number of TX queues, non-zero and <= dev_info.max_tx_queues.
 * dev_conf: configuration copied into the device data.
 *
 * Returns 0 on success, -EINVAL for an invalid port, queue count or jumbo
 * frame length, -EBUSY if the port is already started, or the non-zero
 * value returned by one of the helper/driver calls.  The two *_OR_ERR_RET
 * macros presumably return their argument on failure (definitions are not
 * visible here).
 */
int
rte_eth_dev_configure(uint8_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
              const struct rte_eth_conf *dev_conf)
{
    struct rte_eth_dev *dev;
    struct rte_eth_dev_info dev_info;
    int diag;

    /* Only the primary process may configure a port. */
    /* This function is only safe when called from the primary process
     * in a multi-process setup*/
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports || port_id >= RTE_MAX_ETHPORTS) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];

    /*
     * Both driver hooks used below must exist.
     * NOTE(review): per the article, the E1000 PMD registers eth_em_ops as
     * dev_ops during driver initialisation - not visible in this excerpt.
     */
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);

    /*
     * A started port cannot be reconfigured (the flag is presumably set by
     * rte_eth_dev_start - confirm).
     */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return (-EBUSY);
    }

    /*
     * Check that the numbers of RX and TX queues are not greater
     * than the maximum number of RX and TX queues supported by the
     * configured device.
     * NOTE(review): the article states that for the em PMD this is
     * eth_em_infos_get, reporting max_rx_queues = max_tx_queues = 1 in this
     * example - not verifiable from this excerpt.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (nb_rx_q > dev_info.max_rx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
                port_id, nb_rx_q, dev_info.max_rx_queues);
        return (-EINVAL);
    }
    if (nb_rx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_q == 0\n", port_id);
        return (-EINVAL);
    }

    if (nb_tx_q > dev_info.max_tx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_queues=%d > %d\n",
                port_id, nb_tx_q, dev_info.max_tx_queues);
        return (-EINVAL);
    }
    if (nb_tx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_q == 0\n", port_id);
        return (-EINVAL);
    }

    /*
     * Copy the dev_conf parameter (RX/TX mode settings) into the dev
     * structure.  The copy happens before the checks below, so the stored
     * configuration is already overwritten if one of them fails.
     */
    memcpy(&dev->data->dev_conf, dev_conf, sizeof(dev->data->dev_conf));

    /*
     * If jumbo frames are enabled, check that the maximum RX packet
     * length is supported by the configured device.
     */
    if (dev_conf->rxmode.jumbo_frame == 1) {
        if (dev_conf->rxmode.max_rx_pkt_len >
            dev_info.max_rx_pktlen) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                " > max valid value %u\n",
                port_id,
                (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                (unsigned)dev_info.max_rx_pktlen);
            return (-EINVAL);
        }
        else if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                " < min valid value %u\n",
                port_id,
                (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                (unsigned)ETHER_MIN_LEN);
            return (-EINVAL);
        }
    } else
        /* Jumbo frames disabled: use the default maximum frame length */
        dev->data->dev_conf.rxmode.max_rx_pkt_len = ETHER_MAX_LEN;

    /* multiple queue mode checking (DCB/RSS etc. are validated in there) */
    diag = rte_eth_dev_check_mq_mode(port_id, nb_rx_q, nb_tx_q, dev_conf);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_check_mq_mode = %d\n",
                port_id, diag);
        return diag;
    }

    /*
     * Setup new number of RX/TX queues and reconfigure device.
     */
    /* Size the RX queue array for nb_rx_q queues */
    diag = rte_eth_dev_rx_queue_config(dev, nb_rx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_rx_queue_config = %d\n",
                port_id, diag);
        return diag;
    }

    /* Size the TX queue array; on failure undo the RX sizing */
    diag = rte_eth_dev_tx_queue_config(dev, nb_tx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_tx_queue_config = %d\n",
                port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        return diag;
    }

    /*
     * Driver-specific configuration; on failure both queue arrays are reset
     * to zero queues.
     * NOTE(review): the article says this is eth_em_configure for the em
     * PMD, which flags E1000_FLAG_NEED_LINK_UPDATE - not visible here.
     */
    diag = (*dev->dev_ops->dev_configure)(dev);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d dev_configure = %d\n",
                port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        rte_eth_dev_tx_queue_config(dev, 0);
        return diag;
    }

    return 0;
}

RX queue setup

/*
 * Set up one RX queue of an Ethernet port.
 *
 * Performs the generic checks (port/queue index, port stopped, driver hooks
 * present, mempool suitable) and then delegates to the driver's
 * rx_queue_setup hook.
 *
 * port_id:     index into rte_eth_devices[]; must be < nb_ports.
 * rx_queue_id: queue index; must be < dev->data->nb_rx_queues.
 * nb_rx_desc:  number of RX descriptors, passed through to the driver.
 * socket_id:   passed through to the driver.
 * rx_conf:     RX queue configuration, passed through to the driver.
 * mp:          mbuf pool feeding the queue; its private area must be at
 *              least sizeof(struct rte_pktmbuf_pool_private) and its data
 *              room must hold RTE_PKTMBUF_HEADROOM plus the device's
 *              min_rx_bufsize.
 *
 * Returns the driver hook's result on success of the checks, otherwise
 * -EINVAL (bad port/queue/mempool), -EBUSY (port started) or -ENOSPC
 * (mempool private area too small).  The *_OR_ERR_RET macros presumably
 * return their argument on failure (definitions not visible here).
 */
int
rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
               uint16_t nb_rx_desc, unsigned int socket_id,
               const struct rte_eth_rxconf *rx_conf,
               struct rte_mempool *mp)
{
    struct rte_eth_dev *dev;
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info dev_info;

    /* This function is only safe when called from the primary process
     * in a multi-process setup*/
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];
    if (rx_queue_id >= dev->data->nb_rx_queues) {
        PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
        return (-EINVAL);
    }

    /* Queues can only be (re)configured while the port is stopped. */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_setup, -ENOTSUP);

    /* The pool is dereferenced below; reject a missing one up front. */
    if (mp == NULL) {
        PMD_DEBUG_TRACE("Invalid null mempool pointer\n");
        return (-EINVAL);
    }

    /*
     * Check the size of the mbuf data buffer.
     * This value must be provided in the private data of the memory pool.
     * First check that the memory pool has a valid private data.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (mp->private_data_size < sizeof(struct rte_pktmbuf_pool_private)) {
        PMD_DEBUG_TRACE("%s private_data_size %d < %d\n",
                mp->name, (int) mp->private_data_size,
                (int) sizeof(struct rte_pktmbuf_pool_private));
        return (-ENOSPC);
    }

    /*
     * The data room must cover the headroom plus the smallest RX buffer the
     * device accepts.  The comparison is written as an addition on the
     * right-hand side: subtracting the headroom from a smaller data room
     * would wrap to a huge unsigned value and let an undersized pool pass.
     */
    mbp_priv = rte_mempool_get_priv(mp);
    if ((uint32_t) mbp_priv->mbuf_data_room_size <
        (uint32_t) RTE_PKTMBUF_HEADROOM + dev_info.min_rx_bufsize) {
        PMD_DEBUG_TRACE("%s mbuf_data_room_size %d < %d "
                "(RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)"
                "=%d)\n",
                mp->name,
                (int)mbp_priv->mbuf_data_room_size,
                (int)(RTE_PKTMBUF_HEADROOM +
                      dev_info.min_rx_bufsize),
                (int)RTE_PKTMBUF_HEADROOM,
                (int)dev_info.min_rx_bufsize);
        return (-EINVAL);
    }

    /* Hand over to the PMD, which sets up the RX descriptor ring. */
    return (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
                           socket_id, rx_conf, mp);
}

 

/*
 * em PMD implementation of RX queue set-up.
 *
 * Validates the descriptor count and rx_conf, releases any queue previously
 * stored at queue_idx, reserves the descriptor ring memzone, allocates the
 * queue structure and its software ring, fills in the queue fields and
 * publishes the queue in dev->data->rx_queues[queue_idx].
 *
 * Returns 0 on success, -EINVAL for an invalid descriptor count or when
 * rx_drop_en is requested, -ENOMEM when a reservation/allocation fails.
 */
int
eth_em_rx_queue_setup(struct rte_eth_dev *dev,
        uint16_t queue_idx,
        uint16_t nb_desc,
        unsigned int socket_id,
        const struct rte_eth_rxconf *rx_conf,
        struct rte_mempool *mp)
{
    const struct rte_memzone *rz;
    struct em_rx_queue *rxq;
    struct e1000_hw     *hw;
    uint32_t rsize;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /*
     * Validate number of receive descriptors.
     * It must not exceed hardware maximum, and must be multiple
     * of EM_ALIGN.
     */
    if (((nb_desc * sizeof(rxq->rx_ring[0])) % EM_ALIGN) != 0 ||
            (nb_desc > EM_MAX_RING_DESC) ||
            (nb_desc < EM_MIN_RING_DESC)) {
        return (-EINVAL);
    }

    /*
     * EM devices don't support drop_en functionality
     */
    if (rx_conf->rx_drop_en) {
        RTE_LOG(ERR, PMD, "drop_en functionality not supported by device\n");
        return (-EINVAL);
    }

    /* The queue was set up before: release the old resources first. */
    /* Free memory prior to re-allocation if needed. */
    if (dev->data->rx_queues[queue_idx] != NULL) {
        em_rx_queue_release(dev->data->rx_queues[queue_idx]);
        dev->data->rx_queues[queue_idx] = NULL;
    }

    /*
     * Reserve the memzone holding the RX descriptors.
     * NOTE(review): the article gives the zone name as
     * "rte_em_pmd_rx_ring_0_1"; the naming is done inside
     * ring_dma_zone_reserve and is not visible here.
     */
    /* Allocate RX ring for max possible number of hardware descriptors. */
    rsize = sizeof (rxq->rx_ring[0]) * EM_MAX_RING_DESC;
    if ((rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, rsize,
            socket_id)) == NULL)
        return (-ENOMEM);

    /* Allocate the RX queue data structure. */
    if ((rxq = rte_zmalloc("ethdev RX queue", sizeof(*rxq),
            CACHE_LINE_SIZE)) == NULL)
        return (-ENOMEM);

    /* One software-ring entry per descriptor (holds the mbuf pointer). */
    /* Allocate software ring. */
    if ((rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
            sizeof (rxq->sw_ring[0]) * nb_desc,
            CACHE_LINE_SIZE)) == NULL) {
        em_rx_queue_release(rxq);
        return (-ENOMEM);
    }

    rxq->mb_pool = mp;
    rxq->nb_rx_desc = nb_desc;
    rxq->pthresh = rx_conf->rx_thresh.pthresh;
    rxq->hthresh = rx_conf->rx_thresh.hthresh;
    rxq->wthresh = rx_conf->rx_thresh.wthresh;
    rxq->rx_free_thresh = rx_conf->rx_free_thresh;
    rxq->queue_id = queue_idx;
    rxq->port_id = dev->data->port_id;
    /* CRC bytes are counted only when hw_strip_crc is off. */
    rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ?
                0 : ETHER_CRC_LEN);

    /* Addresses of this queue's RDT and RDH registers. */
    rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
    rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(queue_idx));
    /* Physical address of the descriptor ring (translated under Xen Dom0). */
#ifndef RTE_LIBRTE_XEN_DOM0
    rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
#else
    rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
#endif
    /* Virtual address of the descriptor ring. */
    rxq->rx_ring = (struct e1000_rx_desc *) rz->addr;

    PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
        rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);

    /* Publish the queue, then reset its state. */
    dev->data->rx_queues[queue_idx] = rxq;
    em_reset_rx_queue(rxq);

    return (0);
}

 

TX queue setup

/*
 * Set up one TX queue of an Ethernet port.
 *
 * Runs the generic checks (primary process, valid port and queue index,
 * port not started, driver hook present) and then forwards the request to
 * the driver's tx_queue_setup hook.
 *
 * Returns the hook's result, or -EINVAL for an invalid port/queue index,
 * or -EBUSY when the port is already started.  The *_OR_ERR_RET macros
 * presumably return their argument on failure (definitions not visible
 * here).
 */
int
rte_eth_tx_queue_setup(uint8_t port_id, uint16_t tx_queue_id,
               uint16_t nb_tx_desc, unsigned int socket_id,
               const struct rte_eth_txconf *tx_conf)
{
    struct rte_eth_dev *ethdev;

    /* Queue set-up is only safe from the primary process. */
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (!(port_id < RTE_MAX_ETHPORTS && port_id < nb_ports)) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return -EINVAL;
    }

    ethdev = &rte_eth_devices[port_id];

    if (!(tx_queue_id < ethdev->data->nb_tx_queues)) {
        PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id);
        return -EINVAL;
    }

    /* Queues must be initialised before the device is started. */
    if (ethdev->data->dev_started != 0) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    /* Delegate to the PMD's own tx_queue_setup implementation. */
    FUNC_PTR_OR_ERR_RET(*ethdev->dev_ops->tx_queue_setup, -ENOTSUP);
    return ethdev->dev_ops->tx_queue_setup(ethdev, tx_queue_id,
                           nb_tx_desc, socket_id,
                           tx_conf);
}

 

/*
 * em PMD implementation of TX queue set-up.
 *
 * Validates the descriptor count and the TX thresholds, releases any queue
 * previously stored at queue_idx, reserves the descriptor ring memzone,
 * allocates the queue structure and its software ring, fills in the queue
 * fields, resets the queue and publishes it in
 * dev->data->tx_queues[queue_idx].
 *
 * Returns 0 on success, -EINVAL for an invalid descriptor count or
 * threshold combination, -ENOMEM when a reservation/allocation fails.
 */
int
eth_em_tx_queue_setup(struct rte_eth_dev *dev,
             uint16_t queue_idx,
             uint16_t nb_desc,
             unsigned int socket_id,
             const struct rte_eth_txconf *tx_conf)
{
    const struct rte_memzone *tz;
    struct em_tx_queue *txq;
    struct e1000_hw     *hw;
    uint32_t tsize;
    uint16_t tx_rs_thresh, tx_free_thresh;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* The ring's byte size must be a multiple of EM_ALIGN. */
    /*
     * Validate number of transmit descriptors.
     * It must not exceed hardware maximum, and must be multiple
     * of EM_ALIGN.
     */
    if (((nb_desc * sizeof(*txq->tx_ring)) % EM_ALIGN) != 0 ||
            (nb_desc > EM_MAX_RING_DESC) ||
            (nb_desc < EM_MIN_RING_DESC)) {
        return -(EINVAL);
    }

    /*
     * Threshold configuration: a value of 0 in tx_conf selects a default.
     * tx_free_thresh defaults to min(nb_desc / 4, DEFAULT_TX_FREE_THRESH),
     * tx_rs_thresh to min(tx_free_thresh, DEFAULT_TX_RS_THRESH).
     */
    tx_free_thresh = tx_conf->tx_free_thresh;
    if (tx_free_thresh == 0)
        tx_free_thresh = (uint16_t)RTE_MIN(nb_desc / 4,
                    DEFAULT_TX_FREE_THRESH);

    tx_rs_thresh = tx_conf->tx_rs_thresh;
    if (tx_rs_thresh == 0)
        tx_rs_thresh = (uint16_t)RTE_MIN(tx_free_thresh,
                    DEFAULT_TX_RS_THRESH);

    if (tx_free_thresh >= (nb_desc - 3)) {
        RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
            "number of TX descriptors minus 3. (tx_free_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }
    if (tx_rs_thresh > tx_free_thresh) {
        RTE_LOG(ERR, PMD, "tx_rs_thresh must be less than or equal to "
            "tx_free_thresh. (tx_free_thresh=%u tx_rs_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
            (unsigned int)tx_rs_thresh, (int)dev->data->port_id,
                            (int)queue_idx);
        return -(EINVAL);
    }

    /*
     * If rs_bit_thresh is greater than 1, then TX WTHRESH should be
     * set to 0. If WTHRESH is greater than zero, the RS bit is ignored
     * by the NIC and all descriptors are written back after the NIC
     * accumulates WTHRESH descriptors.
     */
    if (tx_conf->tx_thresh.wthresh != 0 && tx_rs_thresh != 1) {
        RTE_LOG(ERR, PMD, "TX WTHRESH must be set to 0 if "
            "tx_rs_thresh is greater than 1. (tx_rs_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_rs_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }

    /* A queue already exists at this index: release it first. */
    /* Free memory prior to re-allocation if needed... */
    if (dev->data->tx_queues[queue_idx] != NULL) {
        em_tx_queue_release(dev->data->tx_queues[queue_idx]);
        dev->data->tx_queues[queue_idx] = NULL;
    }

    /*
     * Allocate TX ring hardware descriptors. A memzone large enough to
     * handle the maximum ring size is allocated in order to allow for
     * resizing in later calls to the queue setup function.
     * NOTE(review): the article gives the zone name pattern as
     * "rte_em_pmd_tx_ring_<port>_<queue>"; the naming happens inside
     * ring_dma_zone_reserve and is not visible here.
     */
    tsize = sizeof (txq->tx_ring[0]) * EM_MAX_RING_DESC;
    if ((tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx, tsize,
            socket_id)) == NULL)
        return (-ENOMEM);

    /* Allocate the tx queue data structure. */
    if ((txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
            CACHE_LINE_SIZE)) == NULL)
        return (-ENOMEM);

    /* Allocate software ring: one entry per descriptor */
    if ((txq->sw_ring = rte_zmalloc("txq->sw_ring",
            sizeof(txq->sw_ring[0]) * nb_desc,
            CACHE_LINE_SIZE)) == NULL) {
        em_tx_queue_release(txq);
        return (-ENOMEM);
    }

    txq->nb_tx_desc = nb_desc;
    txq->tx_free_thresh = tx_free_thresh;
    txq->tx_rs_thresh = tx_rs_thresh;
    txq->pthresh = tx_conf->tx_thresh.pthresh;
    txq->hthresh = tx_conf->tx_thresh.hthresh;
    txq->wthresh = tx_conf->tx_thresh.wthresh;
    txq->queue_id = queue_idx;
    txq->port_id = dev->data->port_id;

    /* Address of this queue's TDT register. */
    txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));

    /* Physical address of the TX ring (translated under Xen Dom0). */
#ifndef RTE_LIBRTE_XEN_DOM0
    txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
#else
    txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
#endif
    /* Virtual address of the TX ring. */
    txq->tx_ring = (struct e1000_data_desc *) tz->addr;

    PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
        txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);

    /*
     * Reset the queue state.
     * NOTE(review): per the article this links the software-ring entries
     * into a circular list (last entry points back to the first); the
     * helper's body is not visible here.
     */
    em_reset_tx_queue(txq);

    dev->data->tx_queues[queue_idx] = txq;
    return (0);
}

端口初始化的最后一步是使能端口的收发包功能, 其中主要是把每个队列tx ring和rx ring的物理地址和长度写入E1000网卡的寄存器(通知的对象是硬件, 而不是驱动), 细节就不再跟进

/*
 * Program the transmit side of an em device.
 *
 * For every configured TX queue the ring length, ring base address,
 * head/tail pointers and threshold register are written; afterwards the
 * Transmit Control Register is updated with E1000_TCTL_EN set, which turns
 * the transmit unit on.
 */
void
eth_em_tx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw     *hw;
    struct em_tx_queue *txq;
    uint32_t tctl;
    uint32_t txdctl;
    uint16_t i;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* Write each queue's TX ring bus address and length to the device. */
    /* Setup the Base and Length of the Tx Descriptor Rings. */
    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        uint64_t bus_addr;

        txq = dev->data->tx_queues[i];
        bus_addr = txq->tx_ring_phys_addr;
        /* Ring length in bytes. */
        E1000_WRITE_REG(hw, E1000_TDLEN(i),
                txq->nb_tx_desc *
                sizeof(*txq->tx_ring));
        /* 64-bit ring base address, split into high and low halves. */
        E1000_WRITE_REG(hw, E1000_TDBAH(i),
                (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);

        /* Setup the HW Tx Head and Tail descriptor pointers. */
        E1000_WRITE_REG(hw, E1000_TDT(i), 0);
        E1000_WRITE_REG(hw, E1000_TDH(i), 0);

        /* Setup Transmit threshold registers. */
        txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
        /*
         * bit 22 is reserved, on some models should always be 0,
         * on others  - always 1.
         */
        /* Keep only the E1000_TXDCTL_COUNT_DESC bits of the current value. */
        txdctl &= E1000_TXDCTL_COUNT_DESC;
        /* pthresh/hthresh/wthresh: 6 bits each at bit offsets 0, 8 and 16. */
        txdctl |= txq->pthresh & 0x3F;
        txdctl |= (txq->hthresh & 0x3F) << 8;
        txdctl |= (txq->wthresh & 0x3F) << 16;
        txdctl |= E1000_TXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
    }

    /* Program the Transmit Control Register. */
    tctl = E1000_READ_REG(hw, E1000_TCTL);
    /* Clear the collision-threshold field before setting the new value. */
    tctl &= ~E1000_TCTL_CT;
    tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
         (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));

    /* This write will effectively turn on the transmit unit. */
    E1000_WRITE_REG(hw, E1000_TCTL, tctl);
}

 

/*
 * Program the receive side of an em device (excerpt: the tail of the
 * function is elided in this listing).
 *
 * Disables reception, adjusts RFCTL, selects the RX burst function,
 * derives the RX buffer size from the queues' mbuf pools, and for every RX
 * queue allocates its mbufs and writes the ring length, base address,
 * head/tail pointers and threshold register.
 *
 * Returns 0 on success, or the non-zero value returned by
 * em_alloc_rx_queue_mbufs().
 */
int
eth_em_rx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw *hw;
    struct em_rx_queue *rxq;
    uint32_t rctl;
    uint32_t rfctl;
    uint32_t rxcsum;
    uint32_t rctl_bsize;
    uint16_t i;
    int ret;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /*
     * Make sure receives are disabled while setting
     * up the descriptor ring.
     */
    rctl = E1000_READ_REG(hw, E1000_RCTL);
    E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);

    rfctl = E1000_READ_REG(hw, E1000_RFCTL);

    /* Disable extended descriptor type. */
    rfctl &= ~E1000_RFCTL_EXTEN;
    /* Disable accelerated acknowledge */
    if (hw->mac.type == e1000_82574)
        rfctl |= E1000_RFCTL_ACK_DIS;

    E1000_WRITE_REG(hw, E1000_RFCTL, rfctl);

    /*
     * XXX TEMPORARY WORKAROUND: on some systems with 82573
     * long latencies are observed, like Lenovo X60. This
     * change eliminates the problem, but since having positive
     * values in RDTR is a known source of problems on other
     * platforms another solution is being sought.
     */
    if (hw->mac.type == e1000_82573)
        E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

    /* Default burst function; may be replaced by the scattered one below. */
    dev->rx_pkt_burst = (eth_rx_burst_t)eth_em_recv_pkts;

    /*
     * Determine RX bufsize: the smallest usable data room (data room minus
     * headroom) over all queues' pools, capped at EM_MAX_BUF_SIZE.
     * NOTE(review): the subtraction is not guarded against a data room
     * smaller than RTE_PKTMBUF_HEADROOM.
     */
    rctl_bsize = EM_MAX_BUF_SIZE;
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        struct rte_pktmbuf_pool_private *mbp_priv;
        uint32_t buf_size;

        rxq = dev->data->rx_queues[i];
        mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
        buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
        rctl_bsize = RTE_MIN(rctl_bsize, buf_size);
    }

    rctl |= em_rctl_bsize(hw->mac.type, &rctl_bsize);

    /* Configure and enable each RX queue. */
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        uint64_t bus_addr;
        uint32_t rxdctl;

        rxq = dev->data->rx_queues[i];

        /*
         * Allocate buffers for descriptor rings and setup queue.
         * NOTE(review): per the article this takes mbufs from the pool,
         * stores them in rxq->sw_ring and records each buffer's physical
         * address in rxq->rx_ring; the helper's body is not visible here.
         */
        ret = em_alloc_rx_queue_mbufs(rxq);
        if (ret)
            return ret;

        /* The RX ring address and length are written to the device below. */

        /*
         * Reset crc_len in case it was changed after queue setup by a
         *  call to configure
         */
        rxq->crc_len =
            (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
                            0 : ETHER_CRC_LEN);

        bus_addr = rxq->rx_ring_phys_addr;
        /* Ring length in bytes. */
        E1000_WRITE_REG(hw, E1000_RDLEN(i),
                rxq->nb_rx_desc *
                sizeof(*rxq->rx_ring));
        /* 64-bit ring base address, split into high and low halves. */
        E1000_WRITE_REG(hw, E1000_RDBAH(i),
                (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);

        /* Head at 0, tail at the last descriptor. */
        E1000_WRITE_REG(hw, E1000_RDH(i), 0);
        E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);

        /*
         * NOTE(review): the value is read from RXDCTL(0) but written to
         * RXDCTL(i); confirm whether reading queue 0 is intentional for
         * i > 0.
         */
        rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
        rxdctl &= 0xFE000000;
        /* pthresh/hthresh/wthresh: 6 bits each at bit offsets 0, 8 and 16. */
        rxdctl |= rxq->pthresh & 0x3F;
        rxdctl |= (rxq->hthresh & 0x3F) << 8;
        rxdctl |= (rxq->wthresh & 0x3F) << 16;
        rxdctl |= E1000_RXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);

        /* Select the scattered (multi-buffer) receive function if needed. */
        /*
         * Due to EM devices not having any sort of hardware
         * limit for packet length, jumbo frame of any size
         * can be accepted, thus we have to enable scattered
         * rx if jumbo frames are enabled (or if buffer size
         * is too small to accommodate non-jumbo packets)
         * to avoid splitting packets that don't fit into
         * one buffer.
         */
        if (dev->data->dev_conf.rxmode.jumbo_frame ||
                rctl_bsize < ETHER_MAX_LEN) {
            dev->rx_pkt_burst =
                (eth_rx_burst_t)eth_em_recv_scattered_pkts;
            dev->data->scattered_rx = 1;
        }
    }

    /* The remainder of the function is omitted in this excerpt. */
    ...

    return 0;
}

 

到此端口初始化完成并启动, 回到main函数中, 在每个lcore上启动循环收包函数

/*
 * Launch l2fwd_launch_one_lcore (with a NULL argument) on every lcore.
 * NOTE(review): CALL_MASTER presumably makes the master lcore run the
 * function as well - confirm against the EAL launch API.
 */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);

lcore的主线程处理如下

/*
 * Per-lcore forwarding loop.
 *
 * Returns immediately when the calling lcore has no RX port assigned.
 * Otherwise it loops forever: once more than drain_tsc cycles have passed
 * since the last drain it sends every non-empty per-port TX buffer (and, on
 * the master lcore, prints the statistics when the timer period expired);
 * then it polls queue 0 of each assigned RX port and hands every received
 * packet to l2fwd_simple_forward().
 */
static void
l2fwd_main_loop(void)
{
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    struct rte_mbuf *rx_burst[MAX_PKT_BURST];
    struct lcore_queue_conf *conf;
    uint64_t last_drain_tsc = 0;
    uint64_t stats_tsc = 0;
    uint64_t now_tsc, elapsed_tsc;
    unsigned self, port, idx, pkt, received;

    self = rte_lcore_id();
    conf = &lcore_queue_conf[self];

    if (conf->n_rx_port == 0) {
        RTE_LOG(INFO, L2FWD, "lcore %u has nothing to do\n", self);
        return;
    }

    RTE_LOG(INFO, L2FWD, "entering main loop on lcore %u\n", self);

    /* Log the ports (queues) this lcore is responsible for. */
    for (idx = 0; idx < conf->n_rx_port; idx++) {
        port = conf->rx_port_list[idx];
        RTE_LOG(INFO, L2FWD, " -- lcoreid=%u portid=%u\n", self,
            port);
    }

    for (;;) {
        now_tsc = rte_rdtsc();
        elapsed_tsc = now_tsc - last_drain_tsc;

        /*
         * TX burst queue drain: buffered packets are only sent out (and
         * statistics only printed) once the drain interval has elapsed.
         */
        if (unlikely(elapsed_tsc > drain_tsc)) {

            for (port = 0; port < RTE_MAX_ETHPORTS; port++) {
                /* Only ports with pending packets need a send. */
                if (conf->tx_mbufs[port].len != 0) {
                    l2fwd_send_burst(conf,
                             conf->tx_mbufs[port].len,
                             (uint8_t) port);

                    /* Everything pending was handed over: buffer is empty. */
                    conf->tx_mbufs[port].len = 0;
                }
            }

            /* Statistics timer, only when enabled. */
            if (timer_period > 0) {
                stats_tsc += elapsed_tsc;

                /* Timeout reached: print and re-arm, master lcore only. */
                if (unlikely(stats_tsc >= (uint64_t) timer_period) &&
                    self == rte_get_master_lcore()) {
                    print_stats();
                    stats_tsc = 0;
                }
            }

            last_drain_tsc = now_tsc;
        }

        /* Read packets from the RX queues owned by this lcore. */
        for (idx = 0; idx < conf->n_rx_port; idx++) {
            port = conf->rx_port_list[idx];

            /* Each port has a single RX queue: queue 0. */
            received = rte_eth_rx_burst((uint8_t) port, 0,
                            rx_burst, MAX_PKT_BURST);

            /* Account for the received packets. */
            port_statistics[port].rx += received;

            /* Forward every received packet. */
            for (pkt = 0; pkt < received; pkt++) {
                struct rte_mbuf *mbuf = rx_burst[pkt];

                /* Prefetch the packet data before it is touched. */
                rte_prefetch0(rte_pktmbuf_mtod(mbuf, void *));

                l2fwd_simple_forward(mbuf, port);
            }
        }
    }
}

 

首先看报文是如何收上来的, 调用device的rx_pkt_burst

/*
 * Poll one RX queue of an Ethernet port.
 *
 * Looks up the device entry for port_id and forwards the request to that
 * device's rx_pkt_burst callback, passing the queue object stored at
 * index queue_id. Neither index is range-checked here.
 *
 * Returns whatever the callback returns.
 */
static inline uint16_t
rte_eth_rx_burst(uint8_t port_id, uint16_t queue_id,
         struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *port = rte_eth_devices + port_id;

    return port->rx_pkt_burst(port->data->rx_queues[queue_id],
                  rx_pkts, nb_pkts);
}

PMD的收包函数如下:

/*
 * Receive burst function of the em PMD.
 *
 * Walks the RX descriptor ring of rx_queue starting at rxq->rx_tail. For
 * every descriptor whose DD status bit is set, a freshly allocated mbuf is
 * swapped into the ring and the mbuf that was there is initialised and
 * stored in rx_pkts.
 *
 * rx_queue: queue to poll (used as a struct em_rx_queue *).
 * rx_pkts:  output array receiving the mbuf pointers.
 * nb_pkts:  maximum number of packets to return.
 *
 * Returns the number of mbufs stored in rx_pkts. The loop stops early when
 * it reaches a descriptor without DD set, or when no replacement mbuf can
 * be allocated (in which case rx_mbuf_alloc_failed is incremented).
 */
uint16_t
eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts)
{
    /*
     * volatile stops the compiler from optimising the accesses away: each
     * use must reload from memory instead of reusing a register value.
     */
    volatile struct e1000_rx_desc *rx_ring;
    volatile struct e1000_rx_desc *rxdp;
    struct em_rx_queue *rxq;
    struct em_rx_entry *sw_ring;
    struct em_rx_entry *rxe;
    struct rte_mbuf *rxm;
    struct rte_mbuf *nmb;
    struct e1000_rx_desc rxd;
    uint64_t dma_addr;
    uint16_t pkt_len;
    uint16_t rx_id;
    uint16_t nb_rx;
    uint16_t nb_hold;
    uint8_t status;

    rxq = rx_queue;

    nb_rx = 0;
    nb_hold = 0;
    rx_id = rxq->rx_tail;       /* current receive position */
    rx_ring = rxq->rx_ring;     /* RX descriptor ring */
    sw_ring = rxq->sw_ring;     /* software ring holding the mbuf pointers */

    /* Receive at most nb_pkts packets per call */
    while (nb_rx < nb_pkts) {
        /*
         * The order of operations here is important as the DD status
         * bit must not be read after any other descriptor fields.
         * rx_ring and rxdp are pointing to volatile data so the order
         * of accesses cannot be reordered by the compiler. If they were
         * not volatile, they could be reordered which could lead to
         * using invalid descriptor fields when read from rxd.
         */
        
        /* Descriptor of the current packet */
        rxdp = &rx_ring[rx_id];

        /* Completion flag (DD); it must be read first */
        status = rxdp->status;
        if (! (status & E1000_RXD_STAT_DD))
            break;

        /* Take a local copy of the descriptor */
        rxd = *rxdp;

        /*
         * End of packet.
         *
         * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
         * likely to be invalid and to be dropped by the various
         * validation checks performed by the network stack.
         *
         * Allocate a new mbuf to replenish the RX ring descriptor.
         * If the allocation fails:
         *    - arrange for that RX descriptor to be the first one
         *      being parsed the next time the receive function is
         *      invoked [on the same queue].
         *
         *    - Stop parsing the RX ring and return immediately.
         *
         * This policy do not drop the packet received in the RX
         * descriptor for which the allocation of a new mbuf failed.
         * Thus, it allows that packet to be later retrieved if
         * mbuf have been freed in the mean time.
         * As a side effect, holding RX descriptors instead of
         * systematically giving them back to the NIC may lead to
         * RX ring exhaustion situations.
         * However, the NIC can gracefully prevent such situations
         * to happen by sending specific "back-pressure" flow control
         * frames to its peer(s).
         */
        PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
            "status=0x%x pkt_len=%u\n",
            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
            (unsigned) rx_id, (unsigned) status,
            (unsigned) rte_le_to_cpu_16(rxd.length));

        /* Allocate a new mbuf to give to the ring in place of the filled one */
        nmb = rte_rxmbuf_alloc(rxq->mb_pool);
        if (nmb == NULL) {
            PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
                "queue_id=%u\n",
                (unsigned) rxq->port_id,
                (unsigned) rxq->queue_id);
            rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
            break;
        }

        /* One more descriptor is now held by software */
        nb_hold++;

        /* sw_ring entry holding the mbuf that was just received */
        rxe = &sw_ring[rx_id];

        /* Advance the receive position, wrapping at the end of the ring */
        rx_id++;
        if (rx_id == rxq->nb_rx_desc)
            rx_id = 0;

        /* Prefetch the next mbuf so it is ready for the next iteration */
        /* Prefetch next mbuf while processing current one. */
        rte_em_prefetch(sw_ring[rx_id].mbuf);

        /* Prefetch the upcoming descriptors and mbuf pointers for later iterations */
        /* Original author's note: one 64-byte cache line holds 4 descriptors */
        /*
         * When next RX descriptor is on a cache-line boundary,
         * prefetch the next 4 RX descriptors and the next 8 pointers
         * to mbufs.
         */
        if ((rx_id & 0x3) == 0) {
            rte_em_prefetch(&rx_ring[rx_id]);
            rte_em_prefetch(&sw_ring[rx_id]);
        }

        /* Rearm RXD: attach new mbuf and reset status to zero. */

        /* Swap the mbuf pointer stored in the sw_ring entry */
        rxm = rxe->mbuf;
        rxe->mbuf = nmb;
        dma_addr =
            rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
        rxdp->buffer_addr = dma_addr;

        /* Reset the status of the current descriptor */
        rxdp->status = 0;

        /*
         * Initialize the returned mbuf.
         * 1) setup generic mbuf fields:
         *    - number of segments,
         *    - next segment,
         *    - packet length,
         *    - RX port identifier.
         * 2) integrate hardware offload data, if any:
         *    - RSS flag & hash,
         *    - IP checksum flag,
         *    - VLAN TCI, if any,
         *    - error flags.
         */
        pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.length) -
                rxq->crc_len);
        rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
        rte_packet_prefetch(rxm->pkt.data);
        rxm->pkt.nb_segs = 1;
        rxm->pkt.next = NULL;
        rxm->pkt.pkt_len = pkt_len;
        rxm->pkt.data_len = pkt_len;
        rxm->pkt.in_port = rxq->port_id;

        rxm->ol_flags = rx_desc_status_to_pkt_flags(status);
        rxm->ol_flags = (uint16_t)(rxm->ol_flags |
                rx_desc_error_to_pkt_flags(rxd.errors));

        /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
        rxm->pkt.vlan_macip.f.vlan_tci = rte_le_to_cpu_16(rxd.special);

        /* Hand the received mbuf back to the caller */
        /*
         * Store the mbuf address into the next entry of the array
         * of returned packets.
         */
        rx_pkts[nb_rx++] = rxm;
    }

    /* Save the updated receive position */
    rxq->rx_tail = rx_id;

    /* Update the number of descriptors held by software */
    /*
     * If the number of free RX descriptors is greater than the RX free
     * threshold of the queue, advance the Receive Descriptor Tail (RDT)
     * register.
     * Update the RDT with the value of the last processed RX descriptor
     * minus 1, to guarantee that the RDT register is never equal to the
     * RDH register, which creates a "full" ring situtation from the
     * hardware point of view...
     */
    nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
    if (nb_hold > rxq->rx_free_thresh) {
        PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
            "nb_hold=%u nb_rx=%u\n",
            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
            (unsigned) rx_id, (unsigned) nb_hold,
            (unsigned) nb_rx);
        rx_id = (uint16_t) ((rx_id == 0) ?
            (rxq->nb_rx_desc - 1) : (rx_id - 1));
        E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
        nb_hold = 0;
    }
    rxq->nb_rx_hold = nb_hold;
    return (nb_rx);
}

 

发包函数

/*
 * Send a burst of packets on one TX queue of an Ethernet port.
 *
 * Looks up the device entry for port_id and forwards the request to that
 * device's tx_pkt_burst callback, passing the queue object stored at
 * index queue_id. Neither index is range-checked here.
 *
 * Returns whatever the callback returns.
 */
static inline uint16_t
rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
         struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *port = rte_eth_devices + port_id;

    return port->tx_pkt_burst(port->data->tx_queues[queue_id],
                  tx_pkts, nb_pkts);
}

实际调用的PMD发包函数如下:

/*
 * Transmit burst function of the em PMD.
 *
 * For each packet in tx_pkts, works out how many TX descriptors it needs
 * (one per segment, plus one context descriptor when a new offload context
 * is required), frees any mbuf still stored in the sw_ring entries being
 * reused, fills in the descriptors, and finally writes the new tail index
 * to the TDT register.
 *
 * tx_queue: queue to transmit on (used as a struct em_tx_queue *).
 * tx_pkts:  array of mbufs to send.
 * nb_pkts:  number of entries in tx_pkts.
 *
 * Returns the number of packets placed on the ring. If descriptors run out
 * and em_xmit_cleanup() cannot free any, the function stops early; when
 * that happens before the first packet it returns 0 without touching TDT.
 */
uint16_t
eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
        uint16_t nb_pkts)
{
    struct em_tx_queue *txq;
    struct em_tx_entry *sw_ring;
    struct em_tx_entry *txe, *txn;
    volatile struct e1000_data_desc *txr;
    volatile struct e1000_data_desc *txd;
    struct rte_mbuf     *tx_pkt;
    struct rte_mbuf     *m_seg;
    uint64_t buf_dma_addr;
    uint32_t popts_spec;
    uint32_t cmd_type_len;
    uint16_t slen;
    uint16_t ol_flags;
    uint16_t tx_id;
    uint16_t tx_last;
    uint16_t nb_tx;
    uint16_t nb_used;
    uint16_t tx_ol_req;
    uint32_t ctx;
    uint32_t new_ctx;
    union rte_vlan_macip hdrlen;

    txq = tx_queue;
    sw_ring = txq->sw_ring;
    txr     = txq->tx_ring;
    /* Current transmit position */
    tx_id   = txq->tx_tail;
    /* Old, already-sent mbufs are reclaimed from these entries before the new ones are stored */
    txe = &sw_ring[tx_id];

    /* Run a cleanup when too few TX descriptors are available */
    /* Determine if the descriptor ring needs to be cleaned. */
    if ((txq->nb_tx_desc - txq->nb_tx_free) > txq->tx_free_thresh) {
        em_xmit_cleanup(txq);
    }

    /* nb_pkts is the total number of packets to send */
    /* TX loop */
    for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
        new_ctx = 0;

        /* Pointer to the mbuf to send */
        tx_pkt = *tx_pkts++;

        /* Prefetch the mbuf currently stored in this entry, ahead of freeing it */
        RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);

        /*
         * Determine how many (if any) context descriptors
         * are needed for offload functionality.
         */
        ol_flags = tx_pkt->ol_flags;

        /* If hardware offload required */
        tx_ol_req = (uint16_t)(ol_flags & (PKT_TX_IP_CKSUM |
                            PKT_TX_L4_MASK));
        if (tx_ol_req) {
            hdrlen = tx_pkt->pkt.vlan_macip;
            /* Check whether a new context descriptor is needed */
            /* If new context to be built or reuse the exist ctx. */
            ctx = what_ctx_update(txq, tx_ol_req, hdrlen);

            /* Only allocate context descriptor if required*/
            new_ctx = (ctx == EM_CTX_NUM);
        }

        /* Descriptors needed = number of segments + 1 if a context descriptor is required */
        /*
         * Keep track of how many descriptors are used this loop
         * This will always be the number of segments + the number of
         * Context descriptors required to transmit the packet
         */
        nb_used = (uint16_t)(tx_pkt->pkt.nb_segs + new_ctx);

        /* End position; the slot at tx_id itself is used first, hence the -1 */
        /* 
         * The number of descriptors that must be allocated for a
         * packet is the number of segments of that packet, plus 1
         * Context Descriptor for the hardware offload, if any.
         * Determine the last TX descriptor to allocate in the TX ring
         * for the packet, starting from the current position (tx_id)
         * in the ring.
         */
        tx_last = (uint16_t) (tx_id + nb_used - 1);

        /* Wrap around */
        /* Circular ring */
        if (tx_last >= txq->nb_tx_desc)
            tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);

        PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
            " tx_first=%u tx_last=%u\n",
            (unsigned) txq->port_id,
            (unsigned) txq->queue_id,
            (unsigned) tx_pkt->pkt.pkt_len,
            (unsigned) tx_id,
            (unsigned) tx_last);

        /*
         * Make sure there are enough TX descriptors available to
         * transmit the entire packet.
         * nb_used better be less than or equal to txq->tx_rs_thresh
         */
        while (unlikely (nb_used > txq->nb_tx_free)) {
            PMD_TX_FREE_LOG(DEBUG,
                    "Not enough free TX descriptors "
                    "nb_used=%4u nb_free=%4u "
                    "(port=%d queue=%d)",
                    nb_used, txq->nb_tx_free,
                    txq->port_id, txq->queue_id);

            if (em_xmit_cleanup(txq) != 0) {
                /* Could not clean any descriptors */
                if (nb_tx == 0)
                    return (0);
                goto end_of_tx;
            }
        }

        /*
         * By now there are enough free TX descriptors to transmit
         * the packet.
         */

        /*
         * Set common flags of all TX Data Descriptors.
         *
         * The following bits must be set in all Data Descriptors:
         *    - E1000_TXD_DTYP_DATA
         *    - E1000_TXD_DTYP_DEXT
         *
         * The following bits must be set in the first Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_POPTS_IXSM
         *    - E1000_TXD_POPTS_TXSM
         *
         * The following bits must be set in the last Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_CMD_VLE
         *    - E1000_TXD_CMD_IFCS
         *
         * The following bits must only be set in the last Data
         * Descriptor:
         *   - E1000_TXD_CMD_EOP
         *
         * The following bits can be set in any Data Descriptor, but
         * are only set in the last Data Descriptor:
         *   - E1000_TXD_CMD_RS
         */
        cmd_type_len = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
            E1000_TXD_CMD_IFCS;
        popts_spec = 0;

        /* Set VLAN Tag offload fields. */
        if (ol_flags & PKT_TX_VLAN_PKT) {
            cmd_type_len |= E1000_TXD_CMD_VLE;
            popts_spec = tx_pkt->pkt.vlan_macip.f.vlan_tci <<
                E1000_TXD_VLAN_SHIFT;
        }

        if (tx_ol_req) {
            /*
             * Setup the TX Context Descriptor if required
             */
            if (new_ctx) {
                volatile struct e1000_context_desc *ctx_txd;

                /* When a context descriptor is needed, it occupies the slot at tx_id */
                ctx_txd = (volatile struct e1000_context_desc *)
                    &txr[tx_id];

                /* Next sw_ring entry */
                txn = &sw_ring[txe->next_id];
                RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);

                if (txe->mbuf != NULL) {
                    rte_pktmbuf_free_seg(txe->mbuf);
                    txe->mbuf = NULL;
                }

                /* Set up the context via em_set_xmit_ctx (original note: stores the ctx values into txq) */
                em_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
                    hdrlen);

                txe->last_id = tx_last;

                /* Advance both tx_id and txe to the next slot */
                tx_id = txe->next_id;
                txe = txn;
            }

            /*
             * Setup the TX Data Descriptor,
             * This path will go through
             * whatever new/reuse the context descriptor
             */
            popts_spec |= tx_desc_cksum_flags_to_upper(ol_flags);
        }

        m_seg = tx_pkt;
        do {
            txd = &txr[tx_id];
            txn = &sw_ring[txe->next_id];

            /*
             * Free the mbuf still stored in this entry. Original author's
             * note: it was already sent; its packet address had been
             * written into the TX descriptor, so the mbuf is no longer
             * needed.
             */
            if (txe->mbuf != NULL)
                rte_pktmbuf_free_seg(txe->mbuf);

            /* Store the current segment in this entry */
            txe->mbuf = m_seg;

            /*
             * Set up Transmit Data Descriptor.
             */
            slen = m_seg->pkt.data_len;
            buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);

            txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
            txd->lower.data = rte_cpu_to_le_32(cmd_type_len | slen);
            txd->upper.data = rte_cpu_to_le_32(popts_spec);

            txe->last_id = tx_last;

            /* Advance tx_id */
            tx_id = txe->next_id;
            txe = txn;
            m_seg = m_seg->pkt.next;
        } while (m_seg != NULL);

        /* Driver-specific flags (VLAN, IP checksum and so on); not covered in detail here */
        /*
         * The last packet data descriptor needs End Of Packet (EOP)
         */
        cmd_type_len |= E1000_TXD_CMD_EOP;
        txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);

        /* Set RS bit only on threshold packets' last descriptor */
        if (txq->nb_tx_used >= txq->tx_rs_thresh) {
            PMD_TX_FREE_LOG(DEBUG,
                    "Setting RS bit on TXD id="
                    "%4u (port=%d queue=%d)",
                    tx_last, txq->port_id, txq->queue_id);

            cmd_type_len |= E1000_TXD_CMD_RS;

            /* Update txq RS bit counters */
            txq->nb_tx_used = 0;
        }
        txd->lower.data |= rte_cpu_to_le_32(cmd_type_len);
    }
end_of_tx:
    rte_wmb();

    /* Signal that there are packets to send by writing the tail register */
    /*
     * Set the Transmit Descriptor Tail (TDT)
     */
    PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
        (unsigned) txq->port_id, (unsigned) txq->queue_id,
        (unsigned) tx_id, (unsigned) nb_tx);
    E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);

    /* Save the new TX queue position */
    txq->tx_tail = tx_id;

    return (nb_tx);
}

 

至于驱动(E1000)内部的处理, 等需要时再分析.

posted @ 2014-04-04 12:56  chanwai1219  阅读(6721)  评论(0编辑  收藏  举报