数据包接收系列 — 下半部实现（软中断） - 张大大123

数据包接收系列 — 下半部实现（软中断）

本文主要内容：下半部的实现，分析数据包从上半部结束后到L3的处理过程。

内核版本：2.6.37

Author：zhangskd @ csdn blog

下半部的实现

接收数据包的下半部处理流程为：

net_rx_action // 软中断

|--> process_backlog() // 默认poll

|--> __netif_receive_skb() // L2处理函数

|--> ip_rcv() // L3入口

net_rx_action

软中断(NET_RX_SOFTIRQ)的处理函数net_rx_action()主要做了：

遍历sd->poll_list，对于每个处于轮询状态的设备，调用它的poll()函数来处理数据包。

当设备用完了本次的weight配额(work == weight)时：如果设备NAPI被禁止了，则把设备从sd->poll_list上删除，否则把设备移动到sd->poll_list的队尾。

每次软中断最多允许处理netdev_budget(默认300)个数据包，最长运行时间为2个jiffies(HZ=1000时约为2ms)。

每个设备一次最多允许处理weight_p(64)个数据包(非NAPI)。

如果在这次软中断中没处理完，则再次设置NET_RX_SOFTIRQ标志触发软中断。

/*
 * net_rx_action - handler for the NET_RX_SOFTIRQ softirq.
 * @h: softirq action descriptor (not referenced in this body).
 *
 * Polls the NAPI instances queued on this CPU's sd->poll_list until the list
 * is empty, the packet budget is spent, or the time limit has passed. In the
 * latter two cases the softirq is raised again so the rest is handled later.
 */
static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data); /* softnet_data instance of the current CPU */
    unsigned long time_limit = jiffies + 2; /* deadline for this run: 2 jiffies from now (wall-clock length depends on HZ) */
    int budget = netdev_budget; /* maximum number of packets handled in one softirq run */
    void *have;

    local_irq_disable(); /* disable local interrupts */

    /* As long as some device is still waiting to be polled */
    while(! list_empty(&sd->poll_list)) {

        struct napi_struct *n;
        int work, weight;
      
        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow an average
         * latency of 1.5/HZ.
         * In other words: stop once too many packets have been handled or
         * too much time has been spent.
         */
        if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable(); /* re-enable local interrupts */

        /* Even though interrupts have been re-enabled, this access is safe because
         * interrupts can only add new entries to the tail of this list, and only ->poll()
         * calls can remove this head entry from the list.
         */

        /* Take the first napi_struct instance on the list */
        n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

        have = netpoll_poll_lock(n);
        weight = n->weight; /* upper bound on packets this device may handle in one poll */

        /* This NAPI_STATE_SCHED test is for avoiding a race with netpoll's
         * poll_napi(). Only the entity which obtains the lock and sees NAPI_STATE_SCHED
         * set will actually make the ->poll() call. Therefore we avoid accidentally calling ->poll()
         * when NAPI is not scheduled.
         */
        work = 0;

        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            /* Call the instance's poll method; it returns the number of packets handled */
            work = n->poll(n, weight); /* e.g. process_backlog() for the backlog device */

            trace_napi_poll(n);
        }
        WARN_ON_ONCE(work > weight);

        budget -= work; /* charge the packets just handled against the total budget */

        local_irq_disable(); /* disable local interrupts */

        /* Only when the whole weight was consumed is the list position adjusted here */
        if (unlikely(work == weight)) {
            /* If a NAPI disable is pending, complete (dequeue) this napi_struct */
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();

            } else
                /* Otherwise move this napi_struct to the tail of poll_list */
                list_move_tail(&n->poll_list, &sd->poll_list);
        }
        netpoll_poll_unlock(have);
    }

out:
    net_rps_action_and_irq_enable(sd); /* also re-enables local interrupts, as its name indicates */

#ifdef CONFIG_NET_DMA
    ...
#endif

    return;

softnet_break:
    sd->time_squeeze++; /* count runs cut short by the budget or the time limit */
    __raise_softirq_irqoff(NET_RX_SOFTIRQ); /* work remains, so raise the softirq again */
    goto out;
}

当调用napi_struct的poll()来处理数据包时，本地中断是开启的，这意味着新的数据包可以继续添加到输入队列中。

process_backlog

如果网卡驱动不支持NAPI，则默认的napi_struct->poll()函数为process_backlog()。

process_backlog()的主要工作：

1. 处理sd->process_queue中的数据包

分别取出每个skb，从队列中删除。

开本地中断，调用__netif_receive_skb()把skb从L2传递到L3，然后关本地中断。

这说明在处理skb时，是允许网卡中断把数据包添加到接收队列(sd->input_pkt_queue)中的。

2. 如果处理完sd->process_queue中的数据包了，quota还没用完

把接收队列添加到sd->process_queue处理队列的尾部后，初始化接收队列。

接下来会继续处理sd->process_queue中的数据包。

3. 如果本次能处理完sd->process_queue和sd->input_pkt_queue中的所有数据包

把napi_struct从sd->poll_list队列中删除掉，清除NAPI_STATE_SCHED标志。

/*
 * process_backlog - poll method of the per-CPU backlog NAPI instance.
 * @napi:  the backlog napi_struct embedded in this CPU's softnet_data.
 * @quota: maximum number of packets to handle in this call.
 *
 * Drains sd->process_queue, refilling it from sd->input_pkt_queue whenever it
 * runs empty, and hands each skb to __netif_receive_skb().
 *
 * Returns the number of packets handled.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
    int work = 0;
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
    ...
#endif

    napi->weight = weight_p; /* per-poll packet limit (default 64) */
    local_irq_disable(); /* disable local interrupts */
    
    while(work < quota) { /* while quota remains */
        struct sk_buff *skb;
        unsigned int qlen;

        /* Dequeue the first skb from sd->process_queue.
         * sd->process_queue holds the packets that are about to be processed.
         */
        while((skb = __skb_dequeue(&sd->process_queue))) {
            local_irq_enable(); /* interrupts stay enabled while the skb is processed */

            __netif_receive_skb(skb); /* layer-2 processing, then hand-off to the network layer */

            local_irq_disable();
            input_queue_head_incr(sd);

            if (++work >= quota) { /* quota reached: stop and report the work done */
                local_irq_enable();
                return work;
            }
        }

        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue); /* length of the input queue */
        /* Append the input queue to the tail of sd->process_queue and reinitialise it */
        if (qlen)
            skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue);

        /* If everything that was just moved over fits in the remaining quota */
        if (qlen < quota - work) {
            /* Take the napi_struct off the poll list: all packets are about to be handled */
            list_del(&napi->poll_list);

            napi->state = 0; /* plain write that clears NAPI_STATE_SCHED (and any other state bit) */
            quota = work + qlen; /* shrink quota so the loop exits after these qlen packets */
        }
        rps_unlock(sd);
    }    
 
    local_irq_enable();
    return work;
}

从sk_buff_head队列中取出第一个skb，并把它从队列中删除。

/**
 * __skb_dequeue - remove from the head of the queue
 * @list: list to dequeue from
 * Remove the head of the list. 
 * The head item is returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
    struct sk_buff *skb = skb_peek(list); /* 取出队列的第一个元素 */
    if (skb)
        __skb_unlink(skb, list); /* 把skb从sk_buff_head队列中删除 */
    return skb;
}

把list添加到head的队尾，然后把list重新初始化。

/**
 * skb_queue_splice_tail_init - append one skb queue to another and reset the source
 * @list: queue whose buffers are moved
 * @head: queue that receives the buffers at its tail
 *
 * Both arguments are queues. Nothing happens when @list is empty; otherwise
 * its buffers are spliced in after the last element of @head, @head's length
 * is increased accordingly and @list is reinitialised.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head)
{
    if (skb_queue_empty(list))
        return;

    __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
    head->qlen += list->qlen;
    __skb_queue_head_init(list);
}

__netif_receive_skb

__netif_receive_skb()的主要工作为：

处理NETPOLL、网卡绑定、入口流量控制、桥接、VLAN。

遍历嗅探器(ETH_P_ALL)链表ptype_all。对于每个注册的sniffer(例如tcpdump)，调用它的处理函数packet_type->func()。

赋值skb->network_header，根据skb->protocol从三层协议哈希表ptype_base中找到对应的三层协议。如果三层协议是ETH_P_IP，相应的packet_type为ip_packet_type，协议处理函数为ip_rcv()。

/*
 * __netif_receive_skb - layer-2 receive processing for one packet.
 * @skb: the received buffer.
 *
 * Runs the netpoll check, bonding adjustment, ingress traffic control,
 * the device rx_handler (bridge/macvlan) and VLAN handling, delivers the
 * packet to every matching ETH_P_ALL sniffer on ptype_all, and finally to
 * the layer-3 handlers found in ptype_base for skb->protocol (ip_rcv() for
 * ETH_P_IP).
 *
 * Returns NET_RX_DROP when netpoll consumes the packet or when no layer-3
 * handler matches (the skb is freed in the latter case); otherwise the value
 * returned by the last handler that was invoked.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *master;
    struct net_device *null_or_orig;
    struct net_device *orig_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;

    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb); /* record the receive time in skb->tstamp */
    trace_netif_receive_skb(skb);

    /* If we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex; /* remember the index of the receiving device */

    /* Bonding: decide which device the packet is delivered on */
    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);

    if (skb->deliver_no_wcard)
        null_or_orig = orig_dev;
    else if (master) {
        if (skb_bond_should_drop(skb, master)) {
            skb->deliver_no_wcard = 1;
            null_or_orig = orig_dev; /* deliver only exact match */
        } else
            skb->dev = master;
    }

    __this_cpu_inc(softnet_data.processed); /* per-CPU count of processed packets */
    skb_reset_network_header(skb);   /* set skb->network_header */
    skb_reset_transport_header(skb); /* set skb->transport_header */
    skb->mac_len = skb->network_header - skb->mac_header; /* length of the MAC header */
    pt_prev = NULL;

    rcu_read_lock();

/* Ingress traffic control */
#ifdef CONFIG_NET_CLS_ACT
    if (skb->tc_verd & TC_NCLS) {
        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
        goto ncls;
    }
#endif

    /* Walk the sniffer (ETH_P_ALL) list ptype_all. Delivery is deferred by
     * one entry: each matching sniffer is remembered in pt_prev and actually
     * invoked when the next match is found, so the last match is left pending
     * for the code below.
     */
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
            ptype->dev == orig_dev) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev); /* run the previous sniffer's handler */
            pt_prev = ptype; /* unconditional: not part of the if above */
        }
    }

#ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
ncls:
#endif

    /* Handle special case of bridge or macvlan */
    rx_handler = rcu_dereference(skb->dev->rx_handler);
    if (rx_handler) {
        if (pt_prev) {
            /* flush the pending handler before the rx_handler may take the skb */
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        skb = rx_handler(skb);
        if (!skb)
            goto out;
    }

    /* VLAN handling */
    if (vlan_tx_tag_present(skb)) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }

        if (vlan_hwaccel_do_receive(&skb)) {
            /* restart processing for the skb handed back by the VLAN code */
            ret = __netif_receive_skb(skb);
            goto out;
        } else if (unlikely(!skb))
            goto out;
    }

    /* Make sure frames received on VLAN interfaces stacked on bonding
     * interfaces still make their way to any base bonding device that may
     * have registered for a specific ptype. The handler may have to adjust
     * skb->dev and orig_dev.
     */
    orig_or_bond = orig_dev;
    if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
        (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
        orig_or_bond = vlan_dev_real_dev(skb->dev);
    }

    type = skb->protocol; /* layer-3 protocol type */

    list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev
            || ptype->dev == orig_dev || ptype->dev == orig_or_bond)) {

            /* For ETH_P_IP the matching packet_type is ip_packet_type,
             * whose handler is ip_rcv().
             */
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        /* the last matching handler is called directly with the skb */
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else { /* no handler matched this packet */
        atomic_long_inc(&skb->dev->rx_dropped);
        kfree_skb(skb);
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}

L3协议处理函数

/* Number of buckets in the protocol hash table, and the mask used to index it. */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
static DEFINE_SPINLOCK(ptype_lock);

static struct list_head ptype_base[PTYPE_HASH_SIZE]; /* protocol hash table */
static struct list_head ptype_all; /* list of sniffers (ETH_P_ALL) */

packet_type用于描述一个协议：

/* Describes one protocol handler. */
struct packet_type {
    __be16 type; /* This is really htons(ether_type): the protocol code */
    struct net_device *dev; /* NULL is wildcarded here */

    /* Protocol receive handler, e.g. ip_rcv() */
    int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
    ...
    struct list_head list; /* list linkage used when walking ptype_all / ptype_base */
};

IP协议：

/* IP protocol layer initialiser:
 * packet-type descriptor that binds the ETH_P_IP protocol code to ip_rcv().
 */
static struct packet_type ip_packet_type = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
    ...
};
#define ETH_P_IP 0x0800 /* Internet Protocol packet */

posted on 2014-03-27 17:06 张大大123 阅读(617) 评论(0) 编辑收藏举报

刷新页面返回顶部

张大大123