ICMP 实现

以下代码取自 kernel-2.6.24 .

[数据结构]
/* One dispatch entry per ICMP type: the handler to run plus an error-class flag. */
struct icmp_control {
    void (*handler)(struct sk_buff *skb); /* handler invoked for a received ICMP message, selected by its type field */
    short   error;          /* This ICMP is classed as an error message */
};
/* Dispatch table indexed by ICMP type; it has NR_ICMP_TYPES+1 entries. */
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];

[/数据结构]
[初始化]
文件net/ipv4/af_inet.c中,函数
/* Excerpt of inet_init(): only the ICMP-related calls are shown; "......" marks code elided by the article. */
static int __init inet_init(void)
{
    ......
    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) /* register icmp_protocol as the handler for IPPROTO_ICMP */
        printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
    ......
    icmp_init(&inet_family_ops); /* ICMP protocol initialization */
    ......
}
icmp初始化函数
/* Per-CPU pointer to the ICMP control socket used for sending. */
static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;

/*
 * icmp_init - create and configure one ICMP control socket per possible CPU.
 * @ops: protocol family operations (not referenced in this function).
 *
 * Panics if any of the sockets cannot be created.
 */
void __init icmp_init(struct net_proto_family *ops)
{
    int cpu;

    for_each_possible_cpu(cpu) {
        struct inet_sock *isk;
        struct sock *sk;
        int rc;

        /* Create a raw ICMP socket and store it in this CPU's slot. */
        rc = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, &per_cpu(__icmp_socket, cpu));
        if (rc < 0)
            panic("Failed to create the ICMP control socket.\n");

        sk = per_cpu(__icmp_socket, cpu)->sk;

        /* Allocations made on behalf of this socket use GFP_ATOMIC. */
        sk->sk_allocation = GFP_ATOMIC;

        /* Enough space for 2 * 64K ICMP packets, including sk_buff struct overhead. */
        sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff)));

        isk = inet_sk(sk);
        isk->uc_ttl = -1;
        isk->pmtudisc = IP_PMTUDISC_DONT;

        /*
         * Unhash it so that IP input processing does not even see it,
         * we do not wish this socket to see incoming packets.
         */
        sk->sk_prot->unhash(sk);
    }
}
[/初始化]
[协议处理实现]
注册的协议处理函数,当ip向上递交数据包时,如果发现是icmp协议就会调用这个函数。
/* Protocol descriptor registered for IPPROTO_ICMP in inet_init(); received ICMP packets are handed to icmp_rcv(). */
static struct net_protocol icmp_protocol = {
    .handler =      icmp_rcv,
};
    处理进入的icmp包
/*
 * icmp_rcv - entry point for received ICMP packets.
 * @skb: the received packet.
 *
 * Verifies the checksum, validates the ICMP type, filters messages that
 * arrived via broadcast/multicast, and dispatches to the per-type handler
 * in icmp_pointers. The skb is always freed here. Always returns 0.
 */
int icmp_rcv(struct sk_buff *skb)
{
    struct rtable *route = (struct rtable *)skb->dst; /* routing cache entry attached to the packet */
    struct icmphdr *hdr;

    ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);

    /* Checksum verification, driven by the skb's checksum state. */
    switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE:
            /* A sum is already stored in skb->csum; a zero fold means it is valid. */
            if (!csum_fold(skb->csum))
                break;

            /* fall through */
        case CHECKSUM_NONE:
            /* Verify the checksum over the whole packet. */
            skb->csum = 0;
            if (__skb_checksum_complete(skb))
                goto error;
    }

    /* The ICMP header must be present; on success data moves past it. */
    if (!pskb_pull(skb, sizeof(struct icmphdr)))
        goto error;

    hdr = icmp_hdr(skb);
    ICMPMSGIN_INC_STATS_BH(hdr->type);

    /*
     *  18 is the highest 'known' ICMP type. Anything else is a mystery
     *  RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently discarded.
     */
    if (hdr->type > NR_ICMP_TYPES)
        goto error;

    /* Packet was addressed to a broadcast or multicast destination. */
    if (route->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
        switch (hdr->type) {
            case ICMP_ECHO:
            case ICMP_TIMESTAMP:
                /*
                 * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently
                 * ignored (we let user decide with a sysctl).
                 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded
                 * if to broadcast/multicast.
                 */
                if (sysctl_icmp_echo_ignore_broadcasts)
                    goto error;
                break;
            case ICMP_ADDRESS:
            case ICMP_ADDRESSREPLY:
                break;
            default:
                /* Every other type sent to broadcast/multicast is dropped. */
                goto error;
        }
    }

    /* Dispatch to the handler registered for this ICMP type. */
    icmp_pointers[hdr->type].handler(skb);
    goto drop;

error:
    ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
drop:
    kfree_skb(skb); /* the skb is released whether or not it was handled */
    return 0;
}
各类型的处理函数在内核中是静态初始化的。
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
    [ICMP_ECHOREPLY] = {
        .handler = icmp_discard, //空函数
    },
    [1] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [2] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [ICMP_DEST_UNREACH] = {
        .handler = icmp_unreach,
        .error = 1,
    },
    [ICMP_SOURCE_QUENCH] = {
        .handler = icmp_unreach,
        .error = 1,
    },
    [ICMP_REDIRECT] = {
        .handler = icmp_redirect,
        .error = 1,
    },
    [6] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [7] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [ICMP_ECHO] = {
        .handler = icmp_echo,
    },
    [9] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [10] = {
        .handler = icmp_discard,
        .error = 1,
    },
    [ICMP_TIME_EXCEEDED] = {
        .handler = icmp_unreach,
        .error = 1,
    },
    [ICMP_PARAMETERPROB] = {
        .handler = icmp_unreach,
        .error = 1,
    },
    [ICMP_TIMESTAMP] = {
        .handler = icmp_timestamp,
    },
    [ICMP_TIMESTAMPREPLY] = {
        .handler = icmp_discard,
    },
    [ICMP_INFO_REQUEST] = {
        .handler = icmp_discard,
    },
    [ICMP_INFO_REPLY] = {
        .handler = icmp_discard,
    },
    [ICMP_ADDRESS] = {
        .handler = icmp_address,
    },
    [ICMP_ADDRESSREPLY] = {
        .handler = icmp_address_reply,
    },
};
我们一个一个看。
    icmp接收到差错报文的处理。由上面的icmp_pointers表可知,ICMP_DEST_UNREACH、ICMP_SOURCE_QUENCH、ICMP_TIME_EXCEEDED 和 ICMP_PARAMETERPROB 都由这个函数处理。
static void icmp_unreach(struct sk_buff *skb)
{
    struct iphdr *iph;
    struct icmphdr *icmph;
    int hash, protocol;
    struct net_protocol *ipprot;
    struct sock *raw_sk;
    u32 info = 0;
    //数据部分包括了携带的ip头吗
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto out_err;

    icmph = icmp_hdr(skb); //icmp头
    iph   = (struct iphdr *)skb->data; //携带的ip头

    //ip头损坏
    if (iph->ihl < 5) /* Mangled header, drop. */
        goto out_err;

    if (icmph->type == ICMP_DEST_UNREACH) { //icmp类型是目的不可达
        switch (icmph->code & 15) { //错误码标识
            case ICMP_NET_UNREACH: //网络
            case ICMP_HOST_UNREACH: //主机
            case ICMP_PROT_UNREACH: //协议
            case ICMP_PORT_UNREACH: //端口
                break; //不可达
            case ICMP_FRAG_NEEDED: //需要分片
                if (ipv4_config.no_pmtu_disc) {
                    LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: fragmentation needed and DF set.\n", NIPQUAD(iph->daddr));
                } else {
                    //在到那个目的地址的路由缓存中保存mtu的大小,在发送数据时就会根据这个mtu大小进行分片
                    info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu));
                    if (!info)
                        goto out;
                }
            case ICMP_SR_FAILED:
                LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source Route Failed.\n", NIPQUAD(iph->daddr));
                break;
            default:
                break;
        }
        if (icmph->code > NR_ICMP_UNREACH) //超过限制,错误的的不可达码
            goto out;
    } else if (icmph->type == ICMP_PARAMETERPROB)
        info = ntohl(icmph->un.gateway) >> 24;
    //一些路由器会发送应答到广播地址,可能是用户工具引起的问题
    if (!sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(iph->daddr) == RTN_BROADCAST) {
        if (net_ratelimit())
            printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP type %u, code %u "
                    "error to a broadcast: %u.%u.%u.%u on %s\n", NIPQUAD(ip_hdr(skb)->saddr),
                    icmph->type, icmph->code, NIPQUAD(iph->daddr), skb->dev->name);
        goto out;
    }
    /* Checkin full IP header plus 8 bytes of protocol to avoid additional coding at protocol handlers. */
    if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) //ip头加8字节的协议
        goto out;

    iph = (struct iphdr *)skb->data;
    protocol = iph->protocol; //获取协议
    hash = protocol & (MAX_INET_PROTOS - 1);

    //递交icmp信息到 raw socket, why ??????
    read_lock(&raw_v4_lock);
    if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
        while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, iph->saddr, skb->dev->ifindex)) != NULL) {
            raw_err(raw_sk, skb, info);
            raw_sk = sk_next(raw_sk);
            iph = (struct iphdr *)skb->data;
        }
    }
    read_unlock(&raw_v4_lock);

    rcu_read_lock();
    ipprot = rcu_dereference(inet_protos[hash]); //根据协议查找协议处理结构
    if (ipprot && ipprot->err_handler) //如果有,调用相关的协议错误处理函数处理这个icmp不可达包
        ipprot->err_handler(skb, info);
    rcu_read_unlock();
out:
    return;
out_err:
    ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
    goto out;
}
    icmp重定向处理
/*
 * icmp_redirect - handle a received ICMP redirect.
 * @skb: received packet; skb->data is read as the IP header of the
 *       packet that triggered the redirect.
 *
 * For all four redirect codes, passes the advertised gateway to
 * ip_rt_redirect(). Packets too short to hold an IP header are counted
 * as input errors.
 */
static void icmp_redirect(struct sk_buff *skb)
{
    struct iphdr *iph;

    if (skb->len < sizeof(struct iphdr)) /* length check */
        goto out_err;
    /* Get the copied header of the packet that caused the redirect */
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto out;

    iph = (struct iphdr *)skb->data; /* embedded IP header */

    switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET: /* network redirect */
        case ICMP_REDIR_NETTOS:
            /* As per RFC recommendations now handle it as a host redirect.*/
        case ICMP_REDIR_HOST: /* host redirect */
        case ICMP_REDIR_HOSTTOS:
            /* Pass the sender, original destination, new gateway, original source and device to the routing code. */
            ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, icmp_hdr(skb)->un.gateway, iph->saddr, skb->dev);
            break;
    }
out:
    return;
out_err:
    ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
    goto out;
}
    icmp回显请求
static void icmp_echo(struct sk_buff *skb)
{
    if (!sysctl_icmp_echo_ignore_all) { //是否忽略回显请求
        struct icmp_bxm icmp_param;
        //保存一些icmp内容
        icmp_param.data.icmph      = *icmp_hdr(skb);
        icmp_param.data.icmph.type = ICMP_ECHOREPLY;
        icmp_param.skb             = skb;
        icmp_param.offset          = 0;
        icmp_param.data_len        = skb->len;
        icmp_param.head_len        = sizeof(struct icmphdr);
        icmp_reply(&icmp_param, skb);
    }
}
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
    struct sock *sk = icmp_socket->sk;
    struct inet_sock *inet = inet_sk(sk);
    struct ipcm_cookie ipc;
    struct rtable *rt = (struct rtable *)skb->dst; //路由缓存
    __be32 daddr;
    //解析其中的ip选项
    if (ip_options_echo(&icmp_param->replyopts, skb))
        return;

    if (icmp_xmit_lock()) //是否可以锁定这个cpu上的icmp_socket.
        return;
    icmp_param->data.icmph.checksum = 0;

    inet->tos = ip_hdr(skb)->tos;
    daddr = ipc.addr = rt->rt_src; //目的地址
    ipc.opt = NULL;
    if (icmp_param->replyopts.optlen) { //有ip选项
        ipc.opt = &icmp_param->replyopts;
        if (ipc.opt->srr)
            daddr = icmp_param->replyopts.faddr;
    }
    {
        struct flowi fl = { .nl_u = { .ip4_u =
            { .daddr = daddr,
                .saddr = rt->rt_spec_dst,
                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                 .proto = IPPROTO_ICMP };

        security_skb_classify_flow(skb, &fl);
        if (ip_route_output_key(&rt, &fl)) //路由查找,如果没找到那么什么也不发送了
            goto out_unlock;
    }
    //是否立即发送应答
    if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code))
        icmp_push_reply(icmp_param, &ipc, rt); //发送应答

    ip_rt_put(rt);
out_unlock:
    icmp_xmit_unlock();
}
    判断应答是否发送
/*
 * icmpv4_xrlim_allow - decide whether an outgoing ICMP message may be sent.
 * @rt:   route the message would use.
 * @type: ICMP type of the message.
 * @code: ICMP code of the message.
 *
 * Returns 1 to send, 0 to suppress. Rate limiting through xrlim_allow()
 * applies only to types whose bit is set in sysctl_icmp_ratemask.
 */
static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
{
    struct dst_entry *dst = &rt->u.dst;

    /*
     * Out-of-range types are allowed through unlimited.
     * NOTE(review): the original annotation suspects this should return 0;
     * the code as written returns 1 - confirm the intent.
     */
    if (type > NR_ICMP_TYPES)
        return 1;

    /* Don't limit PMTU discovery. */
    if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
        return 1;

    /* No rate limit on loopback */
    if (dst->dev && (dst->dev->flags & IFF_LOOPBACK))
        return 1;

    /* Types not selected in the rate mask are never limited. */
    if (!((1 << type) & sysctl_icmp_ratemask))
        return 1;

    /* Limit if icmp type is enabled in ratemask. */
    return xrlim_allow(dst, sysctl_icmp_ratelimit);
}
/* Tokens are capped at this many multiples of the timeout. */
#define XRLIM_BURST_FACTOR 6

/*
 * xrlim_allow - rate limiter keyed on a destination entry.
 * @dst:     destination entry holding the limiter state (rate_tokens, rate_last).
 * @timeout: cost of one message, in the same units as jiffies.
 *
 * Credits the jiffies elapsed since the last call as tokens, capped at
 * XRLIM_BURST_FACTOR * timeout, then spends @timeout tokens if available.
 * Returns 1 if the message may be sent, 0 otherwise.
 */
int xrlim_allow(struct dst_entry *dst, int timeout)
{
    unsigned long stamp = jiffies;

    /* Credit the time that has passed and remember when we did so. */
    dst->rate_tokens += stamp - dst->rate_last;
    dst->rate_last = stamp;

    /* Clamp the balance to the burst ceiling. */
    if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
        dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;

    /* Not enough tokens for one message: do not send. */
    if (dst->rate_tokens < timeout)
        return 0;

    dst->rate_tokens -= timeout;
    return 1;
}
    发送icmp应答函数
/*
 * icmp_push_reply - build the reply on the ICMP socket's write queue and send it.
 * @icmp_param: reply descriptor (header template, payload source, lengths).
 * @ipc:        IP control data (address, options) for the reply.
 * @rt:         route to send on.
 *
 * If queuing the data fails, the pending frames are flushed. Otherwise the
 * ICMP checksum is computed over all queued buffers plus the header and
 * the frames are pushed out.
 */
static void icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt)
{
    struct sk_buff *head;
    struct sk_buff *cur;
    struct icmphdr *hdr;
    __wsum sum = 0;

    /* Queue the payload (via icmp_glue_bits) on the socket's write queue. */
    if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len,
                icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) {
        ip_flush_pending_frames(icmp_socket->sk); /* appending failed */
        return;
    }

    /* First queued buffer: it carries the ICMP header. */
    head = skb_peek(&icmp_socket->sk->sk_write_queue);
    if (head == NULL)
        return;

    hdr = icmp_hdr(head);

    /* Accumulate the partial checksums of all queued buffers. */
    skb_queue_walk(&icmp_socket->sk->sk_write_queue, cur) {
        sum = csum_add(sum, cur->csum);
    }

    /* Copy the header template into place while adding it to the sum. */
    sum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)hdr, icmp_param->head_len, sum);
    hdr->checksum = csum_fold(sum);
    head->ip_summed = CHECKSUM_NONE;

    ip_push_pending_frames(icmp_socket->sk); /* transmit the queued buffers */
}
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,  struct sk_buff *skb)
{
    struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
    __wsum csum;
    //拷贝数据
    csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len, 0);
    //添加所有icmp_param->skb的校验和到地一个skb中
    skb->csum = csum_block_add(skb->csum, csum, odd);
    if (icmp_pointers[icmp_param->data.icmph.type].error)
        nf_ct_attach(skb, icmp_param->skb);
    return 0;
}
    拷贝数据到ip数据负载部分,如果需要将所有碎片链入到sk->sk_write_queue队列中
/*
 * ip_append_data - copy data into the IP payload area of skbs queued on
 * sk->sk_write_queue, starting a new skb per fragment when required.
 * @sk:          socket whose write queue collects the pending fragments.
 * @getfrag:     callback that copies len bytes at the given offset from @from into the buffer.
 * @from:        opaque source handed to @getfrag.
 * @length:      number of bytes to append (includes @transhdrlen).
 * @transhdrlen: transport header length; non-zero means this is the first fragment.
 * @ipc:         IP control data; its options are saved in the cork state on the first call.
 * @rt:          route to use; saved in the cork state on the first call.
 * @flags:       MSG_* flags (MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined here).
 *
 * Returns 0 on success or a negative errno.
 */
int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable *rt, unsigned int flags)
{
    struct inet_sock *inet = inet_sk(sk);
    struct sk_buff *skb;

    struct ip_options *opt = NULL;
    int hh_len;
    int exthdrlen;
    int mtu;
    int copy;
    int err;
    int offset = 0;
    unsigned int maxfraglen, fragheaderlen;
    int csummode = CHECKSUM_NONE;

    if (flags & MSG_PROBE)
        return 0;
    if (skb_queue_empty(&sk->sk_write_queue)) { /* write queue is empty: set up the cork state */
        opt = ipc->opt;
        if (opt) { /* IP options were supplied */
            if (inet->cork.opt == NULL) { /* no saved-options buffer yet: allocate struct ip_options plus 40 bytes */
                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                if (unlikely(inet->cork.opt == NULL))
                    return -ENOBUFS;
            }
            /* save a copy of the caller's IP options */
            memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
            inet->cork.flags |= IPCORK_OPT;
            inet->cork.addr = ipc->addr; /* remember the address from the control data */
        }
        /* With IP_PMTUDISC_PROBE use the output device's MTU, otherwise dst_mtu() of the path; this becomes the fragment size. */
        inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
        /*
         * Save the route.
         * NOTE(review): no reference is taken on rt in this excerpt, while
         * icmp_reply() calls ip_rt_put(rt) after sending - confirm against
         * the kernel source whether a dst_hold() was elided here.
         */
        inet->cork.rt = rt;
        inet->cork.length = 0;
        sk->sk_sndmsg_page = NULL;
        sk->sk_sndmsg_off = 0;
        if ((exthdrlen = rt->u.dst.header_len) != 0) { /* the route requires extra header space */
            length += exthdrlen;
            transhdrlen += exthdrlen;
        }
    } else { /* queue not empty: continue with the state saved in the cork */
        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
            opt = inet->cork.opt;

        transhdrlen = 0;
        exthdrlen = 0;
        mtu = inet->cork.fragsize;
    }
    hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); /* room to reserve for the link-layer header */

    fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); /* IP header length of each fragment (including options) */
    maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; /* maximum fragment length: payload rounded down to a multiple of 8 */

    if (inet->cork.length + length > 0xFFFF - fragheaderlen) { /* total would exceed the largest IP datagram (0xFFFF minus header and options) */
        ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
        return -EMSGSIZE;
    }
    /* transhdrlen > 0 means that this is the first fragment and we wish it won't be fragmented in the future. */
    if (transhdrlen && length + fragheaderlen <= mtu && rt->u.dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen)
        csummode = CHECKSUM_PARTIAL;
    inet->cork.length += length; /* account for the new data */
    /* length > mtu, the protocol is UDP and the device advertises NETIF_F_UFO */
    if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && (rt->u.dst.dev->features & NETIF_F_UFO)) {
        err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags);
        if (err)
            goto error;

        return 0;
    }
    if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) /* queue is empty: start with a fresh skb */
        goto alloc_new_skb;

    while (length > 0) {
        /* Check if the remaining data fits into current packet. */
        /* NOTE(review): the original annotator suggests using maxfraglen instead of mtu here, which would make fraggap unnecessary - unverified. */
        copy = mtu - skb->len;
        if (copy < length)
            copy = maxfraglen - skb->len;
        if (copy <= 0) {
            char *data;
            unsigned int datalen;
            unsigned int fraglen;
            unsigned int fraggap;
            unsigned int alloclen;
            struct sk_buff *skb_prev;
alloc_new_skb:

            skb_prev = skb;
            if (skb_prev)
                fraggap = skb_prev->len - maxfraglen;
            else
                fraggap = 0;

            /* If remaining data exceeds the mtu, we know we need more fragment(s). */
            datalen = length + fraggap;

            /* NOTE(review): same annotator suggestion as above (maxfraglen instead of mtu) - unverified. */
            if (datalen > mtu - fragheaderlen) /* more data than one fragment can carry */
                datalen = maxfraglen - fragheaderlen; /* clamp to the per-fragment payload size */


            fraglen = datalen + fragheaderlen; /* full length of this fragment */
            if ((flags & MSG_MORE) && !(rt->u.dst.dev->features & NETIF_F_SG))
                alloclen = mtu;
            else
                alloclen = datalen + fragheaderlen;

            /*
             * The last fragment gets additional space at tail. Note, with MSG_MORE we overallocate on fragments,
             * because we have no idea what fragment will be the last.
             */
            if (datalen == length + fraggap) /* this is the last fragment: add the route's trailer length */
                alloclen += rt->u.dst.trailer_len;

            if (transhdrlen) { /* first fragment (transport header present) */
                /* allocate with room for the link-layer header (hh_len) */
                skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err);
            } else {
                skb = NULL;
                if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf)
                    skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation);

                if (unlikely(skb == NULL))
                    err = -ENOBUFS;
            }
            if (skb == NULL) /* allocation failed */
                goto error;

            /* Fill in the control structures */
            skb->ip_summed = csummode;
            skb->csum = 0;
            skb_reserve(skb, hh_len); /* reserve hh_len bytes of headroom for the link-layer header */

            /*Find where to start putting bytes. */
            data = skb_put(skb, fraglen); /* extend the skb by fraglen bytes; data points at the start of that area */
            skb_set_network_header(skb, exthdrlen); /* network header is placed after any extra header */
            /* transport header follows the network header */
            skb->transport_header = (skb->network_header + fragheaderlen); /* fragheaderlen may include IP options */
            data += fragheaderlen; /* data now points at the transport header position */

            if (fraggap) { /* move the excess tail bytes of the previous skb to data + transhdrlen in the new one */
                skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0);
                skb_prev->csum = csum_sub(skb_prev->csum, skb->csum);
                data += fraggap; /* advance past the moved bytes */
                pskb_trim_unique(skb_prev, maxfraglen); /* shrink the previous skb to maxfraglen */

            }
            /* datalen covers the transport header and the data */
            copy = datalen - transhdrlen - fraggap; /* bytes to copy from the caller */
            /* copy the data that follows the transport header from "from" to data + transhdrlen */
            if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                err = -EFAULT;
                kfree_skb(skb);
                goto error;
            }
            offset += copy; /* advance the source offset */
            length -= datalen - fraggap; /* remaining length; the amount subtracted includes the transport header */
            transhdrlen = 0;
            exthdrlen = 0;
            csummode = CHECKSUM_NONE;

            /* Put the packet on the pending queue. */
            __skb_queue_tail(&sk->sk_write_queue, skb); /* append to the write queue */
            continue;
        }
        if (copy > length)
            copy = length;
        if (!(rt->u.dst.dev->features & NETIF_F_SG)) { /* device lacks NETIF_F_SG: copy into the skb's linear area */
            unsigned int off;
            off = skb->len;
            if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) {
                __skb_trim(skb, off);

                err = -EFAULT;
                goto error;
            }
        } else { /* NETIF_F_SG: append the data to page fragments */
            int i = skb_shinfo(skb)->nr_frags;
            skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
            struct page *page = sk->sk_sndmsg_page;
            int off = sk->sk_sndmsg_off;
            unsigned int left;

            if (page && (left = PAGE_SIZE - off) > 0) {
                if (copy >= left)
                    copy = left;

                if (page != frag->page) {
                    if (i == MAX_SKB_FRAGS) {
                        err = -EMSGSIZE;
                        goto error;
                    }
                    get_page(page);
                    skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                    frag = &skb_shinfo(skb)->frags[i];
                }

            } else if (i < MAX_SKB_FRAGS) {
                if (copy > PAGE_SIZE)
                    copy = PAGE_SIZE;

                page = alloc_pages(sk->sk_allocation, 0);
                if (page == NULL)  {
                    err = -ENOMEM;
                    goto error;
                }
                sk->sk_sndmsg_page = page;
                sk->sk_sndmsg_off = 0;

                skb_fill_page_desc(skb, i, page, 0, 0);
                frag = &skb_shinfo(skb)->frags[i];

            } else {
                err = -EMSGSIZE;
                goto error;
            }
            if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                err = -EFAULT;
                goto error;
            }
            /* account for the bytes just added to the fragment */
            sk->sk_sndmsg_off += copy;
            frag->size += copy;
            skb->len += copy;
            skb->data_len += copy;
            skb->truesize += copy;
            atomic_add(copy, &sk->sk_wmem_alloc);
        }
        offset += copy;
        length -= copy;
    }
    return 0;
error:
    /* undo the length accounting for the part that was not appended */
    inet->cork.length -= length;
    IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
    return err;
}
    ip_append_data函数失败时就会调用这个函数释放所有skb
/*
 * ip_flush_pending_frames - discard everything queued on the socket's write queue.
 * @sk: socket whose pending frames are dropped.
 *
 * Frees each queued skb, taken from the tail, then calls ip_cork_release().
 */
void ip_flush_pending_frames(struct sock *sk)
{
    struct sk_buff *pending = __skb_dequeue_tail(&sk->sk_write_queue);

    while (pending != NULL) {
        kfree_skb(pending);
        pending = __skb_dequeue_tail(&sk->sk_write_queue);
    }

    ip_cork_release(inet_sk(sk));
}
    icmp_push_reply-> 取出队列中的skb,然后添加完整的ip头然后发送出去
/*
 * ip_push_pending_frames - combine the skbs queued on sk->sk_write_queue
 * into one packet, fill in the IP header and send it.
 * @sk: socket whose pending frames are sent.
 *
 * The first queued skb becomes the head; the rest are chained on its
 * frag_list. Returns 0 on success or an error code; ip_cork_release() is
 * called in every case.
 */
int ip_push_pending_frames(struct sock *sk)
{
    struct sk_buff *skb, *tmp_skb;
    struct sk_buff **tail_skb;
    struct inet_sock *inet = inet_sk(sk);
    struct ip_options *opt = NULL;
    struct rtable *rt = inet->cork.rt;
    struct iphdr *iph;
    __be16 df = 0;
    __u8 ttl;
    int err = 0;

    if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) /* take the first skb; nothing to do if the queue is empty */
        goto out;
    tail_skb = &(skb_shinfo(skb)->frag_list); /* points at the head of the fragment list */

    /* move skb->data to ip header from ext header */
    if (skb->data < skb_network_header(skb))
        __skb_pull(skb, skb_network_offset(skb)); /* advance data to the IP header */

    while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { /* dequeue every remaining skb */
        __skb_pull(tmp_skb, skb_network_header_len(skb)); /* advance data past the network header */
        *tail_skb = tmp_skb; /* on the first iteration this sets skb_shinfo(skb)->frag_list = tmp_skb */
        tail_skb = &(tmp_skb->next); /* next link to fill is tmp_skb->next */
        /* add this skb's sizes to the head skb */
        skb->len += tmp_skb->len;
        skb->data_len += tmp_skb->len;
        skb->truesize += tmp_skb->truesize;
        __sock_put(tmp_skb->sk); /* drop the socket reference held by this skb */
        tmp_skb->destructor = NULL;
        tmp_skb->sk = NULL;
    }
    /* At this point every skb that was on sk->sk_write_queue (all fragments) is chained on the first skb's frag_list. */

    /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow to fragment the frame generated here.
     * No matter, what transforms how transforms change size of the packet, it will come out. */
    if (inet->pmtudisc < IP_PMTUDISC_DO)
        skb->local_df = 1; /* allow this frame to be fragmented locally */
    /* DF bit is set when we want to see DF on outgoing frames.
     * If local_df is set too, we still allow to fragment this frame locally. */
    if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst)))
        df = htons(IP_DF); /* set the Don't Fragment flag in the header */
    if (inet->cork.flags & IPCORK_OPT) /* IP options were saved in the cork state */
        opt = inet->cork.opt;

    if (rt->rt_type == RTN_MULTICAST) /* multicast route: use the multicast TTL */
        ttl = inet->mc_ttl;
    else
        ttl = ip_select_ttl(inet, &rt->u.dst); /* otherwise the TTL is selected by ip_select_ttl() */

    iph = (struct iphdr *)skb->data; /* build the IP header in the first skb */
    iph->version = 4;
    iph->ihl = 5;
    if (opt) {
        iph->ihl += opt->optlen>>2;
        ip_options_build(skb, opt, inet->cork.addr, rt, 0);
    }
    iph->tos = inet->tos;
    iph->tot_len = htons(skb->len);
    iph->frag_off = df;
    ip_select_ident(iph, &rt->u.dst, sk); /* choose the IP identification value */
    iph->ttl = ttl;
    iph->protocol = sk->sk_protocol;
    iph->saddr = rt->rt_src;
    iph->daddr = rt->rt_dst;
    ip_send_check(iph); /* header checksum */

    skb->priority = sk->sk_priority;
    skb->dst = dst_clone(&rt->u.dst);

    if (iph->protocol == IPPROTO_ICMP)
        icmp_out_count(((struct icmphdr *)skb_transport_header(skb))->type); /* update outgoing ICMP statistics */

    /* pass the skb through the netfilter LOCAL_OUT hook, continuing with dst_output */
    err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
    if (err) {
        if (err > 0)
            err = inet->recverr ? net_xmit_errno(err) : 0;

        if (err)
            goto error;
    }
out:
    ip_cork_release(inet);
    return err;
error:
    IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
    goto out;
}
到这里需要简单说明一下:我们看的其实是icmp回显请求相关的流程,其中的ip分片在一般情况下不会发生,
但这些函数是ip层通用的发送函数,所以看起来十分复杂。

    icmp时间戳请求处理
static void icmp_timestamp(struct sk_buff *skb)
{
    struct timeval tv;
    struct icmp_bxm icmp_param;

    if (skb->len < 4) //长度不对
        goto out_err;
    /* Fill in the current time as ms since midnight UT: */
    do_gettimeofday(&tv); //获取当前时间
    icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
    icmp_param.data.times[2] = icmp_param.data.times[1];
    //拷贝skb中的数据到 times[0]中
    if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
        BUG();

    icmp_param.data.icmph      = *icmp_hdr(skb);
    icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; //时间截应答
    icmp_param.data.icmph.code = 0;
    icmp_param.skb             = skb;
    icmp_param.offset          = 0;
    icmp_param.data_len        = 0;
    icmp_param.head_len        = sizeof(struct icmphdr) + 12;
    icmp_reply(&icmp_param, skb);
out:
    return;
out_err:
    ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
    goto out;
}
    地址掩码请求,linux没有实现它,参考内核中这函数的注释
/*
 * icmp_address - handler registered for ICMP_ADDRESS (address mask request).
 * @skb: the received request (unused).
 *
 * No reply is generated: the only statement is a debug printk that is
 * compiled out with #if 0.
 */
static void icmp_address(struct sk_buff *skb)
{
#if 0
    if (net_ratelimit())
        printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
#endif
}
    地址掩码应答处理
/*
 * icmp_address_reply - handler registered for ICMP_ADDRESSREPLY.
 * @skb: the received address mask reply.
 *
 * Only logs: when the receiving device has addresses and both
 * IN_DEV_LOG_MARTIANS() and IN_DEV_FORWARD() are true for it, a
 * rate-limited message is printed if the advertised mask matches none of
 * the device's addresses.
 */
static void icmp_address_reply(struct sk_buff *skb)
{
    struct rtable *route = (struct rtable *)skb->dst; /* routing cache entry of the packet */
    struct net_device *dev = skb->dev;
    struct in_device *idev;

    /* Ignore replies that are too short or whose route lacks RTCF_DIRECTSRC. */
    if (skb->len < 4 || !(route->rt_flags & RTCF_DIRECTSRC))
        return;

    idev = in_dev_get(dev);
    if (!idev)
        return;

    rcu_read_lock();
    if (idev->ifa_list && IN_DEV_LOG_MARTIANS(idev) && IN_DEV_FORWARD(idev)) {
        struct in_ifaddr *ifa;
        __be32 _mask, *mp;

        /* Fetch the advertised mask from the start of the payload. */
        mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
        BUG_ON(mp == NULL);

        /* Look for an address with this mask that also matches the sender. */
        ifa = idev->ifa_list;
        while (ifa) {
            if (*mp == ifa->ifa_mask && inet_ifa_match(route->rt_src, ifa))
                break;
            ifa = ifa->ifa_next;
        }

        /* No address matched. */
        if (!ifa && net_ratelimit()) {
            printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from %s/%u.%u.%u.%u\n", 
                    NIPQUAD(*mp), dev->name, NIPQUAD(route->rt_src));
        }
    }
    rcu_read_unlock();
    in_dev_put(idev);
}
[/协议处理实现]

 

posted on 2013-08-28 10:30  SuperKing  阅读(2711)  评论(0)  编辑  收藏  举报

导航