ICMP 实现
以下代码取自 kernel-2.6.24 .
[数据结构]
struct icmp_control {
	void (*handler)(struct sk_buff *skb);	/* handler called for this ICMP type (table is indexed by the type field) */
	short error;	/* This ICMP is classed as an error message */
};

static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];	/* one entry per ICMP type */
[/数据结构]
[初始化]
文件net/ipv4/af_inet.c中,函数
static int __init inet_init(void)
{
	......
	/* Register the ICMP receive handler (see the protocol handling section below). */
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
	......
	icmp_init(&inet_family_ops);	/* ICMP protocol initialisation */
	......
}
icmp初始化函数
static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;	/* per-CPU variable */

/*
 * Create one kernel-side raw ICMP socket for every possible CPU and
 * configure it for sending only. Panics if a socket cannot be created.
 */
void __init icmp_init(struct net_proto_family *ops)
{
	struct inet_sock *inet;
	int i;

	for_each_possible_cpu(i) {	/* iterate over every possible CPU */
		int err;

		/*
		 * Create this CPU's socket instance (the original notes say
		 * this ends up in __sock_create()).
		 */
		err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, &per_cpu(__icmp_socket, i));
		if (err < 0)
			panic("Failed to create the ICMP control socket.\n");
		per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;	/* allocate memory with GFP_ATOMIC */
		/* Enough space for 2 * 64K ICMP packets, including sk_buff struct overhead. */
		per_cpu(__icmp_socket, i)->sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff)));	/* send buffer size */
		/*
		 * Get the inet_sock view of the socket. Per the original
		 * notes, the sock was allocated with the size of an
		 * inet_sock -- not visible in this excerpt.
		 */
		inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
		inet->uc_ttl = -1;
		inet->pmtudisc = IP_PMTUDISC_DONT;
		/* Unhash it so that IP input processing does not even see it, we do not wish this socket to see incoming packets.
		 */
		/* Incoming packets never see these sockets. */
		per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
	}
}
[/初始化]
[协议处理实现]
注册的协议处理函数,当ip向上递交数据包时,如果发现是icmp协议就会调用这个函数。
static struct net_protocol icmp_protocol = {
	.handler = icmp_rcv,
};
处理进入的icmp包
/*
 * Receive an incoming ICMP packet: verify the checksum, validate the type,
 * filter broadcast/multicast traffic, then dispatch to the per-type handler
 * from icmp_pointers[]. The skb is always freed here; always returns 0.
 */
int icmp_rcv(struct sk_buff *skb)
{
	struct icmphdr *icmph;
	struct rtable *rt = (struct rtable *)skb->dst;	/* route cache entry attached to the skb */

	ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);

	switch (skb->ip_summed) {	/* checksum state recorded on the skb */
	case CHECKSUM_COMPLETE:
		if (!csum_fold(skb->csum))	/* checksum check without a pseudo-header */
			break;
		/* fall through */
	case CHECKSUM_NONE:
		skb->csum = 0;
		if (__skb_checksum_complete(skb))	/* checksum over the whole packet */
			goto error;
	}

	/* Is there room for an ICMP header? If so, advance data past it. */
	if (!pskb_pull(skb, sizeof(struct icmphdr)))
		goto error;

	icmph = icmp_hdr(skb);	/* ICMP header */

	ICMPMSGIN_INC_STATS_BH(icmph->type);
	/*
	 * 18 is the highest 'known' ICMP type. Anything else is a mystery
	 * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently discarded.
	 */
	if (icmph->type > NR_ICMP_TYPES)
		goto error;

	/* The packet was addressed to a local multicast or broadcast address. */
	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently ignored (we let user decide with a sysctl).
		 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded if to broadcast/multicast.*/
		if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && sysctl_icmp_echo_ignore_broadcasts) {
			goto error;
		}
		/*
		 * Apart from echo, timestamp, and address mask request/reply,
		 * every ICMP packet sent to broadcast/multicast is dropped.
		 */
		if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) {
			goto error;
		}
	}

	icmp_pointers[icmph->type].handler(skb);	/* dispatch on the ICMP type */

drop:
	kfree_skb(skb);	/* done with the skb: free it */
	return 0;
error:
	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
	goto drop;
}
类型处理函数在内核中被静态的初始化.
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { .handler = icmp_discard, //空函数 }, [1] = { .handler = icmp_discard, .error = 1, }, [2] = { .handler = icmp_discard, .error = 1, }, [ICMP_DEST_UNREACH] = { .handler = icmp_unreach, .error = 1, }, [ICMP_SOURCE_QUENCH] = { .handler = icmp_unreach, .error = 1, }, [ICMP_REDIRECT] = { .handler = icmp_redirect, .error = 1, }, [6] = { .handler = icmp_discard, .error = 1, }, [7] = { .handler = icmp_discard, .error = 1, }, [ICMP_ECHO] = { .handler = icmp_echo, }, [9] = { .handler = icmp_discard, .error = 1, }, [10] = { .handler = icmp_discard, .error = 1, }, [ICMP_TIME_EXCEEDED] = { .handler = icmp_unreach, .error = 1, }, [ICMP_PARAMETERPROB] = { .handler = icmp_unreach, .error = 1, }, [ICMP_TIMESTAMP] = { .handler = icmp_timestamp, }, [ICMP_TIMESTAMPREPLY] = { .handler = icmp_discard, }, [ICMP_INFO_REQUEST] = { .handler = icmp_discard, }, [ICMP_INFO_REPLY] = { .handler = icmp_discard, }, [ICMP_ADDRESS] = { .handler = icmp_address, }, [ICMP_ADDRESSREPLY] = { .handler = icmp_address_reply, }, }; 我们一个一个看。 icmp接收到不可达包的处理,不可达包括ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. static void icmp_unreach(struct sk_buff *skb) { struct iphdr *iph; struct icmphdr *icmph; int hash, protocol; struct net_protocol *ipprot; struct sock *raw_sk; u32 info = 0; //数据部分包括了携带的ip头吗 if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out_err; icmph = icmp_hdr(skb); //icmp头 iph = (struct iphdr *)skb->data; //携带的ip头 //ip头损坏 if (iph->ihl < 5) /* Mangled header, drop. 
*/ goto out_err; if (icmph->type == ICMP_DEST_UNREACH) { //icmp类型是目的不可达 switch (icmph->code & 15) { //错误码标识 case ICMP_NET_UNREACH: //网络 case ICMP_HOST_UNREACH: //主机 case ICMP_PROT_UNREACH: //协议 case ICMP_PORT_UNREACH: //端口 break; //不可达 case ICMP_FRAG_NEEDED: //需要分片 if (ipv4_config.no_pmtu_disc) { LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: fragmentation needed and DF set.\n", NIPQUAD(iph->daddr)); } else { //在到那个目的地址的路由缓存中保存mtu的大小,在发送数据时就会根据这个mtu大小进行分片 info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); if (!info) goto out; } case ICMP_SR_FAILED: LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source Route Failed.\n", NIPQUAD(iph->daddr)); break; default: break; } if (icmph->code > NR_ICMP_UNREACH) //超过限制,错误的的不可达码 goto out; } else if (icmph->type == ICMP_PARAMETERPROB) info = ntohl(icmph->un.gateway) >> 24; //一些路由器会发送应答到广播地址,可能是用户工具引起的问题 if (!sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP type %u, code %u " "error to a broadcast: %u.%u.%u.%u on %s\n", NIPQUAD(ip_hdr(skb)->saddr), icmph->type, icmph->code, NIPQUAD(iph->daddr), skb->dev->name); goto out; } /* Checkin full IP header plus 8 bytes of protocol to avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) //ip头加8字节的协议 goto out; iph = (struct iphdr *)skb->data; protocol = iph->protocol; //获取协议 hash = protocol & (MAX_INET_PROTOS - 1); //递交icmp信息到 raw socket, why ?????? 
read_lock(&raw_v4_lock); if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, iph->saddr, skb->dev->ifindex)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); iph = (struct iphdr *)skb->data; } } read_unlock(&raw_v4_lock); rcu_read_lock(); ipprot = rcu_dereference(inet_protos[hash]); //根据协议查找协议处理结构 if (ipprot && ipprot->err_handler) //如果有,调用相关的协议错误处理函数处理这个icmp不可达包 ipprot->err_handler(skb, info); rcu_read_unlock(); out: return; out_err: ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); goto out; } icmp重定向处理 static void icmp_redirect(struct sk_buff *skb) { struct iphdr *iph; if (skb->len < sizeof(struct iphdr)) 长度检测 goto out_err; /* Get the copied header of the packet that caused the redirect */ if (!pskb_may_pull(skb, sizeof(struct iphdr))) //ip头长度检测 goto out; iph = (struct iphdr *)skb->data; //取出ip头 switch (icmp_hdr(skb)->code & 7) { //编码 case ICMP_REDIR_NET: //网络重定向 case ICMP_REDIR_NETTOS: /* As per RFC recommendations now handle it as a host redirect.*/ case ICMP_REDIR_HOST: //主机重定向 case ICMP_REDIR_HOSTTOS: //在路由告诉缓存中,更新相同缓存项的rt_gateway字段 ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, icmp_hdr(skb)->un.gateway, iph->saddr, skb->dev); break; } out: return; out_err: ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); goto out; } icmp回显请求 static void icmp_echo(struct sk_buff *skb) { if (!sysctl_icmp_echo_ignore_all) { //是否忽略回显请求 struct icmp_bxm icmp_param; //保存一些icmp内容 icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_ECHOREPLY; icmp_param.skb = skb; icmp_param.offset = 0; icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); } } static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct sock *sk = icmp_socket->sk; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct rtable *rt = (struct rtable *)skb->dst; //路由缓存 __be32 daddr; //解析其中的ip选项 if (ip_options_echo(&icmp_param->replyopts, skb)) return; 
if (icmp_xmit_lock()) //是否可以锁定这个cpu上的icmp_socket. return; icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; //目的地址 ipc.opt = NULL; if (icmp_param->replyopts.optlen) { //有ip选项 ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; } { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) //路由查找,如果没找到那么什么也不发送了 goto out_unlock; } //是否立即发送应答 if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) icmp_push_reply(icmp_param, &ipc, rt); //发送应答 ip_rt_put(rt); out_unlock: icmp_xmit_unlock(); } 判断应答是否发送 static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) { struct dst_entry *dst = &rt->u.dst; int rc = 1; if (type > NR_ICMP_TYPES) //类型超过范围, 这应该是个bug,需要添加 rc = 0 goto out; /* Don't limit PMTU discovery. */ //这两个类型不做限制 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) goto out; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags & IFF_LOOPBACK)) //回环设备也不限制 goto out; /* Limit if icmp type is enabled in ratemask. 
*/ if ((1 << type) & sysctl_icmp_ratemask) //用户通过/proc配置了限制速度的icmp类型掩码 rc = xrlim_allow(dst, sysctl_icmp_ratelimit); out: return rc; } #define XRLIM_BURST_FACTOR 6 int xrlim_allow(struct dst_entry *dst, int timeout) { unsigned long now; int rc = 0; //不发送 now = jiffies; dst->rate_tokens += now - dst->rate_last; //累加过去的时间 dst->rate_last = now; //最后使用时间 if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) //累加时间超过指定的范围 dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; //设为最大值 if (dst->rate_tokens >= timeout) { //超过用户配置的时间限制 dst->rate_tokens -= timeout; //递减配置的时间限制 rc = 1; //发送 } return rc; } 发送icmp应答函数 static void icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt) { struct sk_buff *skb; //分配skb拷贝接收的skb数据到新分配的skb内存中,新skb被链入到icmp_socket->sk->sk_write_queue中. if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) ip_flush_pending_frames(icmp_socket->sk); //拷贝失败 else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { //提取分配的skb struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; //计算校验和 skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } csum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)icmph, icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(icmp_socket->sk); //发送队列中的skb } } static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; __wsum csum; //拷贝数据 csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len, 0); //添加所有icmp_param->skb的校验和到地一个skb中 skb->csum = csum_block_add(skb->csum, csum, odd); if (icmp_pointers[icmp_param->data.icmph.type].error) nf_ct_attach(skb, icmp_param->skb); return 0; } 拷贝数据到ip数据负载部分,如果需要将所有碎片链入到sk->sk_write_queue队列中 int 
ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct ip_options *opt = NULL; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; if (flags & MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { //写队列为空 opt = ipc->opt; if (opt) { //有ip选项 if (inet->cork.opt == NULL) { //inet socket中ip选项指针为空,分配一个ip选项+ip最长头空间 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); if (unlikely(inet->cork.opt == NULL)) return -ENOBUFS; } //拷贝icmp中携带的ip选项 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); inet->cork.flags |= IPCORK_OPT; inet->cork.addr = ipc->addr; //记录发送这个icmp的地址 } //IP_PMTUDISC_PROBE 表示忽略对方的mtu, 如果忽略使用本地设备的mtu,设置分片大小 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); inet->cork.rt = rt; //保存路由 inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; if ((exthdrlen = rt->u.dst.header_len) != 0) { //需要额外的头长度 length += exthdrlen; transhdrlen += exthdrlen; } } else { //队列不为空,用保存好的数据初始化一些变量 rt = inet->cork.rt; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; transhdrlen = 0; exthdrlen = 0; mtu = inet->cork.fragsize; } hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); //足够的硬件头空间 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); //每个碎片的ip头长度 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; //每个碎片的最大长度 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { //发送来的数据长度超过了允许的最大ip数据长度(65535 - ip头 + ip选项) ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); return -EMSGSIZE; } /* transhdrlen > 0 means that this is the first fragment and we wish it won't be fragmented in the future. 
*/ if (transhdrlen && length + fragheaderlen <= mtu && rt->u.dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) csummode = CHECKSUM_PARTIAL; inet->cork.length += length; //累加这个长度 //长度 > mtu ,协议是 udp,且网卡设备支持GSO分片 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && (rt->u.dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); if (err) goto error; return 0; } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) //队列为空 goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = mtu - skb->len;////这的mtu,我认为应该为maxfraglen, 这样就不用 fraggap变量和相关的操作了 if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; if (skb_prev) fraggap = skb_prev->len - maxfraglen; else fraggap = 0; /* If remaining data exceeds the mtu, we know we need more fragment(s). */ datalen = length + fraggap; //这的mtu,我认为应该为maxfraglen, 这样就不用 fraggap变量和相关的操作了 if (datalen > mtu - fragheaderlen) //数据长度超过mtu - ip头长度,需要分片 datalen = maxfraglen - fragheaderlen; //设置成合适的长度 fraglen = datalen + fragheaderlen; //一个碎片的完整长度 if ((flags & MSG_MORE) && !(rt->u.dst.dev->features & NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; /* The last fragment gets additional space at tail. Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be the last. 
*/ if (datalen == length + fraggap) //最后一个分片将添加额外的长度 alloclen += rt->u.dst.trailer_len; if (transhdrlen) { //指定了传输层头长度 //分配内存hh_len是硬件地址长度 skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf) skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; } if (skb == NULL) //分配失败 goto error; /* Fill in the control structures */ skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); //保留出硬件地址空间 data和tail向后移动 hh_len /*Find where to start putting bytes. */ data = skb_put(skb, fraglen); //返回data移动tail和增加len skb_set_network_header(skb, exthdrlen);//如果有额外头,移动网络头位置 //传输层头在网络头后面 skb->transport_header = (skb->network_header + fragheaderlen);//fragheaderlen 可能包括ip选项长度 data += fragheaderlen; //data指向传输层头位置 if (fraggap) { //把上一个skb最后几个没有对齐的字节拷贝到这新包的 data + transhdrlen位置 skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; //移动指针 pskb_trim_unique(skb_prev, maxfraglen); //修改上一个skb的数据长度,进行缩小 } //datalen包括传输层头和数据 copy = datalen - transhdrlen - fraggap;//要拷贝的数据长度 //从from拷贝一些传输层头后面的数据到data+transhdrlen的位置 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } offset += copy; //偏移累加 length -= datalen - fraggap; //长度递减,包含传输层头长度 transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; /* Put the packet on the pending queue. 
*/ __skb_queue_tail(&sk->sk_write_queue, skb); //链入队列 continue; } if (copy > length) copy = length; if (!(rt->u.dst.dev->features & NETIF_F_SG)) { //设备不支持SG unsigned int off; off = skb->len; if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else { //按SG分页处理 int i = skb_shinfo(skb)->nr_frags; skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; struct page *page = sk->sk_sndmsg_page; int off = sk->sk_sndmsg_off; unsigned int left; if (page && (left = PAGE_SIZE - off) > 0) { if (copy >= left) copy = left; if (page != frag->page) { if (i == MAX_SKB_FRAGS) { err = -EMSGSIZE; goto error; } get_page(page); skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); frag = &skb_shinfo(skb)->frags[i]; } } else if (i < MAX_SKB_FRAGS) { if (copy > PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk->sk_allocation, 0); if (page == NULL) { err = -ENOMEM; goto error; } sk->sk_sndmsg_page = page; sk->sk_sndmsg_off = 0; skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; } else { err = -EMSGSIZE; goto error; } if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { err = -EFAULT; goto error; } sk->sk_sndmsg_off += copy; frag->size += copy; skb->len += copy; skb->data_len += copy; skb->truesize += copy; atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; } return 0; error: inet->cork.length -= length; IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); return err; } ip_append_data函数失败就会调用这个函数十分所有skb void ip_flush_pending_frames(struct sock *sk) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); ip_cork_release(inet_sk(sk)); } icmp_push_reply-> 取出队列中的skb,然后添加完整的ip头然后发送出去 int ip_push_pending_frames(struct sock *sk) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr 
*iph; __be16 df = 0; __u8 ttl; int err = 0; if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) //取出一个skb goto out; tail_skb = &(skb_shinfo(skb)->frag_list); //指向分片连表头 /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); //移动data指针到ip头位置 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { //循环出队所有skb __skb_pull(tmp_skb, skb_network_header_len(skb)); //移动data到传输层头位置 *tail_skb = tmp_skb; //当执行第一次时等于是(skb_shinfo(skb)->frag_list) = tmp_skb tail_skb = &(tmp_skb->next); //指向了tmp_skb的next //累加这个包的长度 skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; __sock_put(tmp_skb->sk); //递减sock的引用计数 tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } //到这就是把所有在sk->sk_write_queue中的skb(所有分片)组合到第一个skb的skb_shinfo(skb)->frag_list连表中了。 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow to fragment the frame generated here. * No matter, what transforms how transforms change size of the packet, it will come out. */ if (inet->pmtudisc < IP_PMTUDISC_DO) skb->local_df = 1; //不分片 /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame locally. 
*/ if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); //设置不分片标志 if (inet->cork.flags & IPCORK_OPT) //有ip选项 opt = inet->cork.opt; if (rt->rt_type == RTN_MULTICAST) //多播ttl ttl = inet->mc_ttl; else ttl = ip_select_ttl(inet, &rt->u.dst); //单播,需要计算 iph = (struct iphdr *)skb->data; //在第一个skb中添加ip头 iph->version = 4; iph->ihl = 5; if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; iph->tot_len = htons(skb->len); iph->frag_off = df; ip_select_ident(iph, &rt->u.dst, sk); //选择一个ip标识 iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; ip_send_check(iph); //校验和 skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(((struct icmphdr *)skb_transport_header(skb))->type); //更新一些统计信息 //发送这个skb到netfilter的LOCAL_OUT hook err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); if (err) { if (err > 0) err = inet->recverr ? 
net_xmit_errno(err) : 0; if (err) goto error; } out: ip_cork_release(inet); return err; error: IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); goto out; } 到这需要简单说一下,其实我们看的是icmp回显请求相关的流程,其中什么ip碎片应该就根本不会发生, 但一些函数在ip层使用所以有些看起来十分的复杂。 icmp时间截请求处理 static void icmp_timestamp(struct sk_buff *skb) { struct timeval tv; struct icmp_bxm icmp_param; if (skb->len < 4) //长度不对 goto out_err; /* Fill in the current time as ms since midnight UT: */ do_gettimeofday(&tv); //获取当前时间 icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); icmp_param.data.times[2] = icmp_param.data.times[1]; //拷贝skb中的数据到 times[0]中 if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; //时间截应答 icmp_param.data.icmph.code = 0; icmp_param.skb = skb; icmp_param.offset = 0; icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); out: return; out_err: ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); goto out; } 地址掩码请求,linux没有实现它,参考内核中这函数的注释 static void icmp_address(struct sk_buff *skb) { #if 0 if (net_ratelimit()) printk(KERN_DEBUG "a guy asks for address mask. 
Who is it?\n"); #endif } 地址掩码应答处理 static void icmp_address_reply(struct sk_buff *skb) { struct rtable *rt = (struct rtable *)skb->dst; //路由缓存 struct net_device *dev = skb->dev; struct in_device *in_dev; struct in_ifaddr *ifa; //长度不对或没有标志重定向源地址 if (skb->len < 4 || !(rt->rt_flags & RTCF_DIRECTSRC)) goto out; in_dev = in_dev_get(dev); if (!in_dev) goto out; rcu_read_lock(); //设备有地址,打开调试项,设备允许转发 if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) { __be32 _mask, *mp; //取出掩码 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); BUG_ON(mp == NULL); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { //循环所有地址,如果掩码匹配且路由地址也匹配 if (*mp == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) break; } if (!ifa && net_ratelimit()) { //都不匹配 printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from %s/%u.%u.%u.%u\n", NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); } } rcu_read_unlock(); in_dev_put(in_dev); out:; } [/协议处理实现]
posted on 2013-08-28 10:30 SuperKing 阅读(2711) 评论(0) 编辑 收藏 举报