TSO-GSO reading
对 TCP,在网卡不支持 TSO 时,使用和不使用 GSO 的情形
TSO :
分析:IP层发包时,如果是GSO报文,会调用
ip_finish_output_gso
来处理
/*
 * Last step of IPv4 output before the neighbour layer: chooses between the
 * GSO path, IP fragmentation, and a direct hand-off to ip_finish_output2().
 * Returns whatever the selected output routine returns.
 */
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/*
		 * NOTE(review): the original note says only packets that went
		 * through the ip_forward path carry an xfrm dst - confirm.
		 */
		/* IPSKB_REROUTED influences the later GSO handling of this packet. */
		IPCB(skb)->flags |= IPSKB_REROUTED;
		/*
		 * Re-enter dst output; per the original note this ends up in
		 * xfrm4_output because SNAT changed the applicable policy.
		 */
		return dst_output_sk(sk, skb);
	}
#endif
	/* GSO packet: let the GSO-aware path decide whether to segment. */
	if (skb_is_gso(skb))
		return ip_finish_output_gso(sk, skb);

	/* Non-GSO packet larger than the dst MTU: IP-fragment it. */
	if (skb->len > ip_skb_dst_mtu(skb))
		return ip_fragment(sk, skb, ip_finish_output2);

	/* Fits as is: transmit directly. */
	return ip_finish_output2(sk, skb);
}
/*
 * Output a GSO skb at the IP layer.
 *
 * Unless the skb was forwarded AND its per-segment network length exceeds
 * the dst MTU, it is passed on unsegmented to ip_finish_output2().
 * Otherwise it is segmented in software here and every resulting segment
 * is sent through ip_fragment().
 *
 * Returns 0 on success, -ENOMEM if segmentation yields an error or NULL,
 * or the first error reported by ip_fragment().
 */
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

	/* common case: locally created skb or seglen is <= mtu */
	/*
	 * The first operand is true whenever IPSKB_FORWARDED is not set, so
	 * only forwarded packets can reach the slow path below.
	 */
	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
		return ip_finish_output2(sk, skb);

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	/* Fetch the offload features that apply to this skb. */
	features = netif_skb_features(skb);
	/* Segment in software: all NETIF_F_GSO_MASK bits are masked out. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb has been replaced by the segment list. */
	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

		/* Detach the segment, then IP-fragment it and send via ip_finish_output2. */
		segs->next = NULL;
		err = ip_fragment(sk, segs, ip_finish_output2);

		/* Remember only the first error; keep sending the remaining segments. */
		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}
所以:可知正常情况下,本地发出的GSO报文在IP层不需要进行GSO分段处理(未置位IPSKB_FORWARDED时直接走ip_finish_output2);
实际上本地发包的GSO分段都是延迟到网络设备发包时再处理;一般不会在IP层处理,而是在网络设备层处理---->(硬件不支持时)进行软件GSO
-
检测当前报文是GSO数据包,同时物理设备不支持此种GSO类型的分段,或者当前报文的ip_summed既不是CHECKSUM_PARTIAL也不是CHECKSUM_UNNECESSARY,则直接进入软件GSO逻辑处理
/*
 * True when @features covers the skb's gso_type and, if the skb has a
 * frag_list, @features also includes NETIF_F_FRAGLIST.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
	return net_gso_ok(features, skb_shinfo(skb)->gso_type) &&
	       (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
}

/*
 * Decide whether a GSO skb must be segmented in software.
 *
 * NOTE(review): per the original note, skb_is_gso() tests whether gso_size
 * in the skb's shared info is set - the helper is not shown here; confirm.
 * skb_gso_ok() checks whether the device features support this GSO packet
 * type (GSO comes in several flavours, e.g. UDP and TCP).
 *
 * Returns true when the skb is a GSO skb and either the features do not
 * support it, or ip_summed is neither CHECKSUM_PARTIAL nor
 * CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
				   netdev_features_t features)
{
	/*
	 * NOTE(review): the original note reads ip_summed != CHECKSUM_PARTIAL
	 * as "the checksum is computed in software" - confirm.
	 */
	return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
		unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
			 (skb->ip_summed != CHECKSUM_UNNECESSARY)));
}
/*
 * Prepare one skb for the driver: VLAN validation, then either software
 * GSO segmentation or linearization plus software checksum completion.
 *
 * Returns the skb (or the head of the segment list) ready for transmit,
 * or NULL after counting a drop in dev->tx_dropped.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		/*
		 * The skb is a GSO packet and either the device features do
		 * not support this GSO type, or ip_summed is neither
		 * CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY: segment it in
		 * software right here.
		 */
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			/* Segmentation produced a list: it replaces the original skb. */
			consume_skb(skb);
			skb = segs;
		}
	} else {
		/*
		 * Per the original note: if the packet consists of several
		 * frag_list pieces and the device cannot handle that, it has
		 * to be linearized, i.e. all pieces merged into one skb. If
		 * __skb_linearize() fails (for example no memory for the
		 * larger skb) the packet is dropped.
		 */
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		/*
		 * The packet asks for checksum offload (CHECKSUM_PARTIAL); if
		 * @features carries none of the NETIF_F_CSUM_MASK bits the
		 * checksum is completed in software by skb_checksum_help().
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}

/*
 * Excerpt of sch_direct_xmit(): the dashed lines mark code elided by the
 * author of these notes, and the function is not shown to its end.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
---------------------------------------------
	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	/* Packet validation: GSO segmentation and checksum computation. */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		/*
		 * Per the original note: if txq is stopped (the
		 * QUEUE_STATE_ANY_XOFF_OR_FROZEN bits are set) ret simply
		 * stays NETDEV_TX_BUSY; if txq is running normally,
		 * dev_hard_start_xmit() is called to send the packet.
		 */
		/* Hand the packet to the driver. */
		skb = dev_hard_start_xmit(skb, dev, txq, &ret);
-----------------------------------
	}
看下 gso 的处理方式:入口函数skb_gso_segment
这个函数将skb分片,并返回一个skb list。如果skb不需要分片则返回NULL。
/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	/*
	 * NOTE(review): per the original note, skb_needs_check() amounts to
	 * "ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY" -
	 * the helper is not shown here; confirm.
	 */
	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header:
		 * copy the header of the skb when it is required.
		 * If the skb passed lacks sufficient headroom or its data part
		 * is shared, data is reallocated. If reallocation fails, an error
		 * is returned and original skb is not changed.
		 */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	/*
	 * Record mac_offset; per the original note skb_segment uses it when
	 * copying the outer headers into each segment.
	 */
	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	/* encap_level 0 marks the outermost packet. */
	SKB_GSO_CB(skb)->encap_level = 0;

	/* Reset the MAC header offset and the MAC length. */
	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	Looks up the packet_offload entry matching the skb's network protocol
 *	and calls its gso_segment callback. Returns that callback's result,
 *	ERR_PTR(-EINVAL) when the protocol cannot be determined, or
 *	ERR_PTR(-EPROTONOSUPPORT) when no matching handler is registered.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	/* mac_len was computed by the caller (__skb_gso_segment, per the original note). */
	int vlan_depth = skb->mac_len;
	/* Determine the network-layer protocol of the skb. */
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	/* Advance skb->data by vlan_depth so it points at the network (IP) header. */
	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			/* Call the network-layer GSO segment function (inet_gso_segment for IPv4). */
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Move skb->data back to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
IP层对GSO的支持
需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum
/*
 * IPv4 GSO handler: validates the IP header, delegates the actual split to
 * the L4 protocol's gso_segment callback, then fixes up the IP header
 * (id, frag_off, tot_len, header checksum) of every resulting segment.
 *
 * Returns the segment list, NULL if the L4 handler returned NULL, or an
 * ERR_PTR on failure.
 */
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	const struct net_offload *ops;
	unsigned int offset = 0;
	bool udpfrag, encap;
	struct iphdr *iph;
	int proto;
	int nhoff;
	int ihl;
	int id;

	/* Reject skbs whose gso_type carries any bit outside the accepted set. */
	if (unlikely(skb_shinfo(skb)->gso_type &
		     ~(SKB_GSO_TCPV4 |
		       SKB_GSO_UDP |
		       SKB_GSO_DODGY |
		       SKB_GSO_TCP_ECN |
		       SKB_GSO_GRE |
		       SKB_GSO_GRE_CSUM |
		       SKB_GSO_IPIP |
		       SKB_GSO_SIT |
		       SKB_GSO_TCPV6 |
		       SKB_GSO_UDP_TUNNEL |
		       SKB_GSO_UDP_TUNNEL_CSUM |
		       SKB_GSO_TUNNEL_REMCSUM |
		       0)))
		goto out;

	skb_reset_network_header(skb);
	/* Offset of the IP header relative to the MAC header. */
	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	/* The data must hold at least a minimal IP header. */
	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
		goto out;

	iph = ip_hdr(skb);
	/* Actual IP header length in bytes; the L4 header starts right after it. */
	ihl = iph->ihl * 4;
	if (ihl < sizeof(*iph))
		goto out;

	/* IP id of the original packet. */
	id = ntohs(iph->id);
	/* L4 protocol number, used below to find the matching offload handler. */
	proto = iph->protocol;

	/* Warning: after this point, iph might be no longer valid */
	/* Make sure the whole IP header, options included, can be pulled. */
	if (unlikely(!pskb_may_pull(skb, ihl)))
		goto out;
	/* Advance skb->data to the transport header. */
	__skb_pull(skb, ihl);

	encap = SKB_GSO_CB(skb)->encap_level > 0;
	/* For an inner (encapsulated) header, restrict features to hw_enc_features. */
	if (encap)
		features &= skb->dev->hw_enc_features;
	/* A non-zero encap_level marks subsequent headers as inner headers. */
	SKB_GSO_CB(skb)->encap_level += ihl;

	skb_reset_transport_header(skb);

	segs = ERR_PTR(-EPROTONOSUPPORT);

	if (skb->encapsulation &&
	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
		udpfrag = proto == IPPROTO_UDP && encap;
	else
		/*
		 * NOTE(review): per the original note, VXLAN-encapsulated
		 * packets take this branch and end up with udpfrag == false -
		 * confirm.
		 */
		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;

	/* Hand over to the upper-layer (TCP or UDP) GSO handler. */
	ops = rcu_dereference(inet_offloads[proto]);
	if (likely(ops && ops->callbacks.gso_segment))
		segs = ops->callbacks.gso_segment(skb, features);

	if (IS_ERR_OR_NULL(segs))
		goto out;

	/* Walk the segment list and fix up each IP header. */
	skb = segs;
	do {
		/* Locate the IP header from the segment's MAC header plus nhoff. */
		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
		if (udpfrag) {
			/* IP-fragment semantics: same id, increasing fragment offset. */
			iph->id = htons(id);
			iph->frag_off = htons(offset >> 3);
			/* More segments follow: set the "more fragments" flag. */
			if (skb->next)
				iph->frag_off |= htons(IP_MF);
			/* Offset for the next segment. */
			offset += skb->len - nhoff - ihl;
		} else {
			/* Every segment is a complete IP packet with its own id. */
			iph->id = htons(id++);
		}
		iph->tot_len = htons(skb->len - nhoff);
		/* Recompute the IP header checksum. */
		ip_send_check(iph);
		/* When handling an inner header, refresh the inner header offsets. */
		if (encap)
			skb_reset_inner_headers(skb);
		skb->network_header = (u8 *)iph - skb->head;
	} while ((skb = skb->next));

out:
	return segs;
}
TCP层对GSO的支持
UDP经过GSO分片后每个分片的IP头部id是一样的,这符合IP分片的逻辑;但是为什么TCP的GSO分段,IP头部的id会依次加1呢?原因是:TCP在三次握手过程中协商出合适的MSS,MSS加上IP首部和TCP首部后不会超过路径MTU,因此TCP数据封装成IP数据包通过网络层发送时,正常情况下不会再发生IP分片,接收端传输层收到各个TCP段之后按序列号进行重组。由于GSO应该保证对外透明,其效果应该和在TCP层直接分段的效果一样,所以这里对UDP的处理是IP分片逻辑,而对TCP的处理是构造新的、各自完整的IP报文(skb)。
• 对于GSO:
UDP:所有分片ip头部id都相同,设置IP_MF分片标志(除最后一片) (等同于IP分片)
TCP:分片后,每个分片IP头部中id加1, (等同于TCP分段)
/*
 * TCP-over-IPv4 GSO entry point: makes sure the TCP checksum field holds
 * the pseudo-header sum (CHECKSUM_PARTIAL form), then hands the skb to the
 * protocol-independent tcp_gso_segment().
 *
 * Returns ERR_PTR(-EINVAL) when a TCP header cannot be pulled, otherwise
 * the result of tcp_gso_segment().
 */
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		return ERR_PTR(-EINVAL);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		const struct iphdr *iph = ip_hdr(skb);
		/* The IP layer has already set the transport header offset. */
		struct tcphdr *th = tcp_hdr(skb);

		/* Set up checksum pseudo header, usually expect stack to
		 * have done this already.
		 */

		th->check = 0;
		skb->ip_summed = CHECKSUM_PARTIAL;
		/* Seed the check field from the source and destination addresses. */
		__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	}

	/* Do the actual TCP GSO segmentation. */
	return tcp_gso_segment(skb, features);
}
/*
 * Split a TCP GSO skb into MSS-sized segments and fix up each segment's
 * TCP header (sequence number, FIN/PSH/CWR flags, checksum).
 *
 * Returns the segment list; NULL when the features can take the skb as is
 * (only gso_segs is recomputed); or an ERR_PTR on failure.
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int sum_truesize = 0;
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;
	struct sk_buff *gso_skb = skb;
	__sum16 newcheck;
	bool ooo_okay, copy_destructor;

	th = tcp_hdr(skb);
	/* TCP header length in bytes. */
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Make sure the full TCP header, options included, can be pulled. */
	if (!pskb_may_pull(skb, thlen))
		goto out;

	/*
	 * Save the 16-bit complement of the current skb->len (TCP header plus
	 * payload at this point); it is folded into the checksum deltas below.
	 */
	oldlen = (u16)~skb->len;
	/* Advance skb->data past the TCP header to the payload. */
	__skb_pull(skb, thlen);

	mss = tcp_skb_mss(skb);
	/* Payload that already fits into one MSS is not segmented. */
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */
		int type = skb_shinfo(skb)->gso_type;

		/* Reject unknown gso_type bits, or a type that is neither TCPv4 nor TCPv6. */
		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       SKB_GSO_GRE |
			       SKB_GSO_GRE_CSUM |
			       SKB_GSO_IPIP |
			       SKB_GSO_SIT |
			       SKB_GSO_UDP_TUNNEL |
			       SKB_GSO_UDP_TUNNEL_CSUM |
			       SKB_GSO_TUNNEL_REMCSUM |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		/* Recompute the segment count and return without segmenting. */
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	copy_destructor = gso_skb->destructor == tcp_wfree;
	ooo_okay = gso_skb->ooo_okay;
	/* All segments but the first should have ooo_okay cleared */
	skb->ooo_okay = 0;

	/* Split the payload into mss-sized pieces. */
	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Only first segment might have ooo_okay set */
	segs->ooo_okay = ooo_okay;

	/*
	 * Checksum adjustment for a full-size segment: new length
	 * (thlen + mss) combined with the complement of the old length.
	 */
	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	/*
	 * The TCP header can be read straight from each segment.
	 * NOTE(review): per the original note this is guaranteed by
	 * skb_segment (or udp4_ufo_fragment) - confirm.
	 */
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
		tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);

	/* Derive the full-size segment checksum incrementally from the original value. */
	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
					       (__force u32)delta));

	/* Fix up every segment except the last, which is handled after the loop. */
	do {
		th->fin = th->psh = 0;
		th->check = newcheck;

		/* Without checksum offload, compute the final checksum now. */
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check = gso_make_checksum(skb, ~th->check);

		/* Sequence number of the next segment. */
		seq += mss;
		if (copy_destructor) {
			skb->destructor = gso_skb->destructor;
			skb->sk = gso_skb->sk;
			sum_truesize += skb->truesize;
		}
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

	/* Following permits TCP Small Queues to work well with GSO :
	 * The callback to TCP stack will be called at the time last frag
	 * is freed at TX completion, and not right now when gso_skb
	 * is freed by GSO engine
	 */
	if (copy_destructor) {
		swap(gso_skb->sk, skb->sk);
		swap(gso_skb->destructor, skb->destructor);
		sum_truesize += skb->truesize;
		atomic_add(sum_truesize - gso_skb->truesize,
			   &skb->sk->sk_wmem_alloc);
	}

	/* The last segment may be shorter, so it gets its own delta. */
	delta = htonl(oldlen + (skb_tail_pointer(skb) -
				skb_transport_header(skb)) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	/* Without checksum offload, compute the final checksum now. */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = gso_make_checksum(skb, ~th->check);
out:
	return segs;
}
skb_segment是实现封装报文GSO分段的基础
/**
 *	skb_segment - Perform protocol segmentation on skb.
 *	@head_skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function performs segmentation on the given skb.  It returns
 *	a pointer to the first in a list of new skbs for the segments.
 *	In case of error it returns ERR_PTR(err).
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
			    netdev_features_t features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
	skb_frag_t *frag = skb_shinfo(head_skb)->frags;
	unsigned int mss = skb_shinfo(head_skb)->gso_size;
	/*
	 * Number of header bytes between the MAC header and skb->data: per
	 * the original note MAC + IP + TCP headers, or MAC + IP only for UDP
	 * (whose caller does not pull the L4 header first).
	 */
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	struct sk_buff *frag_skb = head_skb;
	unsigned int offset = doffset;
	/*
	 * NOTE(review): per the original note this is the outer (tunnel)
	 * header length, 0 for non-encapsulated packets - helper not shown.
	 */
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
	unsigned int headroom;
	unsigned int len;
	__be16 proto;
	bool csum;
	/* Whether the output path supports scatter-gather. */
	int sg = !!(features & NETIF_F_SG);
	int nfrags = skb_shinfo(head_skb)->nr_frags;
	int err = -ENOMEM;
	int i = 0;
	int pos;
	int dummy;

	/* Move skb->data back to the MAC header. */
	__skb_push(head_skb, doffset);
	/* Network protocol of the packet. */
	proto = skb_network_protocol(head_skb, &dummy);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);

	csum = !head_skb->encap_hdr_csum &&
	    !!can_checksum_protocol(features, proto);

	headroom = skb_headroom(head_skb);
	/* pos starts at the linear-area length of head_skb. */
	pos = skb_headlen(head_skb);

	do {
		struct sk_buff *nskb;
		skb_frag_t *nskb_frag;
		int hsize;
		int size;

		/*
		 * offset is how much of head_skb has been consumed so far; it
		 * starts at doffset (the headers) and grows by len on every
		 * iteration. len is the payload size of the segment being
		 * built: mss, or whatever remains for the last segment.
		 */
		len = head_skb->len - offset;
		if (len > mss)
			len = mss;

		/*
		 * hsize is the part of this segment's payload that still lies
		 * in head_skb's linear area. Once offset has moved past the
		 * linear area it is clamped to 0 and the payload comes from
		 * frags or frag_list instead.
		 */
		hsize = skb_headlen(head_skb) - offset;
		if (hsize < 0)
			hsize = 0;
		/*
		 * More linear data than one segment needs, or no
		 * scatter-gather support: hsize becomes the whole segment
		 * length.
		 */
		if (hsize > len || !sg)
			hsize = len;

		/*
		 * Linear area and frags array are both exhausted
		 * (i >= nfrags): take the data from the frag_list skb.
		 */
		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			/* The frag_list skb's linear area must not exceed len. */
			BUG_ON(skb_headlen(list_skb) > len);

			i = 0;
			nfrags = skb_shinfo(list_skb)->nr_frags;
			frag = skb_shinfo(list_skb)->frags;
			frag_skb = list_skb;
			/* Account for the frag_list skb's linear area. */
			pos += skb_headlen(list_skb);

			/* Advance over the whole frags that fit into this segment. */
			while (pos < offset + len) {
				BUG_ON(i >= nfrags);

				size = skb_frag_size(frag);
				if (pos + size > offset + len)
					break;

				i++;
				pos += size;
				frag++;
			}

			/*
			 * No data copy for frag_list: clone the skb
			 * descriptor to reuse its data, then trim the clone.
			 */
			nskb = skb_clone(list_skb, GFP_ATOMIC);
			list_skb = list_skb->next;

			if (unlikely(!nskb))
				goto err;

			/* Trim the clone to len bytes. */
			if (unlikely(pskb_trim(nskb, len))) {
				kfree_skb(nskb);
				goto err;
			}

			hsize = skb_end_offset(nskb);
			/* Ensure enough headroom for the doffset header bytes. */
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			/* Adjust truesize by the change in end offset caused by skb_cow_head. */
			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			/* Move data back so it starts at the MAC header. */
			__skb_push(nskb, doffset);
		} else {
			/*
			 * Linear data or frags remain: allocate a fresh skb
			 * with room for hsize payload bytes plus headers.
			 */
			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			/* Reserve headroom, then extend the linear area by the header length. */
			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		/* Append the new segment to the result list. */
		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		/* Copy skb metadata, including the header offsets. */
		__copy_skb_header(nskb, head_skb);

		/* Shift header offsets by the headroom difference; reset mac_len. */
		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		skb_reset_mac_len(nskb);

		/*
		 * Copy doffset header bytes, plus tnl_hlen bytes in front of
		 * them (outer headers when encapsulated), from head_skb.
		 */
		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);

		/* True for the cloned frag_list case; false when frags still need copying. */
		if (nskb->len == len + doffset)
			goto perform_csum_check;

		/*
		 * No scatter-gather: copy the payload into the linear area
		 * and compute the checksum in the same pass.
		 */
		if (!sg && !nskb->remcsum_offload) {
			nskb->ip_summed = CHECKSUM_NONE;
			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
							    skb_put(nskb, len),
							    len, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
			continue;
		}

		nskb_frag = skb_shinfo(nskb)->frags;

		/* Copy the hsize linear payload bytes (may be 0) into nskb. */
		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);

		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
			SKBTX_SHARED_FRAG;

		/*
		 * The remaining (len - hsize) bytes are taken by reference
		 * from the frags array.
		 */
		while (pos < offset + len) {
			if (i >= nfrags) {
				BUG_ON(skb_headlen(list_skb));

				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;

				BUG_ON(!nfrags);

				/* frag_list case: move on to the next skb. */
				list_skb = list_skb->next;
			}

			if (unlikely(skb_shinfo(nskb)->nr_frags >=
				     MAX_SKB_FRAGS)) {
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
				goto err;
			}

			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
				goto err;

			/* frag_list and frags handling share this path. */
			*nskb_frag = *frag;
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			/* This frag starts before offset: skip its consumed front part. */
			if (pos < offset) {
				nskb_frag->page_offset += offset - pos;
				skb_frag_size_sub(nskb_frag, offset - pos);
			}

			skb_shinfo(nskb)->nr_frags++;

			if (pos + size <= offset + len) {
				i++;
				frag++;
				pos += size;
			} else {
				/* The frag extends past this segment: cut off its tail. */
				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
				goto skip_fraglist;
			}

			nskb_frag++;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;

perform_csum_check:
		/* No checksum offload for this protocol: compute it in software. */
		if (!csum && !nskb->remcsum_offload) {
			nskb->csum = skb_checksum(nskb, doffset,
						  nskb->len - doffset, 0);
			nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
		}
	} while ((offset += len) < head_skb->len);

	/* Some callers want to get the end of the list.
	 * Put it in segs->prev to avoid walking the list.
	 * (see validate_xmit_skb_list() for example)
	 */
	segs->prev = tail;

	/* Following permits correct backpressure, for protocols
	 * using skb_set_owner_w().
	 * Idea is to transfer ownership from head_skb to last segment.
	 */
	if (head_skb->destructor == sock_wfree) {
		swap(tail->truesize, head_skb->truesize);
		swap(tail->destructor, head_skb->destructor);
		swap(tail->sk, head_skb->sk);
	}
	return segs;

err:
	kfree_skb_list(segs);
	return ERR_PTR(err);
}
输出报文 分片:
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ /* PARTIAL类型需要清除校验和 */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; /* * Point into the IP datagram header. */ iph = ip_hdr(skb); /* 获取mtu */ mtu = ip_skb_dst_mtu(sk, skb); /* 接收到的最大分片长度 < mtu,则将mtu设置为该值 */ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. */ hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ /* 有分片列表 */ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; /* 线性区域和分页区的数据长度 */ unsigned int first_len = skb_pagelen(skb); /* 以下情况,进入慢路处理 */ if (first_len - hlen > mtu || /* 分片长度>MTU */ ((first_len - hlen) & 7) || /* 没有8字节对齐 */ ip_is_fragment(iph) || /* 是一个分片 */ skb_cloned(skb)) /* 是克隆的 */ goto slow_path; /* 遍历分片列表 */ skb_walk_frags(skb, frag) { /* Correct geometry. */ /* 以下情况,恢复状态,进入慢速路径 */ if (frag->len > mtu || /* 分片长度>mtu */ ((frag->len & 7) && frag->next) || /* 除最后一个分片外,其余有非8字节对齐的 */ skb_headroom(frag) < hlen) /* 头部长度过小 */ goto slow_path_clean; /* Partially cloned skb? 
*/ /* 克隆的,恢复状态,进入慢速路径 */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); /* 分片关联控制块 */ if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } /* 第一个skb的长度去掉当前分片的长度 */ skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ /* 现在分片没问题了,设置分片信息 */ err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); /* 循环设置分片信息,并发送 */ for (;;) { /* Prepare header of the next frame, * before previous one went down. */ /* 为每一片都拷贝ip头,设置偏移信息 */ if (frag) { frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) ip_options_fragment(frag); offset += skb->len - hlen; iph->frag_off = htons(offset>>3); if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } /* 调用发送回调 */ err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; skb = frag; frag = skb->next; skb->next = NULL; } if (err == 0) { IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } /* 出错,释放分片 */ while (frag) { skb = frag->next; kfree_skb(frag); frag = skb; } IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: /* 将分片恢复原状态 */ skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* 慢速分片路径 */ iph = ip_hdr(skb); /* 除去首部的剩余空间 */ left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ /* 二层头部空间 */ ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* * Fragment the datagram. 
*/ /* 初始化mf和offset */ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; not_last_frag = iph->frag_off & htons(IP_MF); /* * Keep copying data until we run out. */ /* 开始分片了 */ while (left > 0) { /* len初始为剩余长度 */ len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ /* 根据mtu确认长度 */ if (len > mtu) len = mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ /* 除最后分片外,其余8字节对齐 */ if (len < left) { len &= ~7; } /* Allocate buffer */ /* 分配skb */ skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); if (!skb2) { err = -ENOMEM; goto fail; } /* * Set up data on packet */ /* 拷贝元数据 */ ip_copy_metadata(skb2, skb); /* 预留空间,设置头部偏移 */ skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner * it might possess */ /* 关联sk */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ /* 拷贝头部 */ skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ /* 拷贝数据 */ if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); /* 设置偏移 *// iph->frag_off = htons((offset >> 3)); /* 转发的数据包,带有FRAG_PMTU标记,则打上DF */ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) iph->frag_off |= htons(IP_DF); /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. 
*/ /* 第一个分片包含ip选项 */ if (offset == 0) ip_options_fragment(skb); /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ /* 不是最后分片需要设定MF标记 */ if (left > 0 || not_last_frag) iph->frag_off |= htons(IP_MF); /* 指针和偏移更新 */ ptr += len; offset += len; /* * Put this fragment into the sending queue. */ /* 设置数据长度 */ iph->tot_len = htons(len + hlen); /* 校验和 */ ip_send_check(iph); /* 发送分片 */ err = output(net, sk, skb2); if (err) goto fail; IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } /* 分片完成并发送,释放skb */ consume_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: /* 出错,释放skb */ kfree_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; }
http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!!
但行好事 莫问前程
--身高体重180的胖子