TCP接收窗口的调整算法(上)
我们知道TCP首部中有一个16位的接收窗口字段,它可以告诉对端:我现在能接收多少数据。TCP的流控制主要
就是通过调整接收窗口的大小来进行的。
本文内容:分析TCP接收窗口的调整算法,包括一些相关知识和初始接收窗口的取值。
内核版本:3.2.12
作者:zhangskd @ csdn blog
数据结构
以下是涉及到的数据结构。
/* Excerpt of struct tcp_sock: the fields involved in receive-window handling. */
struct tcp_sock {
	...
	/* Sequence number of the earliest segment received but not yet
	 * acknowledged, i.e. the left edge of the current receive window.
	 */
	u32 rcv_wup;		/* rcv_nxt on last window update sent */
	u16 advmss;		/* Advertised MSS: upper bound on the MSS this end
				 * can receive; advertised to the peer when the
				 * connection is established.
				 */
	u32 rcv_ssthresh;	/* Current window clamp: threshold for the current
				 * receive window size.
				 */
	u32 rcv_wnd;		/* Current receiver window (current receive window size) */
	u32 window_clamp;	/* Maximum receive window; this value is also
				 * adjusted dynamically.
				 */
	...
}
/* Excerpt of struct tcp_options_received: window-scale and MSS related options. */
struct tcp_options_received {
	...
		snd_wscale : 4,	/* Window scaling received from sender:
				 * the peer's receive window scale factor.
				 */
		rcv_wscale : 4;	/* Window scaling to send to receiver:
				 * the local receive window scale factor.
				 */
	u16 user_mss;		/* mss requested by user in ioctl */
	u16 mss_clamp;		/* Maximal mss, negotiated at connection setup
				 * (the peer's maximum MSS).
				 */
}
/**
 * struct sock - network layer representation of sockets
 * @sk_rcvbuf: size of receive buffer in bytes
 * @sk_receive_queue: incoming packets
 * @sk_write_queue: packet sending queue
 * @sk_sndbuf: size of send buffer in bytes
 */
struct sock {
	...
	struct sk_buff_head sk_receive_queue;
	/* Total data length of all segments held in the receive queue
	 * sk_receive_queue.
	 */
#define sk_rmem_alloc sk_backlog.rmem_alloc
	int sk_rcvbuf;		/* Upper limit on the receive buffer length */
	int sk_sndbuf;		/* Upper limit on the send buffer length */
	struct sk_buff_head sk_write_queue;
	...
}

/* Head of an sk_buff queue: list links, element count and a lock. */
struct sk_buff_head {
	/* These two members must be first. */
	struct sk_buff *next;
	struct sk_buff *prev;

	__u32 qlen;
	spinlock_t lock;
};
/**
 * inet_connection_sock - INET connection oriented sock
 * @icsk_ack: Delayed ACK control data
 *
 * Excerpt: only the delayed-ACK members relevant to receive-window
 * handling are shown.
 */
struct inet_connection_sock {
	...
	struct {
		...
		/* Number of ACK segments that may be sent immediately while
		 * in quick-ACK mode.
		 */
		__u8  quick;	/* Scheduled number of quick acks */
		/* Peer's sending MSS as estimated from the most recently
		 * received segments.
		 */
		__u16 rcv_mss;	/* MSS used for delayed ACK decisions */
	} icsk_ack;
	...
};
struct tcphdr { __be16 source; __be16 dest; __be32 seq; __be32 ack_seq; #if defined (__LITTLE_ENDIAN_BITFIELD) __u16 resl : 4, doff : 4, fin : 1, syn : 1, rst : 1, psh : 1, ack : 1, urg : 1, ece : 1, cwr : 1; #elif defined (__BIG_ENDIAN_BITFIELD) __u16 doff : 4, resl : 4, cwr : 1, ece : 1, urg : 1, ack : 1, psh : 1, rst : 1, syn : 1, fin : 1; #else #error "Adjust your <asm/byteorder.h> defines" #endif __be16 window; /* 接收窗口,在这边呢 */ __sum16 check; __be16 urg_ptr; }
发送窗口和接收窗口的更新:
MSS
先来看下MSS,它在接收窗口的调整中扮演着重要角色。
通过MSS (Maximum Segment Size),数据被分割成TCP认为适合发送的数据块,称为段(Segment)。
注意:这里说的段(Segment)不包括协议首部,只包含数据!
与MSS最为相关的一个参数就是网络设备接口的MTU(Maximum Transmission Unit)。
两台主机之间的路径MTU并不一定是个常数,它取决于当时所选的路由。而选路不一定是对称
的(从A到B的路由和从B到A的路由不同)。因此路径MTU在两个方向上不一定是对称的。
所以,从A到B的有效MSS、从B到A的有效MSS是动态变化的,并且可能不相同。
每个端同时具有几个不同的MSS:
(1)tp->advmss
本端在建立连接时使用的MSS,是本端能接收的MSS上限。
这是从路由缓存中获得的(dst->metrics[RTAX_ADVMSS - 1]),一般是1460。
(2)tp->rx_opt.mss_clamp
对端能接收的MSS上限,min(tp->rx_opt.user_mss, 对端在建立连接时通告的MSS)。
(3)tp->mss_cache
本端当前有效的发送MSS。显然不能超过对端接收的上限,tp->mss_cache <= tp->rx_opt.mss_clamp。
(4)tp->rx_opt.user_mss
用户通过TCP_MAXSEG选项设置的MSS上限,用于决定本端和对端的接收MSS上限。
(5)icsk->icsk_ack.rcv_mss
对端有效的发送MSS的估算值。显然不能超过本端接收的上限,icsk->icsk_ack.rcv_mss <= tp->advmss。
Receive buffer
接收缓存sk->sk_rcvbuf分为两部分:
(1) network buffer,一般占3/4,这部分是协议能够使用的。
(2)application buffer,一般占1/4。
我们在计算连接可用接收缓存的时候,并不会使用整个的sk_rcvbuf,防止应用程序读取数据的速度比
网络数据包到达的速度慢时,接收缓存被耗尽的情况。
以下是详细的说明:
The idea is not to use a complete receive buffer space to calculate the receive buffer.
We reserve some space as an application buffer, and the rest is used to queue incoming data segments.
An application buffer corresponds to the space that should compensate for the delay in time it takes for
an application to read from the socket buffer.
If the application is reading more slowly than the rate at which data are arriving, data will be queued in
the receive buffer. In order to avoid queue getting full, we advertise less receive window so that the sender
can slow down the rate of data transmission and by that time the application gets a chance to read data
from the receiver buffer.
一个包含X字节数据的skb的最小真实内存消耗(truesize):
/* Return minimum truesize of one skb containing X bytes of data.
 * Here X includes the protocol headers. The result adds the aligned sizes
 * of struct sk_buff and struct skb_shared_info to X.
 */
#define SKB_TRUESIZE(X) ((X) +						\
			 SKB_DATA_ALIGN(sizeof(struct sk_buff)) +	\
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
接收窗口的初始化
从最简单的开始,先来看下接收窗口的初始值、接收窗口扩大因子是如何取值的。
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space will be offered.
 * Results are returned through rcv_wnd, window_clamp and rcv_wscale.
 * NOTE: for smooth operation initial space offering should be a multiple of mss
 * if possible. We assume here that mss >= 1. This MUST be enforced by all callers.
 *
 * @__space:      receive space on offer; negative values are treated as 0
 * @mss:          segment size used to round and to cap the initial window
 * @rcv_wnd:      out: initial receive window
 * @window_clamp: in/out: upper bound of the receive window; 0 on entry means
 *                "not set" and is replaced by the largest scaled window
 * @wscale_ok:    non-zero if a window scale factor should be computed
 * @rcv_wscale:   out: receive window scale factor, in the range 0..14
 * @init_rcv_wnd: initial window in segments (callers pass the route metric
 *                RTAX_INITRWND); 0 selects TCP_DEFAULT_INIT_RCVWND
 */
void tcp_select_initial_window (int __space, __u32 mss,
				__u32 *rcv_wnd, __u32 *window_clamp,
				int wscale_ok, __u8 *rcv_wscale,
				__u32 init_rcv_wnd)
{
	/* The receive space cannot be negative. */
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window:
	 * a 16-bit window shifted by the maximum scale factor of 14.
	 */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);

	/* The receive window may not exceed its clamp. */
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767 will break some
	 * buggy TCP stacks. If the admin tells us it is likely we could be speaking
	 * with such a buggy stack we will truncate our initial window offering to
	 * 32K - 1 unless the remote has sent us a window scaling option, which
	 * we interpret as a sign the remote TCP is not misinterpreting the window
	 * field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;

	/* Compute the scale factor needed to represent the largest receive
	 * window this connection may ever use.
	 */
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 * The larger of sysctl_tcp_rmem[2] and sysctl_rmem_max is used
		 * as the system-wide maximum, then limited by this connection's
		 * clamp.
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to a value enough for senders starting with initial
	 * congestion window of TCP_DEFAULT_INIT_RCVWND. Place a limit on the
	 * initial window when mss is larger than 1460.
	 *
	 * This is where the initial receive window is settled, typically at
	 * about 10 segments.
	 */
	if (mss > (1 << *rcv_wscale)) {
		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;	/* 10 */

		if (mss > 1460)
			init_cwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);

		/* when initializing use the value from init_rcv_wnd rather than the
		 * default from above: the caller-supplied value wins, and the
		 * system default is used only when it is 0.
		 */
		if (init_rcv_wnd)
			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
		else
			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535 << (*rcv_wscale), *window_clamp);
}
初始的接收窗口的取值(mss的整数倍):
(1)先考虑路由缓存中的RTAX_INITRWND
(2)再考虑系统默认的TCP_DEFAULT_INIT_RCVWND(10)
(3)最后考虑min(3/4 * sk_rcvbuf, window_clamp):如果这个值比前面得到的窗口还小,则以它为准
窗口扩大因子的取值:
系统允许的接收窗口最大值为max(tcp_rmem[2], rmem_max),而本连接接收窗口的最大值为
min(max(tcp_rmem[2], rmem_max), window_clamp)。
那么我们需要多大的窗口扩大因子,才能用16位来表示最大的接收窗口呢?
如果接收窗口的最大值受限于tcp_rmem[2] = 4194304,那么rcv_wscale = 7,窗口扩大倍数为128。
发送SYN/ACK时的调用路径:tcp_v4_send_synack -> tcp_make_synack -> tcp_select_initial_window。
/* Prepare a SYN-ACK.
 * The excerpt shown here picks the MSS and, the first time it runs for a
 * request (req->rcv_wnd == 0), computes the initial receive window, the
 * window clamp and the window scale factor for that request.
 */
struct sk_buff *tcp_make_synack (struct sock *sk, struct dst_entry *dst,
				 struct request_sock *req,
				 struct request_values *rvp)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcphdr *th;
	struct sk_buff *skb;
	...

	mss = dst_metric_advmss(dst);	/* MSS taken from the route cache */

	/* If the user has set an MSS explicitly, use the smaller of the two. */
	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
		mss = tp->rx_opt.user_mss;

	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;

		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

		/* limit the window selection if the user enforce a smaller rx buffer */
		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
			req->window_clamp = tcp_full_space(sk);

		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
			&req->rcv_wnd,
			&req->window_clamp,
			ireq->wscale_ok,
			&rcv_wscale,
			dst_metric(dst, RTAX_INITRWND));

		/* Record the computed scale factor on the request. */
		ireq->rcv_wscale = rcv_wscale;
	}
	...
}