TCP最大报文段MSS源码分析
概述
本文主要对MSS相关的几个字段结合源码流程进行分析;
字段含义
user_mss(tcp_options_received)–用户配置的mss,优先级最高;
mss_clamp(tcp_options_received)–连接建立时协商得到的mss上限,取对端通告的mss(即对端能接受的最大mss)与user_mss中的较小值;
advmss(tcp_sock)–用于通告对端的mss值,本端能接受的最大mss;
mss_cache(tcp_sock)–缓存发送方当前有效的mss值,根据pmtu变化,不会超过mss_clamp;
rcv_mss(inet_connection_sock)–由最近接收到的段估算的对端mss,主要用来确定是否执行延迟确认;
user_mss配置
user_mss是用户配置的MSS,该MSS优先级最高,如果配置了该MSS,则MSS均不能超过该值;下面为调用setsockopt设置user_mss的代码,其操作字段为TCP_MAXSEG;配置值可以为0,非0时不能小于最小MSS(TCP_MIN_MSS),也不能大于最大窗口值(MAX_TCP_WINDOW),否则返回-EINVAL;
1 static int do_tcp_setsockopt(struct sock *sk, int level, 2 int optname, char __user *optval, unsigned int optlen) 3 { 4 switch (optname) { 5 case TCP_MAXSEG: 6 /* Values greater than interface MTU won't take effect. However 7 * at the point when this call is done we typically don't yet 8 * know which interface is going to be used */ 9 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { 10 err = -EINVAL; 11 break; 12 } 13 tp->rx_opt.user_mss = val; 14 break; 15 }
交互流程代码分析
第一次握手
客户端发送syn
在进行connect操作的初始化中对mss的设置如下:
(1) 如果有用户配置的user_mss,则将mss_clamp(本端最大mss)设置为user_mss;
(2) 调用tcp_sync_mss来同步mss,其主要是根据设备mtu,最大窗口等计算出当前有效的mss,并将该mss记录到tp->mss_cache中;因该函数涉及篇幅较大,在本文最后进行分析;
(3) 设置用于通告给对端的advmss,去路由表中查MSS,这里会用到pmtu,然后将这个值与user_mss比较,取较小的值设置为向对端通告的值;
(4) 估算对端的mss,根据advmss,mss_cache,rcv_wnd,MSS_DEFAULT,MIN_MSS估算rcv_mss;
1 static void tcp_connect_init(struct sock *sk) 2 { 3 /* If user gave his TCP_MAXSEG, record it to clamp */ 4 /* (1)如果配置了user_mss,则设置最大mss为user_mss */ 5 if (tp->rx_opt.user_mss) 6 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; 7 tp->max_window = 0; 8 tcp_mtup_init(sk); 9 /* (2)根据设备mtu同步mss */ 10 tcp_sync_mss(sk, dst_mtu(dst)); 11 12 tcp_ca_dst_init(sk, dst); 13 14 if (!tp->window_clamp) 15 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 16 17 /* 18 (3)设置向对端通告的mss 19 dst_metric_advmss-去路由表中查询mss 20 tcp_mss_clamp-取user_mss和上述查询到的mss之间的较小值 21 */ 22 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); 23 24 /* (4)估算对端mss */ 25 tcp_initialize_rcv_mss(sk); 26 }
在发送syn流程中,会将advmss添加到tcp首部的选项中;调用关系为tcp_transmit_skb->tcp_syn_options->tcp_advertise_mss;可见这里不是直接使用前面设置的tp->advmss,而是调用tcp_advertise_mss重新获取的;
1 /* Compute TCP options for SYN packets. This is not the final 2 * network wire format yet. 3 */ 4 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, 5 struct tcp_out_options *opts, 6 struct tcp_md5sig_key **md5) 7 { 8 /* We always get an MSS option. The option bytes which will be seen in 9 * normal data packets should timestamps be used, must be in the MSS 10 * advertised. But we subtract them from tp->mss_cache so that 11 * calculations in tcp_sendmsg are simpler etc. So account for this 12 * fact here if necessary. If we don't do this correctly, as a 13 * receiver we won't recognize data packets as being full sized when we 14 * should, and thus we won't abide by the delayed ACK rules correctly. 15 * SACKs don't matter, we never delay an ACK when we have any of those 16 * going out. */ 17 opts->mss = tcp_advertise_mss(sk); 18 remaining -= TCPOLEN_MSS_ALIGNED; 19 }
tcp_advertise_mss重新去查路由表获取mss,并且与前面设置的tp->advmss取较小值;如果路由表中的值更小,还会把该值回写到tp->advmss;
1 /* Calculate mss to advertise in SYN segment. 2 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: 3 * 4 * 1. It is independent of path mtu. 5 * 2. Ideally, it is maximal possible segment size i.e. 65535-40. 6 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of 7 * attached devices, because some buggy hosts are confused by 8 * large MSS. 9 * 4. We do not make 3, we advertise MSS, calculated from first 10 * hop device mtu, but allow to raise it to ip_rt_min_advmss. 11 * This may be overridden via information stored in routing table. 12 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, 13 * probably even Jumbo". 14 */ 15 static __u16 tcp_advertise_mss(struct sock *sk) 16 { 17 struct tcp_sock *tp = tcp_sk(sk); 18 const struct dst_entry *dst = __sk_dst_get(sk); 19 int mss = tp->advmss; 20 21 if (dst) { 22 unsigned int metric = dst_metric_advmss(dst); 23 24 if (metric < mss) { 25 mss = metric; 26 tp->advmss = mss; 27 } 28 } 29 30 return (__u16)mss; 31 }
服务器接收syn
服务器当前处于LISTEN状态,收到客户端发来的syn包,在处理过程中,需要解析tcp首部的选项,调用关系为tcp_conn_request->tcp_parse_options,其中解析选项的MSS部分如下,解析mss选项,与user_mss进行对比取较小值,然后将mss_clamp(最大mss)设置为该值;
1 /* Look for tcp options. Normally only called on SYN and SYNACK packets. 2 * But, this can also be called on packets in the established flow when 3 * the fast version below fails. 4 */ 5 void tcp_parse_options(const struct sk_buff *skb, 6 struct tcp_options_received *opt_rx, int estab, 7 struct tcp_fastopen_cookie *foc) 8 { 9 switch (opcode) { 10 case TCPOPT_MSS: 11 if (opsize == TCPOLEN_MSS && th->syn && !estab) { 12 u16 in_mss = get_unaligned_be16(ptr); 13 if (in_mss) { 14 if (opt_rx->user_mss && opt_rx->user_mss < in_mss) 15 in_mss = opt_rx->user_mss; 16 opt_rx->mss_clamp = in_mss; 17 } 18 } 19 break; 20 }
在分配了请求控制块,对控制块进行初始化的时候,使用从选项中获取的最大mss初始化控制块的mss;
1 static void tcp_openreq_init(struct request_sock *req, 2 const struct tcp_options_received *rx_opt, 3 struct sk_buff *skb, const struct sock *sk) 4 { 5 struct inet_request_sock *ireq = inet_rsk(req); 6 /* ... */ 7 req->mss = rx_opt->mss_clamp; 8 /* ... */ 9 }
第二次握手
服务器发送syn+ack
在请求控制块添加到连接链表之后,需要向客户端发送syn+ack,在构造synack包时,需要在选项中指明本端的mss,调用关系如下:tcp_v4_send_synack–>tcp_make_synack–>tcp_synack_options;首先获取mss,方法与前面客户端的方法一致,即从路由表中获取mss,与用户配置的user_mss进行比较,取其中较小值;然后调用选项设置函数将该mss加入到选项中;
1 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, 2 struct request_sock *req, 3 struct tcp_fastopen_cookie *foc, 4 enum tcp_synack_type synack_type) 5 { 6 /* mss取从路由表中查询的mss与user_mss之间的较小值 */ 7 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); 8 /* 设置tcp选项 */ 9 tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + sizeof(*th); 10 }
1 /* Set up TCP options for SYN-ACKs. */ 2 static unsigned int tcp_synack_options(struct request_sock *req, 3 unsigned int mss, struct sk_buff *skb, 4 struct tcp_out_options *opts, 5 const struct tcp_md5sig_key *md5, 6 struct tcp_fastopen_cookie *foc) 7 { 8 struct inet_request_sock *ireq = inet_rsk(req); 9 unsigned int remaining = MAX_TCP_OPTION_SPACE; 10 11 /* We always send an MSS option. */ 12 opts->mss = mss; 13 remaining -= TCPOLEN_MSS_ALIGNED; 14 }
客户端接收syn+ack
客户端当前处于SYN_SENT状态,此时收到服务器发来的syn+ack包,客户端进行以下工作:(1) 解析该包tcp选项中的mss,存入opt_rx->mss_clamp (2) 通过最新的pmtu计算mss (3) 估算对端mss (4) 如果需要进入快速确认(quickack)模式,则需要通过rcv_mss计算快速确认模式的额度;
1 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 2 const struct tcphdr *th) 3 { 4 struct inet_connection_sock *icsk = inet_csk(sk); 5 struct tcp_sock *tp = tcp_sk(sk); 6 struct tcp_fastopen_cookie foc = { .len = -1 }; 7 int saved_clamp = tp->rx_opt.mss_clamp; 8 bool fastopen_fail; 9 /* ... */ 10 /* (1)解析tcp选项 */ 11 tcp_parse_options(skb, &tp->rx_opt, 0, &foc); 12 /* ... */ 13 /* (2)计算mss */ 14 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 15 /* (3)初始化rcv_mss */ 16 tcp_initialize_rcv_mss(sk); 17 /* ... */ 18 /* (4)进入快速ack模式 */ 19 tcp_enter_quickack_mode(sk); 20 }
已连接状态发送数据
tcp发送数据系统调用最终会调用tcp_sendmsg函数,该函数会在发送数据之前,获取发送mss,该mss用于限制后续发送数据段大小;
1 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) 2 { 3 /*...*/ 4 mss_now = tcp_send_mss(sk, &size_goal, flags); 5 /*...*/ 6 }
1 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) 2 { 3 int mss_now; 4 5 mss_now = tcp_current_mss(sk); 6 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); 7 8 return mss_now; 9 }
tcp_current_mss函数根据当前mtu和实际头部选项长度,来更新mss值;
1 /* Compute the current effective MSS, taking SACKs and IP options, 2 * and even PMTU discovery events into account. 3 */ 4 unsigned int tcp_current_mss(struct sock *sk) 5 { 6 const struct tcp_sock *tp = tcp_sk(sk); 7 const struct dst_entry *dst = __sk_dst_get(sk); 8 u32 mss_now; 9 unsigned int header_len; 10 struct tcp_out_options opts; 11 struct tcp_md5sig_key *md5; 12 13 /* 获取当前有效mss */ 14 mss_now = tp->mss_cache; 15 16 /* 路由缓存存在 */ 17 if (dst) { 18 /* 获取路径mtu */ 19 u32 mtu = dst_mtu(dst); 20 21 /* 两个mtu不相等,以当前mtu为准更新mss */ 22 if (mtu != inet_csk(sk)->icsk_pmtu_cookie) 23 mss_now = tcp_sync_mss(sk, mtu); 24 } 25 26 /* 获取头部长度 */ 27 header_len = tcp_established_options(sk, NULL, &opts, &md5) + 28 sizeof(struct tcphdr); 29 /* The mss_cache is sized based on tp->tcp_header_len, which assumes 30 * some common options. If this is an odd packet (because we have SACK 31 * blocks etc) then our calculated header_len will be different, and 32 * we have to adjust mss_now correspondingly */ 33 34 /* 头部长度不等,需要更新mss */ 35 if (header_len != tp->tcp_header_len) { 36 int delta = (int) header_len - tp->tcp_header_len; 37 mss_now -= delta; 38 } 39 40 /* 返回mss */ 41 return mss_now; 42 }
函数tcp_sync_mss
这个函数上面的诸多流程都有用到,这里统一进行分析说明;
1 /* This function synchronize snd mss to current pmtu/exthdr set. 2 3 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts 4 for TCP options, but includes only bare TCP header. 5 6 tp->rx_opt.mss_clamp is mss negotiated at connection setup. 7 It is minimum of user_mss and mss received with SYN. 8 It also does not include TCP options. 9 10 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. 11 12 tp->mss_cache is current effective sending mss, including 13 all tcp options except for SACKs. It is evaluated, 14 taking into account current pmtu, but never exceeds 15 tp->rx_opt.mss_clamp. 16 17 NOTE1. rfc1122 clearly states that advertised MSS 18 DOES NOT include either tcp or ip options. 19 20 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache 21 are READ ONLY outside this function. --ANK (980731) 22 */ 23 /*更新mss */ 24 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) 25 { 26 struct tcp_sock *tp = tcp_sk(sk); 27 struct inet_connection_sock *icsk = inet_csk(sk); 28 int mss_now; 29 30 /* 发现mtu上限>路径mtu,则重置为路径mtu */ 31 if (icsk->icsk_mtup.search_high > pmtu) 32 icsk->icsk_mtup.search_high = pmtu; 33 34 /* 计算当前mss */ 35 mss_now = tcp_mtu_to_mss(sk, pmtu); 36 /* 根据对端通知的最大窗口和当前mss大小调整mss */ 37 mss_now = tcp_bound_to_half_wnd(tp, mss_now); 38 39 /* And store cached results */ 40 /* 记录最新的路径mtu */ 41 icsk->icsk_pmtu_cookie = pmtu; 42 /* 启用了路径mtu发现 */ 43 if (icsk->icsk_mtup.enabled) 44 /* mss为当前mss和mss探测下限计算所得的最小值 */ 45 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); 46 /* 当前mss缓存 */ 47 tp->mss_cache = mss_now; 48 49 return mss_now; 50 }
下面两个函数作用为根据mtu计算mss;
1 /* 计算mss,未包含SACK */ 2 int tcp_mtu_to_mss(struct sock *sk, int pmtu) 3 { 4 /* Subtract TCP options size, not including SACKs */ 5 /* 去掉tcp选项的长度 */ 6 return __tcp_mtu_to_mss(sk, pmtu) - 7 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); 8 }
1 /* 在不根据tcp选项的情况下计算mss */ 2 static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) 3 { 4 const struct tcp_sock *tp = tcp_sk(sk); 5 const struct inet_connection_sock *icsk = inet_csk(sk); 6 int mss_now; 7 8 /* Calculate base mss without TCP options: 9 It is MMS_S - sizeof(tcphdr) of rfc1122 10 */ 11 /* 当前mss = 路径mtu - 网络头 - tcp头 */ 12 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); 13 14 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ 15 if (icsk->icsk_af_ops->net_frag_header_len) { 16 const struct dst_entry *dst = __sk_dst_get(sk); 17 18 if (dst && dst_allfrag(dst)) 19 mss_now -= icsk->icsk_af_ops->net_frag_header_len; 20 } 21 22 /* Clamp it (mss_clamp does not include tcp options) */ 23 /* 当前mss > mss最大值,调整成最大值 */ 24 if (mss_now > tp->rx_opt.mss_clamp) 25 mss_now = tp->rx_opt.mss_clamp; 26 27 /* Now subtract optional transport overhead */ 28 /* mss减去ip选项长度 */ 29 mss_now -= icsk->icsk_ext_hdr_len; 30 31 /* Then reserve room for full set of TCP options and 8 bytes of data */ 32 /* 若不足48,则需要扩充保留40字节的tcp选项和8字节的tcp数据长度 */ 33 /* 8+20+20+18=64,最小包长 */ 34 if (mss_now < 48) 35 mss_now = 48; 36 37 /* 返回mss */ 38 return mss_now; 39 }
tcp_bound_to_half_wnd函数根据对端通告窗口的最大值来调整mss;如果最大窗口大于默认mss,则当前mss不能超过最大窗口的一半,否则不能超过最大窗口本身(最大窗口为0时不做限制);当然也不能太小,最小为68 - tcp_header_len;
1 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) 2 { 3 int cutoff; 4 5 /* When peer uses tiny windows, there is no use in packetizing 6 * to sub-MSS pieces for the sake of SWS or making sure there 7 * are enough packets in the pipe for fast recovery. 8 * 9 * On the other hand, for extremely large MSS devices, handling 10 * smaller than MSS windows in this way does make sense. 11 */ 12 /* 13 对端通告的最大窗口> 默认mss 14 cutoff记录最大窗口的一半 15 */ 16 if (tp->max_window > TCP_MSS_DEFAULT) 17 cutoff = (tp->max_window >> 1); 18 /* <=默认mss,则记录最大窗口 */ 19 else 20 cutoff = tp->max_window; 21 22 23 /* 包大小值限制在68-header <= x <=cutoff之间 */ 24 25 26 27 /* 包大小> cutoff,则从cutoff和最小mtu之间取大的 */ 28 if (cutoff && pktsize > cutoff) 29 return max_t(int, cutoff, 68U - tp->tcp_header_len); 30 31 /* 包大小<= cutoff,返回包大小 */ 32 /* 窗口很大,则使用包大小 */ 33 else 34 return pktsize; 35 }