tcp pmtu探测
root@ubuntu:~/c++# cat /proc/sys/net/ipv4/tcp_mtu_probing
0
root@ubuntu:~/c++#
/* Turn on the IP_DONTFRAG option for socket `sd` (the text below notes that
 * Linux uses IP_MTU_DISCOVER instead). */
int dont_fragment = 1;

setsockopt(sd, IPPROTO_IP, IP_DONTFRAG,
           &dont_fragment, sizeof(dont_fragment));
Here's a page explaining this in further detail.
For Linux, it appears you have to use the IP_MTU_DISCOVER option with the value IP_PMTUDISC_DO (or IP_PMTUDISC_DONT to turn it off):
/* Linux: select the path-MTU discovery mode for socket `sd`.
 * IP_PMTUDISC_DO turns it on; IP_PMTUDISC_DONT would turn it off. */
int pmtu_mode = IP_PMTUDISC_DO;

setsockopt(sd, IPPROTO_IP, IP_MTU_DISCOVER,
           &pmtu_mode, sizeof(pmtu_mode));
/* emsgsize.c: test whether IP_PMTUDISC_PROBE suppresses EMSGSIZE
 *
 * Usage: emsgsize packet_size
 *
 * Sends one UDP datagram of packet_size bytes to 127.0.0.1:12345 from a
 * socket configured with IP_MTU_DISCOVER = IP_PMTUDISC_PROBE.
 * Exit status: 0 if sendto() succeeded, 1 on a usage error or any failed call
 * (the failing call is reported on stderr).
 */
#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

/* If status s_ is negative, report it with perror(w_) and leave main(). */
#define CHECK(w_, s_) do { if ((s_) < 0) { perror(w_); return 1; }} while (0)

/* Payload */
static unsigned char data[64 * 1024];

int main(int argc, char **argv)
{
    int fd, on, s;
    long size;
    char *end;
    struct sockaddr_in si;
    ssize_t sent;

    if (argc != 2) {
        fprintf(stderr, "usage: emsgsize size\n");
        return 1;
    }

    /* strtol() instead of atoi(): reject non-numeric, negative, or
     * out-of-range input rather than handing sendto() a bogus length.
     * The upper bound keeps the length within the payload buffer. */
    errno = 0;
    size = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        size < 0 || (unsigned long) size > sizeof(data)) {
        fprintf(stderr, "emsgsize: size must be an integer in 0..%zu\n",
                sizeof(data));
        return 1;
    }

    /* Bind to the all-zero address/port (memset leaves both at 0). */
    memset(&si, 0, sizeof(si));
    si.sin_family = AF_INET;
    fd = socket(si.sin_family, SOCK_DGRAM, 0);
    CHECK("socket", fd);
    s = bind(fd, (struct sockaddr *) &si, sizeof(si));
    CHECK("bind", s);

    /* This is supposed to suppress sendmsg(2) returning -1 with
     * errno = EMSGSIZE, see ip(7):
     *
     * " It is possible to implement RFC 4821 MTU probing with SOCK_DGRAM
     * " or SOCK_RAW sockets by setting a value of IP_PMTUDISC_PROBE
     * " (available since Linux 2.6.22). This is also particularly use-
     * " ful for diagnostic tools such as tracepath(8) that wish to de-
     * " liberately send probe packets larger than the observed Path MTU.
     */
    on = IP_PMTUDISC_PROBE;
    s = setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &on, sizeof(on));
    CHECK("setsockopt", s);

    memset(&si, 0, sizeof(si));
    si.sin_family = AF_INET;
    si.sin_port = htons(12345); /* Destination does not matter */
    s = inet_pton(AF_INET, "127.0.0.1", &si.sin_addr);
    CHECK("inet_pton", s);
    if (s == 0) {
        /* inet_pton() returns 0 (not -1) for an unparsable address and
         * does not set errno in that case, so CHECK cannot catch it. */
        fprintf(stderr, "inet_pton: invalid address\n");
        return 1;
    }

    sent = sendto(fd, data, (size_t) size, 0,
                  (struct sockaddr *) &si, sizeof(si));
    CHECK("sendto", sent);

    return 0;
}
当TCP客户端发起连接建立请求时,在函数tcp_connect_init中调用TCP的MTU探测初始化函数tcp_mtup_init。如上所述默认情况下enabled为零,使用MSS最大限制值mss_clamp加上TCP头部长度和网络层头部长度作为MTU探测的上限值,下限值由函数tcp_mss_to_mtu通过基础MSS值计算得到。
void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; }
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { struct net *net = sock_net(sk); /* Black hole detection */ if (net->ipv4.sysctl_tcp_mtu_probing) { //2=启用, 1表示只有检测到black hole的时候才启用 if (!icsk->icsk_mtup.enabled) { //说明sysctl_tcp_mtu_probing=1 icsk->icsk_mtup.enabled = 1; //检测到black hole,启用tcp mtu probe icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //更新mss_cache } else { //说明sysctl_tcp_mtu_probing=2,已经启用tcp mtu probe了 struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); int mss; mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tp->tcp_header_len); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); //减小下限,再试 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } } }
配置及初始化
想要启用tcp mtu probe, 先要设置ip_no_pmtu_disc=0(默认值), 表示启用pmtu discovery。 这样tcp发送的时候才会设置DF标记。
通过DF标记,中间路由设备如果需要分片就会返还ICMP消息通知, 但是有可能因为防火墙等原因,发送方收不到ICMP消息,因此发送方一直发送探测包,却一直没收到回应, 这个就称为black hole。
tcp_mtu_probing 的内核默认值为0,表示完全禁用TCP MTU探测;设置为1时默认不启用探测,只有当检测到black hole的时候,才会开启tcp mtu probe;设置为2时始终启用。
root@ubuntu:~# ping -s 2500 -M do 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 2500(2528) bytes of data. ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ping: local error: Message too long, mtu=1500 ^C --- 8.8.8.8 ping statistics --- 10 packets transmitted, 0 received, +10 errors, 100% packet loss, time 9213ms root@ubuntu:~#
demo2
root@ubuntu:~# ping -s 1400 -M do 8.8.8.8 PING 8.8.8.8 (8.8.8.8) 1400(1428) bytes of data. 76 bytes from 8.8.8.8: icmp_seq=1 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=2 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=3 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=4 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=5 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=6 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=7 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=8 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=9 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=10 ttl=101 (truncated) 76 bytes from 8.8.8.8: icmp_seq=11 ttl=101 (truncated) ^C --- 8.8.8.8 ping statistics --- 11 packets transmitted, 11 received, 0% packet loss, time 10006ms rtt min/avg/max/mdev = 47.750/47.863/48.119/0.285 ms root@ubuntu:~#
root@ubuntu:~# tcpdump -i enahisic2i0 icmp and host 8.8.8.8 -nnvv tcpdump: listening on enahisic2i0, link-type EN10MB (Ethernet), capture size 262144 bytes 16:05:25.061926 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428) 10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 1, length 1408 16:05:25.110142 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96) 8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 1, length 76 16:05:26.063245 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428) 10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 2, length 1408 16:05:26.111113 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96) 8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 2, length 76 16:05:27.065189 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428) 10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 3, length 1408 16:05:27.113076 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96) 8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 3, length 76