tcp pmtu探测

 

 

root@ubuntu:~/c++# cat /proc/sys/net/ipv4/tcp_mtu_probing
0
root@ubuntu:~/c++# 

 

 

/* Set the IP_DONTFRAG option to 1 on socket sd. Presumably this is the
 * non-Linux variant, since the text below gives a separate Linux form. */
int val = 1;
setsockopt(sd, IPPROTO_IP, IP_DONTFRAG, &val, sizeof(val));

Here's a page explaining this in further detail.

For Linux, it appears you have to use the IP_MTU_DISCOVER option with the value IP_PMTUDISC_DO (or IP_PMTUDISC_DONT to turn it off):

/* Linux form: IP_PMTUDISC_DO enables the behaviour on socket sd;
 * use IP_PMTUDISC_DONT instead to turn it off. */
int val = IP_PMTUDISC_DO;
setsockopt(sd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));


/* emsgsize.c: test whether IP_PMTUDISC_PROBE suppresses EMSGSIZE
 *
 * Usage: emsgsize packet_size
 */

#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Print perror(w_) and leave main() with status 1 when s_ is negative. */
#define CHECK(w_, s_) do { if ((s_) < 0) { perror(w_); return 1; }} while (0)

/* Payload: zero-filled static buffer; its size is the upper bound for the
 * packet size accepted on the command line. */
static unsigned char data[64 * 1024];

/* Send one UDP datagram of the requested size to 127.0.0.1:12345 from a
 * socket with IP_MTU_DISCOVER set to IP_PMTUDISC_PROBE.
 *
 * argv[1] is the payload size in bytes; it must be a decimal integer in
 * [0, sizeof(data)].
 *
 * Returns 0 when sendto() succeeds, 1 on a usage error, an invalid size,
 * or any failing socket call (the failing call is reported on stderr).
 */
int
main (int argc, char **argv)
{
    int fd, on, s;
    long size;
    char *end;
    struct sockaddr_in si;
    ssize_t sent;

    if (argc != 2)
    {
        fprintf(stderr, "usage: emsgsize size\n");
        return 1;
    }

    /* strtol instead of atoi: reject non-numeric input, and reject sizes
     * that are negative or larger than the payload buffer so sendto()
     * never reads past data[]. */
    errno = 0;
    size = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE
        || size < 0 || (size_t) size > sizeof(data))
    {
        fprintf(stderr, "emsgsize: size must be an integer in [0, %zu]\n",
                                                            sizeof(data));
        return 1;
    }

    memset(&si, 0, sizeof(si));
    si.sin_family = AF_INET;

    fd = socket(si.sin_family, SOCK_DGRAM, 0);
    CHECK("socket", fd);

    /* Bind to the all-zero address/port that memset left in si. */
    s = bind(fd, (struct sockaddr *) &si, sizeof(si));
    CHECK("bind", s);

    /* This is supposed to suppress sendmsg(2) returning -1 with
     * errno = EMSGSIZE, see ip(7):
     *
     "        It is possible to implement RFC 4821 MTU probing with SOCK_DGRAM
     "        or SOCK_RAW sockets by  setting  a  value  of  IP_PMTUDISC_PROBE
     "        (available  since Linux 2.6.22).  This is also particularly use-
     "        ful for diagnostic tools such as tracepath(8) that wish  to  de-
     "        liberately send probe packets larger than the observed Path MTU.
     */
    on = IP_PMTUDISC_PROBE;
    s = setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &on, sizeof(on));
    CHECK("setsockopt", s);

    memset(&si, 0, sizeof(si));
    si.sin_family = AF_INET;
    si.sin_port = htons(12345); /* Destination does not matter */
    s = inet_pton(AF_INET, "127.0.0.1", &si.sin_addr);
    CHECK("inet_pton", s);
    /* inet_pton() returns 0, not a negative value, for an unparsable
     * address, so that case needs its own check. */
    if (s != 1)
    {
        fprintf(stderr, "inet_pton: invalid address\n");
        return 1;
    }

    sent = sendto(fd, data, (size_t) size, 0, (struct sockaddr *) &si,
                                                            sizeof(si));
    CHECK("sendto", sent);

    return 0;
}

 





当TCP客户端发起连接建立请求时,在函数tcp_connect_init中调用TCP的MTU探测初始化函数tcp_mtup_init。如上所述默认情况下enabled为零,使用MSS最大限制值mss_clamp加上TCP头部长度和网络层头部长度作为MTU探测的上限值,下限值由函数tcp_mss_to_mtu通过基础MSS值计算得到。
 

void tcp_mtup_init(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
 
    icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
    icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len;
    icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
    icsk->icsk_mtup.probe_size = 0;
    if (icsk->icsk_mtup.enabled)
        icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;

}

 

/*
 * Black-hole handling for TCP MTU probing.
 *
 * Does nothing when sysctl_tcp_mtu_probing is 0. Otherwise:
 *  - if probing is not yet enabled on this socket, enable it, stamp the
 *    probe time and resync the MSS;
 *  - if probing is already enabled, lower search_low and resync the MSS so
 *    that a smaller size is tried next.
 */
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
{
    struct net *net = sock_net(sk);
    struct tcp_sock *tp;
    int mss;

    /* Black hole detection: sysctl 2 = always probe, 1 = probe only once a
     * black hole has been detected, 0 = never. */
    if (!net->ipv4.sysctl_tcp_mtu_probing)
        return;

    if (!icsk->icsk_mtup.enabled) {
        /* sysctl is non-zero but probing is still off on this socket
         * (the sysctl == 1 case): a black hole was detected, so turn
         * probing on now. */
        icsk->icsk_mtup.enabled = 1;
        /* NOTE(review): tcp_mtup_init stamps this field with
         * tcp_jiffies32 -- confirm which clock this tree uses. */
        icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);    /* refresh mss_cache */
        return;
    }

    /* Probing is already enabled (sysctl == 2, or switched on by an earlier
     * pass through the branch above): lower the search floor and retry. */
    tp = tcp_sk(sk);
    mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;    /* halve */
    mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
    /* Presumably 68 is the IPv4 minimum MTU, keeping the MSS from
     * dropping below it -- TODO confirm. */
    mss = max(mss, 68 - tp->tcp_header_len);
    icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
    tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}

 

配置及初始化

想要启用tcp mtu probe, 先要设置ip_no_pmtu_disc=0(默认值), 表示启用pmtu discovery。 这样tcp发送的时候才会设置DF标记。
通过DF标记,中间路由设备如果需要分片就会返还ICMP消息通知, 但是有可能因为防火墙等原因,发送方收不到ICMP消息,因此发送方一直发送探测包,却一直没收到回应, 这个就称为black hole。
对应的sysctl参数是tcp_mtu_probing: 当tcp_mtu_probing=1时, 表示默认禁用mtu探测, 只有当检测到black hole的时候, 才会开启tcp mtu probe; 2表示始终启用; 0表示完全禁用(本文开头那台机器上查到的值就是0)。

 

 

 

 

root@ubuntu:~# ping -s 2500  -M do  8.8.8.8 
PING 8.8.8.8 (8.8.8.8) 2500(2528) bytes of data.
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
ping: local error: Message too long, mtu=1500
^C
--- 8.8.8.8 ping statistics ---
10 packets transmitted, 0 received, +10 errors, 100% packet loss, time 9213ms

root@ubuntu:~# 

 

demo2

root@ubuntu:~# ping -s 1400  -M do  8.8.8.8 
PING 8.8.8.8 (8.8.8.8) 1400(1428) bytes of data.
76 bytes from 8.8.8.8: icmp_seq=1 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=2 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=3 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=4 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=5 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=6 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=7 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=8 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=9 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=10 ttl=101 (truncated)
76 bytes from 8.8.8.8: icmp_seq=11 ttl=101 (truncated)
^C
--- 8.8.8.8 ping statistics ---
11 packets transmitted, 11 received, 0% packet loss, time 10006ms
rtt min/avg/max/mdev = 47.750/47.863/48.119/0.285 ms
root@ubuntu:~# 

 

root@ubuntu:~#  tcpdump -i enahisic2i0 icmp and  host 8.8.8.8 -nnvv
tcpdump: listening on enahisic2i0, link-type EN10MB (Ethernet), capture size 262144 bytes
16:05:25.061926 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428)
    10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 1, length 1408
16:05:25.110142 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96)
    8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 1, length 76
16:05:26.063245 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428)
    10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 2, length 1408
16:05:26.111113 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96)
    8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 2, length 76
16:05:27.065189 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 1428)
    10.10.1.82 > 8.8.8.8: ICMP echo request, id 20914, seq 3, length 1408
16:05:27.113076 IP (tos 0x0, ttl 101, id 0, offset 0, flags [none], proto ICMP (1), length 96)
    8.8.8.8 > 10.10.1.82: ICMP echo reply, id 20914, seq 3, length 76

 

posted on 2021-04-02 15:47  tycoon3  阅读(1038)  评论(0)  编辑  收藏  举报

导航