getsockopt的TCP层实现剖析
应用层
NAME
getsockopt - get options on sockets
SYNOPSIS
#include <sys/types.h>
#include <sys/socket.h>
int getsockopt (int s, int level, int optname, void *optval, socklen_t *optlen);
调用关系
函数的调用关系图如下:
数据结构
/*
 * Excerpt of struct tcp_sock. Only members referenced by the getsockopt
 * code quoted in this article are shown; "..." marks members that are
 * omitted from the excerpt.
 */
struct tcp_sock {
	...
	u32	mss_cache;		/* Cached effective mss, not including SACKS */

	u8	nonagle     : 4,	/* Disable Nagle algorithm? */
		thin_lto    : 1,	/* Use linear timeouts for thin streams */
		thin_dupack : 1,	/* Fast retransmit on first dupack */
		unused      : 2;

	unsigned int	keepalive_time;		/* time before keep alive takes place */
	unsigned int	keepalive_intvl;	/* time interval between keep alive probes */
	u8		keepalive_probes;	/* num of allowed keep alive probes */

	int	linger2;
	u8	ecn_flags;		/* ECN status bits. */

	u32	rcv_tstamp;		/* timestamp of last received ACK (for keepalives) */
	u32	lsndtime;		/* timestamp of last sent data packet (for restart window) */

	u32	window_clamp;		/* Maximal window to advertise by the local end */
	u32	rcv_ssthresh;		/* Current window clamp */
	u16	advmss;			/* Advertised MSS of the local end */

	u32	total_retrans;		/* Total retransmits for entire connection */
	...
};
/*
 * Excerpt of struct tcp_options_received: the option flags consulted by
 * tcp_get_info(). "..." marks members omitted from the excerpt.
 */
struct tcp_options_received {
	...
	/*
	 * The seven bit-fields below add up to 16 bits (1+1+1+1+4+4+4) and
	 * therefore fill the u16 completely, so the declarator list ends
	 * with ';' after rcv_wscale.
	 */
	u16	saw_tstamp : 1,		/* Saw TIMESTAMP on last packet */
		tstamp_ok  : 1,		/* TIMESTAMP seen on SYN packet */
		dsack      : 1,		/* D-SACK is scheduled */
		wscale_ok  : 1,		/* Wscale seen on SYN packet */
		sack_ok    : 4,		/* SACK seen on SYN packet */
		snd_wscale : 4,		/* Window scaling received from sender */
		rcv_wscale : 4;		/* Window scaling to send to receiver */
	...
};
/* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 #define TCPI_OPT_WSCALE 4 #define TCPI_OPT_ECN 8 struct tcp_info { __u8 tcpi_state; /* TCP状态 */ __u8 tcpi_ca_state; /* TCP拥塞状态 */ __u8 tcpi_retransmits; /* 超时重传的次数 */ __u8 tcpi_probes; /* 持续定时器或保活定时器发送且未确认的段数*/ __u8 tcpi_backoff; /* 退避指数 */ __u8 tcpi_options; /* 时间戳选项、SACK选项、窗口扩大选项、ECN选项是否启用*/ __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; /* 发送、接收的窗口扩大因子*/ __u32 tcpi_rto; /* 超时时间,单位为微秒*/ __u32 tcpi_ato; /* 延时确认的估值,单位为微秒*/ __u32 tcpi_snd_mss; /* 本端的MSS */ __u32 tcpi_rcv_mss; /* 对端的MSS */ __u32 tcpi_unacked; /* 未确认的数据段数,或者current listen backlog */ __u32 tcpi_sacked; /* SACKed的数据段数,或者listen backlog set in listen() */ __u32 tcpi_lost; /* 丢失且未恢复的数据段数 */ __u32 tcpi_retrans; /* 重传且未确认的数据段数 */ __u32 tcpi_fackets; /* FACKed的数据段数 */ /* Times. 单位为毫秒 */ __u32 tcpi_last_data_sent; /* 最近一次发送数据包在多久之前 */ __u32 tcpi_last_ack_sent; /* 不能用。Not remembered, sorry. */ __u32 tcpi_last_data_recv; /* 最近一次接收数据包在多久之前 */ __u32 tcpi_last_ack_recv; /* 最近一次接收ACK包在多久之前 */ /* Metrics. */ __u32 tcpi_pmtu; /* 最后一次更新的路径MTU */ __u32 tcpi_rcv_ssthresh; /* current window clamp,rcv_wnd的阈值 */ __u32 tcpi_rtt; /* 平滑的RTT,单位为微秒 */ __u32 tcpi_rttvar; /* 四分之一mdev,单位为微秒v */ __u32 tcpi_snd_ssthresh; /* 慢启动阈值 */ __u32 tcpi_snd_cwnd; /* 拥塞窗口 */ __u32 tcpi_advmss; /* 本端能接受的MSS上限,在建立连接时用来通告对端 */ __u32 tcpi_reordering; /* 没有丢包时,可以重新排序的数据段数 */ __u32 tcpi_rcv_rtt; /* 作为接收端,测出的RTT值,单位为微秒*/ __u32 tcpi_rcv_space; /* 当前接收缓存的大小 */ __u32 tcpi_total_retrans; /* 本连接的总重传个数 */ };
函数实现
内核版本:2.6.37
/*
 * TCP-layer entry point for getsockopt().
 *
 * Options at level SOL_TCP are answered by do_tcp_getsockopt(); any other
 * level is forwarded to the getsockopt handler of the socket's address
 * family operations (icsk_af_ops).
 *
 * Returns whatever the selected handler returns.
 */
int tcp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen)
{
	if (level == SOL_TCP)
		return do_tcp_getsockopt(sk, level, optname, optval, optlen);

	return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname,
						    optval, optlen);
}
/*
 * Read one SOL_TCP socket option.
 *
 * @sk:      socket being queried
 * @level:   option level (not examined here; the caller has already checked it)
 * @optname: TCP_* option to read
 * @optval:  user buffer that receives the value
 * @optlen:  user pointer; on entry the size of @optval, on success the
 *           number of bytes written
 *
 * Integer-valued options fall out of the switch with the result in 'val'
 * and share the copy-out code at the bottom. TCP_INFO and TCP_CONGESTION
 * return data that is not a single int, so they re-read the caller's length
 * and copy out on their own.
 *
 * Returns 0 on success, -EFAULT if a user-space access fails, -EINVAL for a
 * bad length, -ENOPROTOOPT for an unknown option.
 *
 * "..." marks case bodies that are elided in this excerpt.
 */
static int do_tcp_getsockopt(struct sock *sk, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	/* Integer options never copy out more than sizeof(int) bytes. */
	len = min_t(unsigned int, len, sizeof(int));

	/*
	 * NOTE(review): the clamp above is done as unsigned int, so len is
	 * already within [0, sizeof(int)] and this test cannot fire. Kept
	 * as-is to preserve behaviour.
	 */
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;	/* article notes a default of 1460 */
		/*
		 * No cached MSS while the socket is closed or listening:
		 * report the MSS the user asked for instead. The state mask
		 * must be OR-ed together before it is AND-ed with the state
		 * bit, otherwise the test is always true.
		 */
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;	/* mss requested by user in ioctl */
		break;
	case TCP_NODELAY:
		/* article notes Nagle is on by default, so this reads 0 */
		val = !!(tp->nonagle & TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle & TCP_NAGLE_CORK);	/* article notes a default of 0 */
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;	/* article notes a default of 7200 s */
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;	/* article notes a default of 75 s */
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);		/* article notes a default of 9 */
		break;
	case TCP_SYNCNT:
		/* per-socket value if set, otherwise the sysctl (article: 5) */
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		/* Negative values are reported unchanged. */
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;	/* article notes a default of 60 s */
		/* Must not fall through into TCP_DEFER_ACCEPT. */
		break;
	case TCP_DEFER_ACCEPT:
		...
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		/* Re-read the length: 'len' was clamped to sizeof(int) above. */
		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);	/* collect the connection details */

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;	/* quick-ACK mode */
		break;
	case TCP_CONGESTION:
		/* Re-read the length: 'len' was clamped to sizeof(int) above. */
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);	/* article: 16 bytes */
		if (put_user(len, optlen))
			return -EFAULT;
		/* article notes the default algorithm is cubic */
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;
	case TCP_COOKIE_TRANSACTIONS:
		...
	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;	/* article notes a default of 0 */
		break;
	case TCP_THIN_DUPACK:
		val = tp->thin_dupack;	/* article notes a default of 0 */
		break;
	case TCP_USER_TIMEOUT:
		val = jiffies_to_msecs(icsk->icsk_user_timeout);
		break;
	default:
		return -ENOPROTOOPT;
	}

	/* Shared copy-out for the integer-valued options. */
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}
/*
 * Keepalive idle time for this socket: the per-socket setting when it is
 * non-zero, otherwise the system-wide sysctl value.
 */
static inline int keepalive_time_when(const struct tcp_sock *tp)
{
	if (tp->keepalive_time)
		return tp->keepalive_time;

	return sysctl_tcp_keepalive_time;
}

/*
 * Interval between keepalive probes for this socket: the per-socket setting
 * when it is non-zero, otherwise the system-wide sysctl value.
 */
static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
	if (tp->keepalive_intvl)
		return tp->keepalive_intvl;

	return sysctl_tcp_keepalive_intvl;
}

/*
 * Number of keepalive probes allowed for this socket: the per-socket setting
 * when it is non-zero, otherwise the system-wide sysctl value.
 */
static inline int keepalive_probes(const struct tcp_sock *tp)
{
	if (tp->keepalive_probes)
		return tp->keepalive_probes;

	return sysctl_tcp_keepalive_probes;
}
TCP_INFO
/* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; /* TCP状态 */ info->tcpi_ca_state = icsk_ca_state; /* TCP拥塞状态 */ info->tcpi_retransmits = icsk->icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ info->tcpi_probes = icsk->icsk_probes_out; /* unanswered 0 window probes */ info->tcpi_backoff = icsk->icsk_backoff; /* 退避指数 */ if (tp->rx_opt.tstamp_ok) /* TIMESTAMP seen on SYN packet */ info->tcpi_options |= TCPI_OPT_TIMESTAMPS; /* 时间戳选项使用与否 */ if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; /* SACK选项使用与否 */ if (tp->rx_opt.wscale_ok) { /* Wscale seen on SYN packet */ info->tcpi_options |= TCPI_OPT_WSCALE; /* 窗口扩大选项使用与否 */ info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; /* 发送窗口扩大因子 */ info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; /* 接收窗口扩大因子 */ } if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; /* ECN选项使用与否 */ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); /* RTO,单位微秒 */ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); /* Predicted tick of soft clock */ info->tcpi_snd_mss = tp->mss_cache; /* 本端MSS */ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; /* 对端MSS */ if (sk->sk_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; /* current listen backlog */ info->tcpi_sacked = sk->sk_max_ack_backlog; /* listen backlog set in listen() */ } else { info->tcpi_unacked = tp->packets_out; /* 未确认的数据包数 */ info->tcpi_sacked = tp->sacked_out; /* SACKed的数据包数 */ } info->tcpi_lost = tp->lost_out; /* 丢失的数据包数 */ info->tcpi_retrans = tp->retrans_out; /* 重传的数据包数 */ info->tcpi_fackets = tp->fackets_out; /* FACKed的数据包数 */ info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); /* 最近一次发数据包的时间间隔 */ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); /* 
最近一次收数据包的时间间隔 */ info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); /* 最近一次收ACK包的时间间隔 */ info->tcpi_pmtu = icsk->icsk_pmtu_cookie; /* Last pmtu seen by socket */ info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; /* current window clamp*/ info->tcpi_rtt = jiffies_to_usecs(tp->srtt) >> 3; /* 平滑的RTT */ info->tcpi_rttvar = jiffies_to_usecs(tp->mdev) >> 2; /* 四分之一mdev */ info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; /* 本端能接受的MSS上限,在建立连接时用来通告对端 */ info->tcpi_reordering = tp->reordering; /* 没有丢包时,可以重新排序的数据段数 */ info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt) >> 3; /* 作为接收端,测出的RTT值 */ info->tcpi_rcv_space = tp->rcvq_space.space; /* 接收缓存的大小 */ info->tcpi_total_retrans = tp->total_retrans; /* 本连接的总重传个数 */ }
Author
zhangskd @ csdn blog