Linux tracepoint分析
tracepoint介绍
Brendan Gregg 在其博客中对 TCP 相关的 tracepoint 做了说明,淘宝内核团队的技术博客也对该文章进行了翻译,本文在此基础上学习整理如下。
测试系统版本
$ uname -r
5.4.0-72-generic
利用bcc的tplist工具查看当前版本支持的tracepoint点:
$ sudo /usr/share/bcc/tools/tplist -v 'tcp:*'
tcp:tcp_retransmit_skb
const void * skbaddr;
const void * skaddr;
int state;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
tcp:tcp_send_reset
const void * skbaddr;
const void * skaddr;
int state;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
tcp:tcp_receive_reset
const void * skaddr;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
__u64 sock_cookie;
tcp:tcp_destroy_sock
const void * skaddr;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
__u64 sock_cookie;
tcp:tcp_rcv_space_adjust
const void * skaddr;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
__u64 sock_cookie;
tcp:tcp_retransmit_synack
const void * skaddr;
const void * req;
__u16 sport;
__u16 dport;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
tcp:tcp_probe
__u8 saddr[sizeof(struct sockaddr_in6)];
__u8 daddr[sizeof(struct sockaddr_in6)];
__u16 sport;
__u16 dport;
__u32 mark;
__u16 data_len;
__u32 snd_nxt;
__u32 snd_una;
__u32 snd_cwnd;
__u32 ssthresh;
__u32 snd_wnd;
__u32 srtt;
__u32 rcv_wnd;
__u64 sock_cookie;
$ sudo /usr/share/bcc/tools/tplist -v 'sock:*'
sock:sock_rcvqueue_full
int rmem_alloc;
unsigned int truesize;
int sk_rcvbuf;
sock:sock_exceed_buf_limit
char name[32];
long * sysctl_mem;
long allocated;
int sysctl_rmem;
int rmem_alloc;
int sysctl_wmem;
int wmem_alloc;
int wmem_queued;
int kind;
sock:inet_sock_set_state
const void * skaddr;
int oldstate;
int newstate;
__u16 sport;
__u16 dport;
__u16 family;
__u8 protocol;
__u8 saddr[4];
__u8 daddr[4];
__u8 saddr_v6[16];
__u8 daddr_v6[16];
对上述tracepoint的说明如下:
- tcp:tcp_retransmit_skb: 跟踪重传。对于理解包括拥塞在内的网络问题很有用。Brendan Gregg 的 tcpretrans 工具将改用它来替代 kprobes。
- tcp:tcp_retransmit_synack: 跟踪 SYN 和 SYN/ACK 重传。将它们剥离出来很有趣,是因为它们可以表明服务器的饱和度(listen backlog 丢包)而不是网络拥塞。它对应着 LINUX_MIB_TCPSYNRETRANS。
- tcp:tcp_destroy_sock: 任何在内存中汇总 TCP 会话详情(以 sock 地址作为键)的程序都需要它。通过这个探测点可以得知会话已经结束、该 sock 地址即将被复用,因此截至此时汇总的数据都应该先被消费,然后删除。
- tcp:tcp_send_reset: 这个会跟踪一个有效 socket 下的 RST 发送,用以诊断相关类型的问题。
- tcp:tcp_receive_reset: 跟踪 RST 接收。
- tcp:tcp_probe: 用以跟踪 TCP 拥塞窗口,它的引入也使得更老的 TCP probe 模块被废弃并移除。该 tracepoint 由 Masami Hiramatsu 提交,并在 4.16 中合入。
- sock:inet_sock_set_state: 可以用来做很多事情。tcplife 工具就是其中一个例子,Brendan Gregg 的 tcpconnect 和 tcpaccept bcc 工具也可以改为使用这个 tracepoint。虽然也可以添加单独的 tcp:tcp_connect 和 tcp:tcp_accept tracepoint(或者 tcp:tcp_active_open 和 tcp:tcp_passive_open),但直接使用 sock:inet_sock_set_state 就能满足这些需求。
使用示例
编写ply程序,利用sock:inet_sock_set_state实现连接状态变化跟踪:
sudo ply 'tracepoint:sock/inet_sock_set_state { printf("saddr: %v sport: %v -> daddr: %v dport: %v, old_state: %v, new_state: %v\n", data->saddr, data->sport, data->daddr, data->dport, data->oldstate, data->newstate);}'
输出如下:
$ sudo ply 'tracepoint:sock/inet_sock_set_state { printf("saddr: %v sport: %v -> daddr: %v dport: %v, old_state: %v, new_state: %v\n", data->saddr, data->sport, data->daddr, data->dport, data->oldstate, data->newstate);}'
ply: active
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 47600 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [13, 225, 93, 17] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 48316 -> daddr: [13, 225, 93, 17] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [99, 84, 203, 13] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 42292 -> daddr: [99, 84, 203, 13] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 90, 87] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 47610 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 47612 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 47606 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 0 -> daddr: [104, 193, 90, 87] dport: 443, old_state: 7, new_state: 2
saddr: [192, 168, 136, 163] sport: 44286 -> daddr: [104, 193, 90, 87] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 47618 -> daddr: [104, 193, 88, 77] dport: 443, old_state: 2, new_state: 1
saddr: [192, 168, 136, 163] sport: 44292 -> daddr: [104, 193, 90, 87] dport: 443, old_state: 2, new_state: 1
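其中关于连接状态的枚举值,可以参考Linux内核源码。例如上面输出中的 old_state: 7, new_state: 2 表示 TCP_CLOSE -> TCP_SYN_SENT(发起连接),old_state: 2, new_state: 1 表示 TCP_SYN_SENT -> TCP_ESTABLISHED(连接建立):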
# /include/net/tcp_states.h
enum {
TCP_ESTABLISHED = 1,
TCP_SYN_SENT,
TCP_SYN_RECV,
TCP_FIN_WAIT1,
TCP_FIN_WAIT2,
TCP_TIME_WAIT,
TCP_CLOSE,
TCP_CLOSE_WAIT,
TCP_LAST_ACK,
TCP_LISTEN,
TCP_CLOSING, /* Now a valid state */
TCP_NEW_SYN_RECV,
TCP_MAX_STATES /* Leave at the end! */
};
参考
http://www.brendangregg.com/blog/2018-03-22/tcp-tracepoints.html
https://kernel.taobao.org/2019/10/TCP-Tracepoints/