套接字之相关系统调用的调用流程
最近一直在读内核网络协议栈源码,这里以ipv4/tcp为例对socket相关系统调用的流程做一个简要整理,这些相关系统调用的内部细节虽然各有不同,但其调用流程则基本一致;
调用流程:
(1)系统调用 –> (2)查找socket –> (3)执行socket的对应操作函数 –> (4)执行传输层协议的对应操作函数;
中间核心数据结构为inetsw_array[],位于af_inet.c,以第一个元素type=SOCK_STREAM,protocol=IPPROTO_TCP为例,该类型适用于tcp协议,当创建tcp socket时,其操作socket->ops赋值为&inet_stream_ops,对应的传输控制块操作sk->sk_prot赋值为&tcp_prot;
1 /* Upon startup we insert all the elements in inetsw_array[] into 2 * the linked list inetsw. 3 */ 4 static struct inet_protosw inetsw_array[] = 5 { 6 { 7 .type = SOCK_STREAM, 8 .protocol = IPPROTO_TCP, 9 .prot = &tcp_prot, 10 .ops = &inet_stream_ops, 11 .flags = INET_PROTOSW_PERMANENT | 12 INET_PROTOSW_ICSK, 13 }, 14 15 { 16 .type = SOCK_DGRAM, 17 .protocol = IPPROTO_UDP, 18 .prot = &udp_prot, 19 .ops = &inet_dgram_ops, 20 .flags = INET_PROTOSW_PERMANENT, 21 }, 22 23 { 24 .type = SOCK_DGRAM, 25 .protocol = IPPROTO_ICMP, 26 .prot = &ping_prot, 27 .ops = &inet_sockraw_ops, 28 .flags = INET_PROTOSW_REUSE, 29 }, 30 31 { 32 .type = SOCK_RAW, 33 .protocol = IPPROTO_IP, /* wild card */ 34 .prot = &raw_prot, 35 .ops = &inet_sockraw_ops, 36 .flags = INET_PROTOSW_REUSE, 37 } 38 };
查看inet_stream_ops结构会发现,其中包含了各种socket系统调用对应的处理函数;
1 const struct proto_ops inet_stream_ops = { 2 .family = PF_INET, 3 .owner = THIS_MODULE, 4 .release = inet_release, 5 .bind = inet_bind, 6 .connect = inet_stream_connect, 7 .socketpair = sock_no_socketpair, 8 .accept = inet_accept, 9 .getname = inet_getname, 10 .poll = tcp_poll, 11 .ioctl = inet_ioctl, 12 .listen = inet_listen, 13 .shutdown = inet_shutdown, 14 .setsockopt = sock_common_setsockopt, 15 .getsockopt = sock_common_getsockopt, 16 .sendmsg = inet_sendmsg, 17 .recvmsg = inet_recvmsg, 18 .mmap = sock_no_mmap, 19 .sendpage = inet_sendpage, 20 .splice_read = tcp_splice_read, 21 .read_sock = tcp_read_sock, 22 .peek_len = tcp_peek_len, 23 #ifdef CONFIG_COMPAT 24 .compat_setsockopt = compat_sock_common_setsockopt, 25 .compat_getsockopt = compat_sock_common_getsockopt, 26 .compat_ioctl = inet_compat_ioctl, 27 #endif 28 };
具体实例,以tcp bind系统调用为例:
1 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) 2 { 3 struct socket *sock; 4 struct sockaddr_storage address; 5 int err, fput_needed; 6 7 /* 获取socket ,fput_need标识是否需要减少文件引用计数*/ 8 sock = sockfd_lookup_light(fd, &err, &fput_needed); 9 if (sock) { 10 /* 将用户空间地址复制到内核空间 */ 11 err = move_addr_to_kernel(umyaddr, addrlen, &address); 12 if (err >= 0) { 13 /* 安全模块的bind检查 */ 14 err = security_socket_bind(sock, 15 (struct sockaddr *)&address, 16 addrlen); 17 if (!err) 18 /* 调用socket的bind操作 */ 19 err = sock->ops->bind(sock, 20 (struct sockaddr *) 21 &address, addrlen); 22 } 23 24 /* 根据fput_needed决定是否减少引用计数 */ 25 fput_light(sock->file, fput_needed); 26 } 27 return err; 28 }
上面的sock->ops->bind操作实际是调用了inet_stream_ops.bind,即inet_bind:
1 /* 地址绑定 */ 2 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 3 { 4 /* 省略无关代码 */ 5 /* If the socket has its own bind function then use it. (RAW) */ 6 /* 7 如果传输控制块有自己的bind操作则调用, 8 目前只有raw实现了自己的bind 9 */ 10 if (sk->sk_prot->bind) { 11 err = sk->sk_prot->bind(sk, uaddr, addr_len); 12 goto out; 13 } 14 15 /* 省略无关代码 */ 16 17 /* 18 端口不为0,或者端口为0允许绑定 19 则使用协议的具体获取端口函数绑定端口 20 */ 21 if ((snum || !inet->bind_address_no_port) && 22 sk->sk_prot->get_port(sk, snum)) { 23 24 /* 绑定失败 */ 25 inet->inet_saddr = inet->inet_rcv_saddr = 0; 26 27 /* 端口在使用中 */ 28 err = -EADDRINUSE; 29 goto out_release_sock; 30 } 31 32 /* 省略无关代码 */ 33 out_release_sock: 34 release_sock(sk); 35 out: 36 return err; 37 }
上面的sk->sk_prot->bind以及sk->sk_prot->get_port为具体传输层实现的对应操作函数,其中只有raw socket实现了bind操作,我们不关注,而以tcp的get_port操作为例,实际上也就是调用了tcp_prot.get_port,具体tcp实现为inet_csk_get_port;(该函数尚未分析,后续补充)
1 /* Obtain a reference to a local port for the given sock, 2 * if snum is zero it means select any available local port. 3 * We try to allocate an odd port (and leave even ports for connect()) 4 */ 5 int inet_csk_get_port(struct sock *sk, unsigned short snum) 6 { 7 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; 8 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; 9 int ret = 1, port = snum; 10 struct inet_bind_hashbucket *head; 11 struct net *net = sock_net(sk); 12 struct inet_bind_bucket *tb = NULL; 13 kuid_t uid = sock_i_uid(sk); 14 15 if (!port) { 16 head = inet_csk_find_open_port(sk, &tb, &port); 17 if (!head) 18 return ret; 19 if (!tb) 20 goto tb_not_found; 21 goto success; 22 } 23 head = &hinfo->bhash[inet_bhashfn(net, port, 24 hinfo->bhash_size)]; 25 spin_lock_bh(&head->lock); 26 inet_bind_bucket_for_each(tb, &head->chain) 27 if (net_eq(ib_net(tb), net) && tb->port == port) 28 goto tb_found; 29 tb_not_found: 30 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 31 net, head, port); 32 if (!tb) 33 goto fail_unlock; 34 tb_found: 35 if (!hlist_empty(&tb->owners)) { 36 if (sk->sk_reuse == SK_FORCE_REUSE) 37 goto success; 38 39 if ((tb->fastreuse > 0 && reuse) || 40 sk_reuseport_match(tb, sk)) 41 goto success; 42 if (inet_csk_bind_conflict(sk, tb, true, true)) 43 goto fail_unlock; 44 } 45 success: 46 if (!hlist_empty(&tb->owners)) { 47 tb->fastreuse = reuse; 48 if (sk->sk_reuseport) { 49 tb->fastreuseport = FASTREUSEPORT_ANY; 50 tb->fastuid = uid; 51 tb->fast_rcv_saddr = sk->sk_rcv_saddr; 52 tb->fast_ipv6_only = ipv6_only_sock(sk); 53 #if IS_ENABLED(CONFIG_IPV6) 54 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 55 #endif 56 } else { 57 tb->fastreuseport = 0; 58 } 59 } else { 60 if (!reuse) 61 tb->fastreuse = 0; 62 if (sk->sk_reuseport) { 63 /* We didn't match or we don't have fastreuseport set on 64 * the tb, but we have sk_reuseport set on this socket 65 * and we know that there are no bind conflicts with 66 * this socket in this tb, so reset 
our tb's reuseport 67 * settings so that any subsequent sockets that match 68 * our current socket will be put on the fast path. 69 * 70 * If we reset we need to set FASTREUSEPORT_STRICT so we 71 * do extra checking for all subsequent sk_reuseport 72 * socks. 73 */ 74 if (!sk_reuseport_match(tb, sk)) { 75 tb->fastreuseport = FASTREUSEPORT_STRICT; 76 tb->fastuid = uid; 77 tb->fast_rcv_saddr = sk->sk_rcv_saddr; 78 tb->fast_ipv6_only = ipv6_only_sock(sk); 79 #if IS_ENABLED(CONFIG_IPV6) 80 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 81 #endif 82 } 83 } else { 84 tb->fastreuseport = 0; 85 } 86 } 87 if (!inet_csk(sk)->icsk_bind_hash) 88 inet_bind_hash(sk, tb, port); 89 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); 90 ret = 0; 91 92 fail_unlock: 93 spin_unlock_bh(&head->lock); 94 return ret; 95 }