套接字之相关系统调用的调用流程

 最近一直在读内核网络协议栈源码,这里以ipv4/tcp为例对socket相关系统调用的流程做一个简要整理,这些相关系统调用的内部细节虽然各有不同,但其调用流程则基本一致;

调用流程:

(1)系统调用 –> (2)查找socket –> (3)执行socket的对应操作函数  –> (4)执行传输层协议的对应操作函数;

中间核心数据结构为inetsw_array[],位于af_inet.c,以第一个元素type=SOCK_STREAM,protocol=IPPROTO_TCP为例,该类型适用于tcp协议,当创建tcp socket时,其操作socket->ops赋值为&inet_stream_ops,对应的传输控制块操作sock->sk_prot赋值为&tcp_prot;

 1 /* Upon startup we insert all the elements in inetsw_array[] into
 2  * the linked list inetsw.
 3  * Each entry ties a socket (type, protocol) pair to its .prot and .ops tables. */
 4 static struct inet_protosw inetsw_array[] =
 5 {
 6     {   /* TCP stream sockets */
 7         .type =       SOCK_STREAM,
 8         .protocol =   IPPROTO_TCP,
 9         .prot =       &tcp_prot,          /* transport-layer ops; becomes sk_prot when a TCP socket is created */
10         .ops =        &inet_stream_ops,   /* socket-layer ops; becomes socket->ops */
11         .flags =      INET_PROTOSW_PERMANENT |
12                   INET_PROTOSW_ICSK,
13     },
14 
15     {   /* UDP datagram sockets */
16         .type =       SOCK_DGRAM,
17         .protocol =   IPPROTO_UDP,
18         .prot =       &udp_prot,
19         .ops =        &inet_dgram_ops,
20         .flags =      INET_PROTOSW_PERMANENT,
21        },
22 
23        {   /* ICMP over SOCK_DGRAM, served by ping_prot */
24         .type =       SOCK_DGRAM,
25         .protocol =   IPPROTO_ICMP,
26         .prot =       &ping_prot,
27         .ops =        &inet_sockraw_ops,
28         .flags =      INET_PROTOSW_REUSE,
29        },
30 
31        {   /* raw sockets */
32            .type =       SOCK_RAW,
33            .protocol =   IPPROTO_IP,    /* wild card */
34            .prot =       &raw_prot,
35            .ops =        &inet_sockraw_ops,
36            .flags =      INET_PROTOSW_REUSE,
37        }
38 };

 

查看inet_stream_ops结构会发现,其中包含了各种socket系统调用对应的处理函数;

 1 const struct proto_ops inet_stream_ops = {  /* socket-layer operation table used by the SOCK_STREAM/IPPROTO_TCP entry of inetsw_array[] */
 2     .family           = PF_INET,
 3     .owner           = THIS_MODULE,
 4     .release       = inet_release,
 5     .bind           = inet_bind,  /* reached from the bind syscall through sock->ops->bind */
 6     .connect       = inet_stream_connect,
 7     .socketpair       = sock_no_socketpair,  /* NOTE(review): presumably a "not supported" stub — confirm */
 8     .accept           = inet_accept,
 9     .getname       = inet_getname,
10     .poll           = tcp_poll,
11     .ioctl           = inet_ioctl,
12     .listen           = inet_listen,
13     .shutdown       = inet_shutdown,
14     .setsockopt       = sock_common_setsockopt,
15     .getsockopt       = sock_common_getsockopt,
16     .sendmsg       = inet_sendmsg,
17     .recvmsg       = inet_recvmsg,
18     .mmap           = sock_no_mmap,  /* NOTE(review): presumably a "not supported" stub — confirm */
19     .sendpage       = inet_sendpage,
20     .splice_read       = tcp_splice_read,
21     .read_sock       = tcp_read_sock,
22     .peek_len       = tcp_peek_len,
23 #ifdef CONFIG_COMPAT  /* the compat_* handlers below exist only when CONFIG_COMPAT is defined */
24     .compat_setsockopt = compat_sock_common_setsockopt,
25     .compat_getsockopt = compat_sock_common_getsockopt,
26     .compat_ioctl       = inet_compat_ioctl,
27 #endif
28 };

 

具体实例,以tcp bind系统调用为例:

 1 SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)  /* bind syscall: find the socket for fd, then dispatch to sock->ops->bind */
 2 {
 3     struct socket *sock;
 4     struct sockaddr_storage address;
 5     int err, fput_needed;
 6 
 7     /* Look up the socket; fput_needed records whether the file reference count must be dropped afterwards */
 8     sock = sockfd_lookup_light(fd, &err, &fput_needed);
 9     if (sock) {
10         /* Copy the user-space address into kernel space */
11         err = move_addr_to_kernel(umyaddr, addrlen, &address);
12         if (err >= 0) {
13             /* Security-module check for bind */
14             err = security_socket_bind(sock,
15                            (struct sockaddr *)&address,
16                            addrlen);
17             if (!err)
18                 /* Invoke the socket's bind operation */
19                 err = sock->ops->bind(sock,
20                               (struct sockaddr *)
21                               &address, addrlen);
22         }
23 
24         /* Drop the reference count if fput_needed says so */
25         fput_light(sock->file, fput_needed);
26     }
27     return err;  /* when the lookup failed, err holds what sockfd_lookup_light stored through &err */
28 }

 

上面的sock->ops->bind操作实际是调用了inet_stream_ops.bind,即inet_bind:

 1 /* Address binding (abridged excerpt: sk, inet, snum and err are set up in the omitted parts) */
 2 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 3 {
 4     /* unrelated code omitted */
 5     /* If the socket has its own bind function then use it. (RAW) */
 6     /* 
 7         If the transport control block provides its own bind operation, call it;
 8         per the author's note, currently only raw implements its own bind
 9     */
10     if (sk->sk_prot->bind) {
11         err = sk->sk_prot->bind(sk, uaddr, addr_len);
12         goto out;
13     }
14     
15     /* unrelated code omitted */
16 
17     /* 
18         If the port is non-zero, or it is zero and bind_address_no_port is not set,
19         bind the port through the protocol-specific get_port function
20     */
21     if ((snum || !inet->bind_address_no_port) &&
22         sk->sk_prot->get_port(sk, snum)) {
23 
24         /* Binding failed: clear both source address fields */
25         inet->inet_saddr = inet->inet_rcv_saddr = 0;
26 
27         /* The port is in use */
28         err = -EADDRINUSE;
29         goto out_release_sock;
30     }
31 
32    /* unrelated code omitted */
33 out_release_sock:
34     release_sock(sk);
35 out:
36     return err;
37 }

 

上面的sk->sk_prot->bind以及sk->sk_prot->get_port为具体传输层实现的对应操作函数,其中只有raw socket实现了bind操作,我们不关注,而以tcp的get_port操作为例,实际上也就是调用了tcp_prot.get_port,具体tcp实现为inet_csk_get_port;(该函数尚未分析,后续补充)

 1 /* Obtain a reference to a local port for the given sock,
 2  * if snum is zero it means select any available local port.
 3  * We try to allocate an odd port (and leave even ports for connect())
 4  * Returns 0 on success, non-zero (1) on failure. */
 5 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 6 {
 7     bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 8     struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 9     int ret = 1, port = snum;  /* ret stays 1 (failure) until the success path clears it */
10     struct inet_bind_hashbucket *head;
11     struct net *net = sock_net(sk);
12     struct inet_bind_bucket *tb = NULL;
13     kuid_t uid = sock_i_uid(sk);
14 
15     if (!port) {  /* no port requested: let the helper pick one */
16         head = inet_csk_find_open_port(sk, &tb, &port);  /* NOTE(review): presumably returns with head->lock held, since every later exit unlocks it — confirm */
17         if (!head)
18             return ret;
19         if (!tb)
20             goto tb_not_found;
21         goto success;
22     }
23     head = &hinfo->bhash[inet_bhashfn(net, port,
24                       hinfo->bhash_size)];
25     spin_lock_bh(&head->lock);  /* released at fail_unlock */
26     inet_bind_bucket_for_each(tb, &head->chain)  /* look for an existing bucket for this netns and port */
27         if (net_eq(ib_net(tb), net) && tb->port == port)
28             goto tb_found;
29 tb_not_found:
30     tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
31                      net, head, port);
32     if (!tb)
33         goto fail_unlock;
34 tb_found:
35     if (!hlist_empty(&tb->owners)) {  /* bucket already has owners: decide whether sharing is allowed */
36         if (sk->sk_reuse == SK_FORCE_REUSE)  /* forced reuse skips the conflict check */
37             goto success;
38 
39         if ((tb->fastreuse > 0 && reuse) ||
40             sk_reuseport_match(tb, sk))
41             goto success;
42         if (inet_csk_bind_conflict(sk, tb, true, true))  /* conflict: fail with ret still 1 */
43             goto fail_unlock;
44     }
45 success:
46     if (hlist_empty(&tb->owners)) {  /* empty bucket: seed its fast-reuse state from this socket (test must not be negated, or the two branches swap) */
47         tb->fastreuse = reuse;
48         if (sk->sk_reuseport) {
49             tb->fastreuseport = FASTREUSEPORT_ANY;
50             tb->fastuid = uid;
51             tb->fast_rcv_saddr = sk->sk_rcv_saddr;
52             tb->fast_ipv6_only = ipv6_only_sock(sk);
53 #if IS_ENABLED(CONFIG_IPV6)
54             tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
55 #endif
56         } else {
57             tb->fastreuseport = 0;
58         }
59     } else {  /* bucket already has owners: only tighten or reset the existing state */
60         if (!reuse)
61             tb->fastreuse = 0;
62         if (sk->sk_reuseport) {
63             /* We didn't match or we don't have fastreuseport set on
64              * the tb, but we have sk_reuseport set on this socket
65              * and we know that there are no bind conflicts with
66              * this socket in this tb, so reset our tb's reuseport
67              * settings so that any subsequent sockets that match
68              * our current socket will be put on the fast path.
69              *
70              * If we reset we need to set FASTREUSEPORT_STRICT so we
71              * do extra checking for all subsequent sk_reuseport
72              * socks.
73              */
74             if (!sk_reuseport_match(tb, sk)) {
75                 tb->fastreuseport = FASTREUSEPORT_STRICT;
76                 tb->fastuid = uid;
77                 tb->fast_rcv_saddr = sk->sk_rcv_saddr;
78                 tb->fast_ipv6_only = ipv6_only_sock(sk);
79 #if IS_ENABLED(CONFIG_IPV6)
80                 tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
81 #endif
82             }
83         } else {
84             tb->fastreuseport = 0;
85         }
86     }
87     if (!inet_csk(sk)->icsk_bind_hash)  /* socket has no bind bucket recorded yet */
88         inet_bind_hash(sk, tb, port);
89     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
90     ret = 0;
91 
92 fail_unlock:  /* shared exit: the success path falls through here with ret == 0 */
93     spin_unlock_bh(&head->lock);
94     return ret;
95 }

 

posted @ 2017-10-03 01:51  AlexAlex  阅读(2499)  评论(0编辑  收藏  举报