Linux kernel 之 socket 创建过程分析
-
重要结构体
-
struct socket
结构体
// 普通的 BSD 标准 socket 结构体
// socket_state: socket 状态, 连接?不连接?
// type: socket type (%SOCK_STREAM, etc)
// flags: socket flags (%SOCK_NOSPACE, etc)
// ops: 专用协议的socket的操作
// file: 与socket 有关的指针列表
// sk: 负责协议相关结构体,这样就让这个这个结构体和协议分开。
// wq: 等待队列
struct socket {
socket_state state;
kmemcheck_bitfield_begin(type);
short type;
kmemcheck_bitfield_end(type);
unsigned long flags;
struct socket_wq __rcu *wq;
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
};
-
struct socket
的创建
// socket() 本质上是 glibc 中的函数,执行的实际上是 sys_socketcall() 系统调用。
// sys_socketcall() 几乎是所有的socket函数的入口,
// 也就是 bind,connect 等函数都是需要asmlinkage long sys_socketcall(int call, unsigned long __user *args); 、、sys_socketcall() 作为入口,函数如下:
// include/linux/syscalls.h
asmlinkage long sys_socketcall(int call, unsigned long __user *args);
// net/socket.c
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
unsigned long a[AUDITSC_ARGS];
unsigned long a0, a1;
int err;
unsigned int len;
if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
len = nargs[call];
if (len > sizeof(a))
return -EINVAL;
/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, len))
return -EFAULT;
err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
if (err)
return err;
a0 = a[0];
a1 = a[1];
// 判断,然后运行相对应的函数
switch (call) {
case SYS_SOCKET: // 这里就是 sys_socket(),
err = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT:
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_LISTEN:
err = sys_listen(a0, a1);
break;
// ... ...
default:
err = -EINVAL;
break;
}
return err;
}
// include/linux/syscalls.h
asmlinkage long sys_socket(int, int, int);
// net/socket.c
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;
struct socket *sock;
int flags;
/* Check the SOCK_* constants for consistency. */
BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
flags = type & ~SOCK_TYPE_MASK;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
type &= SOCK_TYPE_MASK;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
// 这里创建了 socket 结构体
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
// 与文件系统进行关联
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
sock_release(sock);
return retval;
}
-
sock_create()
函数
// include/linux/net.h
int sock_create(int family, int type, int proto, struct socket **res);
// net/socket.c
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
// include/linux/net.h
int __sock_create(struct net *net, int family, int type, int proto,
struct socket **res, int kern);
// net/socket.c
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
// 检查 协议族是否在范围呢
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX) // 检查类型
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/ // 检查用的是PF_INET 其实这个都是兼容的。
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
// 安全机制检查
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/ // ----> sock_alloc 接下面
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
// ... ...
return 0;
// ... ...
}
EXPORT_SYMBOL(__sock_create);
-
sock_alloc()
函数解析,被上面的__sock_create()
函数调用
// net/socket.c
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;
sock = SOCKET_I(inode);
kmemcheck_annotate_bitfield(sock, type);
inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO; // 模式
inode->i_uid = current_fsuid(); // 获取当前的uid
inode->i_gid = current_fsgid(); // 获取当前的gid
inode->i_op = &sockfs_inode_ops; // 操作
this_cpu_add(sockets_in_use, 1);
return sock;
}
// 申请一个 socket 结构体 ,名字为 sock
// 申请一个新的节点和一个新的 socket 项目, 绑定他们两个并且初始化
// 如果申请inode 失败返回 NULL, 或者返回sock
// 接下来我们再看到 SOCKET_I(inode);
// include/net/sock.h
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
// 然后我们发现,返回的是 inode 内的socket 结构体。
// 我们可以分析一个 container_of() 这个是怎么定义的。
// include/linux/kernel.h
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
// typeof 将 ptr 的指针临时保存起来为 __mptr
// 然后用这个 __mptr 指针减去下面的 member 的便宜量。
// 得到的就是 type 这个结构体的头指针。
// offsetof include/linux/stddef.h
#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
#else
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
#endif
// 反正这里有点难理解,最后得到的结果是 type 这个结构体的头指针。
// 所以说 SOCKET_I() 得到的是 struct socket_alloc 的头指针
// include/net/sock.h
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
-
回到 __sock_create() 继续分析
// net/socket.c --> __sock_create()
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
如果在 make menuconfig 中选上 编译成模块的选项,则会运行上面这个部分。
里面先是检查对应的协议族的操作表是否已经安装,如果没有安装则使用 request_module 进行安装,现在都是在 TCP/IP协议下进行分析,所以 family 是 AF_INET , 也就是 2 , 所以实际检查的全局变量是 net_families[2], 这个全局变量是在系统初始化时由 net/ipv4/af_inet.c 文件进行安装,具体代码是:
// net/ipv4/af_inet.c
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
// 各个协议的注册
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
* Add all the base protocols.
*/
// 各个协议的添加,添加不成功则报错
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
// 把这个关键性的链接表一个个注册上去
// ******************************************************
// inetsw_array 结构体数组数组, 这里面有包含每个的协议,比如说tcp_prot
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_REUSE,
},
// ... ...
}
// tcp_prot ---> net/ipv4/tcp_ipv4.c
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock, // 这是init 函数会在后面被调用
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
// ***********************************************************
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
// 各个协议模块的初始化
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
ip_tunnel_core_init();
rc = 0;
out:
return rc;
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
goto out;
}
fs_initcall(inet_init);
-
很粗浅的看完协议那一部分之后我们回到 __sock_create()
// net/socket.c
// 看到 这个回调函数的调用
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
// 先看一个 inet_protosw 结构体
// include/net/protocol.h
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot;
const struct proto_ops *ops;
unsigned char flags; /* See INET_PROTOSW_* below. */
};
// 上面的 create 函数对应的是 net/ipv4/af_inet.c 里面的 inet_create 函数
// net/ipv4/af_inet.c
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;
// 检查协议是否在范围之内
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
// 设置状态为未连接
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
// 遍历寻找请求的协议类型
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
// 遍历 inetsw[] 数组,其实就是次数而已
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
// 检查对应的协议,然后再选择合适的协议
/* Check the non-wild match. */
// 找到对应的协议,如果找到对应的协议,但是protocol 不是 IPPRORO_IP,则直接退出
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
// 如果没有对应的协议则返回错误码
err = -EPROTONOSUPPORT;
}
// 如果没有加载模块的保护措施
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
// 检查通用性,只有root 权限然后使用原始套接字
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
// 对socket 的操作集合进行了互联。
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
/* 此处调用sk_alloc分配一个struct sock,该结构体庞大,其作用是网络层对socket的表示,意思就是IP协议下有很多东西比如IP地址,网卡接口,端口等等信息需要再socket层中有所体现从而使编程者方便使用,然后就利用指针等形式把内容进行一定程度上的映射。sk_alloc首先对sock->proto和sock_creator进行设置,设置成当前协议对应的proto调用sk_prot_alloc()根据是否提供了slab缓存而判断是使用slab缓存还是通用缓存。只要分配成功,则调用sock_lock_init()对缓存进行初始化,主要是对sock锁、等待队列以及进程数据结构中的网络空间结构进行分配。初始化完了后调用sock_net_set()函数对网络空间结构进行记录,然后最后增加一个net计数器。至此回到inet_create,判断是否成功分配 */
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
// 返回一个 struct inet_sock 的指针给 inet
inet = inet_sk(sk);
// 判断是不是面向连通
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
// 判断是不是原始套接字,如果是,新建IP头部。
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
// 判断是否采用路径 MTU 发现算法
if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
// 进一步初始化结构体 sk (struct sock)
// sock_init_data: 初始化接收,发送,错误信息队列,三个队列都是双向链表,属于sk_buff_head 结构体,其中会把 sk_buff 结构体串联在一起,初始化数据包发送定时器,变量,(主要是函数指针)
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
// 这里,就是调用了协议里面的 init 函数 tcp_v4_init_sock
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
-
tcp_v4_init_sock
函数
static int tcp_v4_init_sock(struct sock *sk)
{
// 强制转换类型
struct inet_connection_sock *icsk = inet_csk(sk);
// 调用这个进行初始化 ,里面就时关于tcp 的一些初始化了,到此为止
tcp_init_sock(sk);
// ipv4 专用操作
icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
return 0;
}
-
到此, sock_create 分析完毕
-
最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
// net/socket.c
// 刚才分析完毕
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
// socket 映射到文件系统
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
if (retval < 0)
goto out_release;
// net/socket.c
static int sock_map_fd(struct socket *sock, int flags)
{
struct file *newfile;
int fd = get_unused_fd_flags(flags);
if (unlikely(fd < 0))
return fd;
// 申请一个 sock file 节点
newfile = sock_alloc_file(sock, flags, NULL);
if (likely(!IS_ERR(newfile))) {
fd_install(fd, newfile);
return fd;
}
put_unused_fd(fd);
return PTR_ERR(newfile);
}
// 这里所展现的意思是,把socket当成一个文件节点进行操作,open, read,write ,ioctl 等
Read The Fucking Source Code