Linux TCP/IP 协议栈之 Socket 的实现分析(一)
内核版本:2.6.37
参考[作者:kendo的文章(基于内涵版本2.6.12)]
第一部份 Socket套接字的创建
socket 并不是 TCP/IP协议的一部份。
从广义上来讲,socket 是Unix/Linux 抽像的进程间通讯的一种方法。网络 socket 通讯仅仅是其若干协议中的一类。而tcp/ip 又是网络这类中的一种。
从tcp/ip 的解度看 socket ,它更多地体现了用户 API 与协议栈的一个中间层接口层。用户通过调用socket API 将报文递交给协议栈,或者从协议栈中接收报文件。
一、系统总入口
Linux 内核为所有的与socket 有关的操作的API,提供了一个统一的系统调用入口,其代码在net/socket.c 中:
/* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[6]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_RECVMMSG) return -EINVAL; len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; audit_socketcall(nargs[call] / sizeof(unsigned long), a); a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = sys_listen(a0, a1); break; case SYS_ACCEPT: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = sys_send(a0, (void __user *)a1, a[2], a[3]); break; case SYS_SENDTO: err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = sys_recv(a0, (void __user *)a1, a[2], a[3]); break; case SYS_RECVFROM: err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMMSG: err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct timespec __user *)a[4]); break; case SYS_ACCEPT4: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; }
首先调用copy_from_user 将用户态参数拷贝至数组a 。但是问题在于,每个被调用的 API 的参数不尽相同,那么每次拷贝的字节在小如果断定?
来看其第三个参数nargs[call],其中 call 是操作码,后面有个大大的 switch...case就是判断它。对应的操作码定义在include/linux/net.h :
#define SYS_SOCKET 1 /* sys_socket(2) */ #define SYS_BIND 2 /* sys_bind(2) */ #define SYS_CONNECT 3 /* sys_connect(2) */ #define SYS_LISTEN 4 /* sys_listen(2) */ #define SYS_ACCEPT 5 /* sys_accept(2) */ #define SYS_GETSOCKNAME 6 /* sys_getsockname(2) */ #define SYS_GETPEERNAME 7 /* sys_getpeername(2) */ #define SYS_SOCKETPAIR 8 /* sys_socketpair(2) */ #define SYS_SEND 9 /* sys_send(2) */ #define SYS_RECV 10 /* sys_recv(2) */ #define SYS_SENDTO 11 /* sys_sendto(2) */ #define SYS_RECVFROM 12 /* sys_recvfrom(2) */ #define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ #define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ #define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ #define SYS_SENDMSG 16 /* sys_sendmsg(2) */ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ #define SYS_ACCEPT4 18 /* sys_accept4(2) */ #define SYS_RECVMMSG 19 /* sys_recvmmsg(2) */
而数组nargs则根据操作码的不同,计算对应的参数的空间大小:
/* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[20] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5) }; #undef AL
当拷贝完成参数后,就进入一个switch...case... 判断操作码,跳转至对应的系统接口。
二、 sys_socket 函数
当用户空间要创建一个socke 接口时,会调用 API 函数:
int socket(int domain, int type, int protocol);
函数,其三个参数分别表示协议族、协议类型(面向连接或无连接)以及协议。
协议族:
/* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_AX25 3 /* Amateur Radio AX.25 */ #define AF_IPX 4 /* Novell IPX */ #define AF_APPLETALK 5 /* AppleTalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ #define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ #define AF_DECnet 12 /* Reserved for DECnet project */ #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define AF_KEY 15 /* PF_KEY key management API */ #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ #define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_IUCV 32 /* IUCV sockets */ #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ #define AF_MAX 38 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC #define PF_UNIX AF_UNIX #define PF_LOCAL AF_LOCAL #define PF_INET AF_INET #define PF_AX25 AF_AX25 #define PF_IPX AF_IPX #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE #define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE #define PF_DECnet AF_DECnet #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY AF_KEY #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC #define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IUCV AF_IUCV #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF #define PF_MAX AF_MAX
协议类型:
enum sock_type { SOCK_STREAM = 1, SOCK_DGRAM = 2, SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, SOCK_DCCP = 6, SOCK_PACKET = 10, };
socket创建通过操作码SYS_SOCKET是由sys_socket() 实现的:
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { int retval; struct socket *sock; int flags; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) goto out; retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); if (retval < 0) goto out_release; out: /* It may be already another descriptor 8) Not kernel problem. */ return retval; out_release: sock_release(sock); return retval; }
这段代码做了两件事:
1> 分配 sock 与sk,协议簇的协议封装;
2> sock 面向上层系统调用,主要是与文件系统交互。
通过进程的current指针的files,结合创建socket时返回的文件描符述,可以找到内核中对应的struct file,再根据file的f_dentry可以找到对应的目录项,而目录项struct dentry中,有d_inode指针,指向与sock封装在一起的inode。
sock又与sk指针互指,一一对应。
三、 协议簇的协议封装
int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { static int warned; if (!warned) { warned = 1; printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); } family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ sock = sock_alloc(); if (!sock) { if (net_ratelimit()) printk(KERN_WARNING "socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (net_families[family] == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put; /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create);
上面这个函数主要做了三件事:
1> sock_alloc()
在分析这个函数前,首先要了解:为了对 socket 抽像出文件的概念,内核中为socket定义了一个专门的文件系统类型sockfs。
static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .mount = sockfs_mount, .kill_sb = kill_anon_super, };
在模块初始化的时候,安装该文件系统:
static int __init sock_init(void) { /* * Initialize sock SLAB cache. */ sk_init(); /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. */ init_inodecache(); register_filesystem(&sock_fs_type); sock_mnt = kern_mount(&sock_fs_type); /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER netfilter_init(); #endif #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING skb_timestamping_init(); #endif return 0; } core_initcall(sock_init); /* early initcall */
文件系统安装中的一个重要步骤kern_mount->kern_mount_data->vfs_kern_mount:
vfs_kern_mount函数中,先根据注册的文件系统类型,如果文件系统本身有mount成员函数则调用之,没则调用它的get_sb成员函数指针,获取相应的超级块sb 。最后,调置文件系统的超级块成员指针,使之指向对应的值。
其中sockfs文件系统的mount函数调用mount_pseudo()实现超级块的初始化,跟节点inode和目录下dentry创建,sockfs_ops这里关联上文件系统。
那前面提到的new_inode()函数分配inode 时调用的: sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);
static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .destroy_inode = sock_destroy_inode, .statfs = simple_statfs, };
这个alloc_inode函数指针也就是sockfs_ops的sock_alloc_inode()函数。
static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL); if (!ei->socket.wq) { kmem_cache_free(sock_inode_cachep, ei); return NULL; } init_waitqueue_head(&ei->socket.wq->wait); ei->socket.wq->fasync_list = NULL; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; ei->socket.sk = NULL; ei->socket.file = NULL; return &ei->vfs_inode; }
函数先分配了一个用于封装socket和inode的ei ,然后在高速缓存中为之申请了一块空间。这样,inode和socket就同时都被分配了。接下来初始化socket的各个成员。
struct socket_alloc { struct socket socket; struct inode vfs_inode; };
显而易见,该结构实现了inode和socket的封装。已经通过new_inode从sockfs文件系统分配一个inode,可以通过宏SOCKET_I来获取与之对应的socket:
sock = SOCKET_I(inode);
分配inode、socket 以及两者如何关联,都已一一分析了。
2> pf = rcu_dereference(net_families[family]);
net_families[family]的定义:
static const struct net_proto_family *net_families[NPROTO] __read_mostly;
net_proto_family的定义:
struct net_proto_family { int family; int (*create)(struct net *net, struct socket *sock, int protocol, int kern); struct module *owner; };
net_families数组填充函数sock_register():
/** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family coresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (net_families[ops->family]) err = -EEXIST; else { net_families[ops->family] = ops; err = 0; } spin_unlock(&net_family_lock); printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); return err; } EXPORT_SYMBOL(sock_register);
从这里我们看出每个协议族都是通过sock_register函数注册到net_families数组中,通过代码搜索发现每个协议族都会调用这个函数去注册。
Af_ax25.c (net\ax25): sock_register(&ax25_family_ops); Af_bluetooth.c (net\bluetooth): err = sock_register(&bt_sock_family_ops); Af_can.c (net\can): sock_register(&can_family_ops); Af_decnet.c (net\decnet): sock_register(&dn_family_ops); Af_econet.c (net\econet): sock_register(&econet_family_ops); Af_ieee802154.c (net\ieee802154): rc = sock_register(&ieee802154_family_ops); Af_inet.c (net\ipv4): (void)sock_register(&inet_family_ops); Af_inet6.c (net\ipv6): err = sock_register(&inet6_family_ops); Af_ipx.c (net\ipx): sock_register(&ipx_family_ops); Af_irda.c (net\irda): rc = sock_register(&irda_family_ops); Af_iucv.c (net\iucv): err = sock_register(&iucv_sock_family_ops); Af_key.c (net\key): err = sock_register(&pfkey_family_ops); Af_llc.c (net\llc): rc = sock_register(&llc_ui_family_ops); Af_netlink.c (net\netlink): sock_register(&netlink_family_ops); Af_netrom.c (net\netrom): if (sock_register(&nr_family_ops)) { Af_packet.c (net\packet): sock_register(&packet_family_ops); Af_phonet.c (net\phonet): err = sock_register(&phonet_proto_family); Af_rds.c (net\rds): ret = sock_register(&rds_family_ops); Af_rose.c (net\rose): sock_register(&rose_family_ops); Af_rxrpc.c (net\rxrpc): ret = sock_register(&rxrpc_family_ops); Af_unix.c (net\unix): sock_register(&unix_family_ops); Af_x25.c (net\x25): rc = sock_register(&x25_family_ops); Caif_socket.c (net\caif): int err = sock_register(&caif_family_ops); Ddp.c (net\appletalk): (void)sock_register(&atalk_family_ops); Net.h (include\linux):extern int sock_register(const struct net_proto_family *fam); Pppox.c (drivers\net): return sock_register(&pppox_proto_family); Pvc.c (net\atm): return sock_register(&pvc_family_ops); Socket.c (drivers\isdn\misdn): err = sock_register(&mISDN_sock_family_ops); Socket.c (net): * sock_register - add a socket protocol handler Socket.c (net):int sock_register(const struct net_proto_family *ops) Socket.c (net):EXPORT_SYMBOL(sock_register); Socket.c (net\tipc): res = sock_register(&tipc_family_ops); Svc.c (net\atm): return sock_register(&svc_family_ops);
本文主要分析的ipv4协议族,所以我们参考的文件af_inet.c(net/ipv4)。
3> err = pf->create(net, sock, protocol, kern);
在af_inet.c里面inet_init函数里面调用sock_register注册到协议族数组net_families里:
(void)sock_register(&inet_family_ops);
接着看inet_family_ops定义:
static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, };
这里的inet_create就是程序调用的函数:
/* * Create an inet socket. */ static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; int err; if (unlikely(!inet_ehash_secret)) if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) build_ehash_secret(); sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) goto out_rcu_unlock; err = -EAFNOSUPPORT; if (!inet_netns_ok(net, protocol)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(answer_prot->slab == NULL); err = -ENOBUFS; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); if (sk == NULL) goto out; err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = 1; inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; inet->nodefrag = 0; if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } if (ipv4_config.no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; inet->inet_id = 0; sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_all = 1; inet->mc_index = 0; inet->mc_list = NULL; sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ sk->sk_prot->hash(sk); } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) sk_common_release(sk); } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; }
在分析inet_create()函数前,就要分析inetsw[SOCK_MAX]这个数组。
static struct list_head inetsw[SOCK_MAX];
这个数组是在inet_init()->inet_register_protosw()里面填充的。
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q);
inetsw_array定义:
/* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
inet_register_protosw函数分析:
void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol) break; last_perm = lh; } answer = NULL; } if (answer) goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: printk(KERN_ERR "Attempt to override permanent protocol %d.\n", protocol); goto out; out_illegal: printk(KERN_ERR "Ignoring attempt to register invalid socket type %d.\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw);
这个函数完成的工作,就是把inetsw_array 数组中,相同的协议类型(protocol成员)下边的协议,加入到inetsw 对应的协议类型的链表中去。
因为事实上一对一的关系,所以这个函数要简单得多:
因为不存在其它成员,所以每一次 list_entry 都为空值,所以不存在覆盖和追加的情况,直接调用list_add_rcu(&p->list, last_perm);
把协议类型节点(struct inet_protosw 类型的数组的某个元素)添加到链表(链表首部本身是一个数组,数组索引是协议对应的协议类型的值的第一个成员。
继续分析inet_create()函数:
首先,根据sock的成员protocol,把之前在链表中注册的协议节点找出。
然后,将创建的socket 的ops 函数指针集,指向协议类型的例如创建的是SOCK_STREAM,那么就指向了inet_stream_ops; answer_prot 指针指向了当前要创建的socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的tcp_prot结构。
接着, 接下来一个重要的工作,就是为socket分配一个sock,并初始化它。
最后,初始化一个 inet 。
虽然create 的代码就到这儿了,不过要说清楚sk(socK)的分配,还得费上大力气。
每一个Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为sock),但是同时又一个struct sock 结构(内核中一般使用名称为sk),两者之间是一一对应的关系。
在后面的sock_init_data 函数中,可以看到:
sk->sk_socket = sock;
sock->sk = sk;
socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。
设计者把socket套接字中,与文件系统关系比较密切的那一部份放在socket结构中,而把与通信关系比较密切的那一部份,则单独成为 一个数结结构,那就是sock 结构。
由于这两部份逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。
调用sk_alloc()分配一个sk:
在之前proto_register()函数创建的高速缓存中申请分配一个slab缓存项,并清零。然后设置协议族、并把sk中的sk_prot与对应的协议关联起来。
分配完成sk后,另一个重要的功能就是初始化它,
sk的成员相当复杂,其主要的初始化工作是在函数sock_init_data()中完成的:
sock 结构中,有三个重要的双向队列,分别是 sk_receive_queue、sk_write_queue 和sk_error_queue。从它们的名字就可以看出来其作用了。
队列并非采用通用的list_head来维护,而是使用skb_buffer队列:
struct sk_buff_head { /* These two members must be first. */ struct sk_buff *next; struct sk_buff *prev; __u32 qlen; spinlock_t lock; };
这样,队列中指向的每一个skb_buffer,就是一个数据包,分别是接收、发送和投递错误。
inet 初始化:
inet 是一个struct inet_sock 结构类型,来看它的定义:
struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; …… }
只留意它的第一个成员就足够了。
我们说sock 是面向用户态调用,而sk是面向内核驱动调用的,那sk是如何与协议栈交互的呢?
对于每一个类型的协议,为了与sk联系起来,都定义了一个struct XXX_sock 结构,XXX是协议名,例如:
struct tcp_sock { /* inet_sock has to be the first member of tcp_sock */ struct inet_sock inet; int tcp_header_len; /* Bytes of tcp header to send */ …… }
很明显,它们的结构定构是“af_inet 一般属性+ 自己的私有属性” ,因为它们的第一个成员总是inet 。
现在回头来照一下起初在af_inet.c中,封装协议注册proto_register()的时候,size成员,对于tcp而言:
struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, ... .obj_size = sizeof(struct tcp_sock), ... };
其它协议类似。
以obj_size 来确定每个 slab 缓存项分配的大小,所以,我们就可说,每次申请分配的,实际上是一个struct XXX_sock 结构大小的结构。因为都是定义于上层结构的第一个成员,可以使用强制类型转换来使用这块分配的内存空间。例如:
struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct ipv6_pinfo *pinet6; #endif /* Socket demultiplex comparisons on incoming packets. */ __be32 inet_daddr; __be32 inet_rcv_saddr; __be16 inet_dport; __u16 inet_num; __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; __be16 inet_sport; __u16 inet_id; ... }; inet = inet_sk(sk); static inline struct inet_sock *inet_sk(const struct sock *sk) { return (struct inet_sock *)sk; //inet_sock->sk }
struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ ... }; struct tcp_sock *tp = tcp_sk(sk); static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; //tcp_sock->inet_conn->icsk_inet->sk }
inet_create()运行完,一个 socket 套接字基本上就创建完毕了,剩下的就是与文件系统挂钩。
四、与文件系统交互
sys_socket()函数中来,它在调用完sock_create()后,紧接着调用sock_map_fd()函数:
int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = sock_alloc_file(sock, &newfile, flags); if (likely(fd >= 0)) fd_install(fd, newfile); return fd; } EXPORT_SYMBOL(sock_map_fd);
这个函数的核心思想,在一开始,就已经分析过了。
从进程的角度来讲,一个 socket 套接字就是一个特殊的,已打开的文件。
前面分配好一个socket后,这里要做的就是将它与文件系统拉上亲戚关系。
首先获取一个空闲的文件描述符号和file结构。然后在文件系统中分配一个目录项(d_alloc),使其指向已经分配的inode节点(d_add),然后把其目录项挂在sockfs文件系统的根目录之下。
并且把目录项的指针d_op设置成指向 sockfs_dentry_operati,这个数据结构通过函数指针提供他与文件路径有关的操作:
static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, };
最后一步,就是将file结构中的f_op和sock结构中的i_fop都指向socket_file_ops,它是一个函数指针集,指向了socket面向文件系统的用户态调用的一些接口函数:
/* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. */ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .aio_read = sock_aio_read, .aio_write = sock_aio_write, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .mmap = sock_mmap, .open = sock_no_open, /* special open code to disallow open via /proc */ .release = sock_close, .fasync = sock_fasync, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, };
到这里,整个socket 套接字的创建工作,就宣告完成了。
posted on 2014-04-29 10:27 CSlunatic 阅读(1958) 评论(0) 编辑 收藏 举报