Linux kernel 之 socket 创建过程分析

  • 重要结构体

  • struct socket 结构体

	// 普通的 BSD 标准 socket 结构体
	// state: socket 状态 (类型为 socket_state, 如 SS_CONNECTED 已连接、SS_UNCONNECTED 未连接 等)
	// type: socket type (%SOCK_STREAM, etc)
	// flags: socket flags (%SOCK_NOSPACE, etc)
	// ops: 专用协议的socket的操作
	// file: 指向与该 socket 关联的 struct file 的指针
	// sk: 负责协议相关结构体,这样就让这个结构体和协议分开。
	// wq: 等待队列
	/*
	 * struct socket - general BSD-style socket object (as quoted in this article).
	 * Holds the generic, protocol-independent state and points at the
	 * protocol-specific part through @sk.
	 */
	struct socket {
 	   socket_state        state;	/* socket state (connected / not connected) */

 	   kmemcheck_bitfield_begin(type);	/* NOTE(review): presumably kmemcheck annotation markers around `type` -- confirm */
 	   short           type;	/* socket type (SOCK_STREAM, etc) */
 	   kmemcheck_bitfield_end(type);

 	   unsigned long       flags;	/* socket flags (SOCK_NOSPACE, etc) */

  	  struct socket_wq __rcu  *wq;	/* wait queue */

  	  struct file     *file;	/* file associated with this socket */
  	  struct sock     *sk;	/* protocol-related socket structure */
  	  const struct proto_ops  *ops;	/* protocol-specific socket operations */
	};
  • struct socket 的创建

	// socket() 本质上是 glibc 中的函数;在 i386 等使用 socketcall 多路复用的体系结构上,它实际执行的是 sys_socketcall() 系统调用。
	// 在这些体系结构上,sys_socketcall() 几乎是所有 socket 函数的入口,
	// 也就是说 bind、connect 等函数都以 sys_socketcall() 作为入口,函数如下:

	// include/linux/syscalls.h
	asmlinkage long sys_socketcall(int call, unsigned long __user *args);
	// net/socket.c
	/*
	 * socketcall - single entry point that multiplexes the socket calls.
	 * @call: which socket operation to run (SYS_SOCKET, SYS_BIND, ...)
	 * @args: user-space array holding the arguments of that operation
	 *
	 * Copies the arguments in from user space, passes them to the audit
	 * hook and then dispatches on @call.  Returns the handler's result,
	 * -EINVAL for an unknown call or oversized argument block, -EFAULT if
	 * the copy from user space fails.
	 */
	SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
	{
    	unsigned long a[AUDITSC_ARGS];
    	unsigned long a0, a1;
    	int err;
    	unsigned int len;

    	/* Reject call numbers outside 1..SYS_SENDMMSG. */
    	if (call < 1 || call > SYS_SENDMMSG)
        	return -EINVAL;

    	/* nargs[call] is used as the byte count to copy; it must fit in a[]. */
    	len = nargs[call];
    	if (len > sizeof(a))
        	return -EINVAL;

    	/* copy_from_user should be SMP safe. */
    	if (copy_from_user(a, args, len))
        	return -EFAULT;

    	/* The audit hook receives the argument count, not the byte count. */
    	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
    	if (err)
    	    return err;

    	a0 = a[0];
    	a1 = a[1];
    	// Dispatch to the handler that matches the call number
    	switch (call) {
    	case SYS_SOCKET:  // socket() ends up here: sys_socket()
    	    err = sys_socket(a0, a1, a[2]);
    	    break;
    	case SYS_BIND:
    	    err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
    	    break;
    	case SYS_CONNECT:
    	    err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
    	    break;
    	case SYS_LISTEN:
    	    err = sys_listen(a0, a1);
    	    break;
    	// ... ... (remaining cases are elided in this excerpt)
    	default:
    	    err = -EINVAL;
    	    break;
    	}
    	return err;
	}
	// include/linux/syscalls.h
	asmlinkage long sys_socket(int, int, int);
	// net/socket.c
	/*
	 * socket - create a socket and attach it to the file system.
	 * @family:   protocol family
	 * @type:     socket type, optionally OR-ed with SOCK_CLOEXEC / SOCK_NONBLOCK
	 * @protocol: protocol number handed on to sock_create()
	 *
	 * Returns the value produced by sock_map_fd() on success, or a negative
	 * error code (-EINVAL for unknown flag bits, otherwise whatever
	 * sock_create() / sock_map_fd() reported).
	 */
	SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
	{
    	int retval;
    	struct socket *sock;
    	int flags;

    	/* Check the SOCK_* constants for consistency.  */
    	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    	/* Split @type into its flag bits and the bare socket type. */
    	flags = type & ~SOCK_TYPE_MASK;
    	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    	    return -EINVAL;
    	type &= SOCK_TYPE_MASK;

    	/* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
    	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
    	    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
    	// The struct socket is created here
    	retval = sock_create(family, type, protocol, &sock);
    	if (retval < 0)
    	    goto out;
    	// Associate the socket with the file system
    	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    	if (retval < 0)
    	    goto out_release;

	out:
    	/* It may be already another descriptor 8) Not kernel problem. */
    	return retval;

	/* Mapping failed: drop the socket that was just created. */
	out_release:
	    sock_release(sock);
	    return retval;
	}
  • sock_create() 函数

	// include/linux/net.h
	int sock_create(int family, int type, int proto, struct socket **res);

	// net/socket.c
	/*
	 * sock_create - create a socket in the calling task's network namespace.
	 *
	 * Thin wrapper: forwards to __sock_create() with
	 * current->nsproxy->net_ns as the namespace and 0 for the kern argument,
	 * and returns its result unchanged.
	 */
	int sock_create(int family, int type, int protocol, struct socket **res)
	{
    	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
	}
	EXPORT_SYMBOL(sock_create);

	// include/linux/net.h 
	int __sock_create(struct net *net, int family, int type, int proto,
          struct socket **res, int kern);
	// net/socket.c
	/*
	 * __sock_create - validate the request and allocate a struct socket.
	 * @net:      network namespace the socket is created in
	 * @family:   protocol family, must be in [0, NPROTO)
	 * @type:     socket type, must be in [0, SOCK_MAX)
	 * @protocol: protocol number
	 * @res:      output pointer for the new socket
	 * @kern:     forwarded to the security hook
	 *
	 * This listing is an excerpt: the part after the allocation is elided.
	 * Visible error returns: -EAFNOSUPPORT, -EINVAL, the security hook's
	 * error, and -ENFILE when no socket can be allocated.
	 */
	int __sock_create(struct net *net, int family, int type, int protocol,
            	 struct socket **res, int kern)
	{
	    int err;
	    struct socket *sock;
	    const struct net_proto_family *pf;

	    /*
	     *      Check protocol is in range
	     */
		// Check that the protocol family is within range
	    if (family < 0 || family >= NPROTO)
	        return -EAFNOSUPPORT;
	    if (type < 0 || type >= SOCK_MAX)   // validate the socket type
	        return -EINVAL;

	    /* Compatibility.

	       This uglymoron is moved from INET layer to here to avoid
	       deadlock in module load.
	     */    // The obsolete (PF_INET, SOCK_PACKET) pair is still accepted and remapped to PF_PACKET.
	    if (family == PF_INET && type == SOCK_PACKET) {
	        static int warned;	/* print the notice only once */
	        if (!warned) {
	            warned = 1;
	            pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
	                current->comm);
	        }
	        family = PF_PACKET;
	    }
	    // Security hook check
	    err = security_socket_create(family, type, protocol, kern);
	    if (err)
	        return err;

	    /*
	     *  Allocate the socket and allow the family to set things up. if
	     *  the protocol is 0, the family is instructed to select an appropriate
	     *  default.
	     */  // ----> sock_alloc(), examined in the next section
	    sock = sock_alloc();
	    if (!sock) {
	        net_warn_ratelimited("socket: no more sockets\n");
	        return -ENFILE; /* Not exactly a match, but its the
	                   closest posix thing */
	    }

	    sock->type = type;

		// ... ... (code elided in this excerpt)
	    return 0;
       // ... ... (code elided in this excerpt)
	}
	EXPORT_SYMBOL(__sock_create);
  • sock_alloc() 函数解析,被上面的 __sock_create() 函数调用

	// net/socket.c
	/*
	 * sock_alloc - allocate an inode together with its struct socket.
	 *
	 * Obtains a new inode from the superblock of sock_mnt, looks up the
	 * struct socket paired with it via SOCKET_I(), and initialises the
	 * inode's number, mode, owner and operations.
	 *
	 * Returns the socket, or NULL if the inode could not be allocated.
	 */
	static struct socket *sock_alloc(void)
	{
	    struct inode *inode;
	    struct socket *sock;

	    inode = new_inode_pseudo(sock_mnt->mnt_sb);
	    if (!inode)
	        return NULL;

	    /* The socket lives in the same struct socket_alloc as the inode. */
	    sock = SOCKET_I(inode);

	    kmemcheck_annotate_bitfield(sock, type);
	    inode->i_ino = get_next_ino();
	    inode->i_mode = S_IFSOCK | S_IRWXUGO; // mode: socket file type, rwx for user/group/other
	    inode->i_uid = current_fsuid();  // uid of the current task
	    inode->i_gid = current_fsgid();  // gid of the current task
	    inode->i_op = &sockfs_inode_ops; // inode operations

	    /* Account for the new socket in the sockets_in_use counter. */
	    this_cpu_add(sockets_in_use, 1);
	    return sock;
	}
	// 申请一个 socket 结构体 ,名字为 sock
	// 申请一个新的节点和一个新的 socket 项目, 绑定他们两个并且初始化
	// 如果申请inode 失败返回 NULL, 或者返回sock  
	// 接下来我们再看到 SOCKET_I(inode);
	// include/net/sock.h
	/*
	 * SOCKET_I - map an inode to its struct socket.
	 *
	 * Recovers the enclosing struct socket_alloc from its vfs_inode member
	 * with container_of() and returns the address of its socket member.
	 */
	static inline struct socket *SOCKET_I(struct inode *inode)
	{
    	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
	}
	// 然后我们发现,返回的是与该 inode 位于同一个 struct socket_alloc 中的 socket 结构体(socket 并不在 inode 内部)。
	
	// 我们可以分析一下 container_of() 是怎么定义的。
	// include/linux/kernel.h
	/*
	 * container_of - get a pointer to the structure that contains a member.
	 * @ptr:    pointer to the member
	 * @type:   type of the containing structure
	 * @member: name of the member inside @type
	 *
	 * __mptr is a copy of @ptr typed as a pointer to the member, so a
	 * mismatched @ptr is caught by the compiler; subtracting the member's
	 * offset from it yields the start of the containing structure.
	 */
	#define container_of(ptr, type, member) ({          \
    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})
	//  先用 typeof 取得 member 的类型, 把 ptr 临时保存为该类型的指针 __mptr,
	//  然后用这个 __mptr 指针减去 member 在 type 中的偏移量,
	//  得到的就是 type 这个结构体的头指针。
	//  offsetof   include/linux/stddef.h
	/*
	 * offsetof - byte offset of MEMBER within TYPE.
	 * Uses the compiler's built-in when __compiler_offsetof is defined;
	 * otherwise takes the address of MEMBER in a TYPE placed at address 0.
	 */
	#undef offsetof
	#ifdef __compiler_offsetof
	#define offsetof(TYPE, MEMBER)  __compiler_offsetof(TYPE, MEMBER)
	#else
	#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
	#endif
	                                                                          
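	// 这里有点难理解: container_of() 最后得到的结果是 type 这个结构体的头指针。
	
	// 所以 SOCKET_I() 先通过 vfs_inode 成员求出 struct socket_alloc 的头指针, 再返回其中 socket 成员的地址; 由于 socket 是第一个成员, 这个地址与 struct socket_alloc 的头指针相同。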
	// include/net/sock.h
	/*
	 * struct socket_alloc - a struct socket and its VFS inode in one structure.
	 * SOCKET_I() relies on this layout to get from the inode to the socket.
	 */
	struct socket_alloc {
    	struct socket socket;	/* the socket itself */
    	struct inode vfs_inode;	/* the inode paired with the socket */
	};
  • 回到 __sock_create() 继续分析

	// net/socket.c   --> __sock_create()
	#ifdef CONFIG_MODULES                                                           
	    /* Attempt to load a protocol module if the find failed.                    
	     *                                                                          
	     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the 	user        
	     * requested real, full-featured networking support upon 	configuration.         
	     * Otherwise module support will break!                                     
	     */                                                                         
	    if (rcu_access_pointer(net_families[family]) == NULL)                       
	        request_module("net-pf-%d", family);                                    
	#endif                                                                          
	如果在内核配置 (make menuconfig) 中启用了可加载模块支持 (CONFIG_MODULES), 则会编译并运行上面这个部分。
	里面先是检查对应的协议族的操作表是否已经安装, 如果没有安装则通过 request_module("net-pf-%d", family) 请求加载对应的协议族模块。现在都是在 TCP/IP 协议下进行分析, 所以 family 是 AF_INET, 也就是 2, 所以实际检查的全局变量是 net_families[2]。这个全局变量是在系统初始化时由 net/ipv4/af_inet.c 文件进行安装, 具体代码是:
	// net/ipv4/af_inet.c
	static int __init inet_init(void)                                               
	{                                                                               
	    struct inet_protosw *q;                                                     
	    struct list_head *r;                                                        
	    int rc = -EINVAL;                                                           
	                                                                                
	    sock_skb_cb_check_size(sizeof(struct inet_skb_parm));                       
	    // 各个协议的注册 
	    rc = proto_register(&tcp_prot, 1);                                          
	    if (rc)                                                                     
	        goto out;                                                               
	                                                                                
	    rc = proto_register(&udp_prot, 1);                                          
	    if (rc)                                                                     
	        goto out_unregister_tcp_proto;                                          
	                                                                                
	    rc = proto_register(&raw_prot, 1);                                          
	    if (rc)                                                                     
	        goto out_unregister_udp_proto;                                          
	                                                                                
	    rc = proto_register(&ping_prot, 1);                                         
	    if (rc)                                                                     
	        goto out_unregister_raw_proto;                                          
	                                                                                
	    /*                                                                          
	     *  Tell SOCKET that we are alive...                                        
	     */                                                                         
	
		    (void)sock_register(&inet_family_ops);                                      
	                                                                                
	#ifdef CONFIG_SYSCTL                                                            
	    ip_static_sysctl_init();                                                    
	#endif                                                                          
	                                                                                
	    /*                                                                          
	     *  Add all the base protocols.                                             
	     */                                                                         
	    // 各个协议的添加,添加不成功则报错 
	    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)                    
	        pr_crit("%s: Cannot add ICMP protocol\n", __func__);                    
	    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)                      
	        pr_crit("%s: Cannot add UDP protocol\n", __func__);                     
	    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)                      
	        pr_crit("%s: Cannot add TCP protocol\n", __func__);                     
	#ifdef CONFIG_IP_MULTICAST                                                      
	    if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)                    
	        pr_crit("%s: Cannot add IGMP protocol\n", __func__);                    
	#endif                                                                          
	                                                                                
	    /* Register the socket-side information for inet_create. */                 
	    for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)                            
	        INIT_LIST_HEAD(r);                                                      
	    // 把这个关键性的链接表一个个注册上去
		// ******************************************************
		// inetsw_array 结构体数组数组, 这里面有包含每个的协议,比如说tcp_prot
		static struct inet_protosw inetsw_array[] =                                     
		{                                                                               
    		{                                                                           
        		.type =       SOCK_STREAM,                                              
        		.protocol =   IPPROTO_TCP,                                              
     	   		.prot =       &tcp_prot,                                                
      	 		.ops =        &inet_stream_ops,                                         
       		 	.flags =      INET_PROTOSW_PERMANENT |                                  
                  		INET_PROTOSW_ICSK,                                            
    		},                                                                          
                                                                                
    		{                                                                           
        		.type =       SOCK_DGRAM,                                               
        		.protocol =   IPPROTO_UDP,                                              
        		.prot =       &udp_prot,                                                
        		.ops =        &inet_dgram_ops,                                          
        		.flags =      INET_PROTOSW_PERMANENT,                                   
       		},                                                                       
                                                                                
       		{                                                                        
        		.type =       SOCK_DGRAM,                                               
        		.protocol =   IPPROTO_ICMP,                                             
        		.prot =       &ping_prot,                                               
        		.ops =        &inet_dgram_ops,                                          
        		.flags =      INET_PROTOSW_REUSE,                                       
       		},
			// ... ...
		} 
		
		// tcp_prot  ---> net/ipv4/tcp_ipv4.c
		struct proto tcp_prot = {                                                       
    		.name           = "TCP",                                                    
    		.owner          = THIS_MODULE,                                              
    		.close          = tcp_close,                                                
    		.connect        = tcp_v4_connect,                                               
    		.disconnect     = tcp_disconnect,                                               
    		.accept         = inet_csk_accept,                                              
    		.ioctl          = tcp_ioctl,                                                    
    		.init           = tcp_v4_init_sock,    // 这是init 函数会在后面被调用
    		.destroy        = tcp_v4_destroy_sock,                                          
    		.shutdown       = tcp_shutdown,                                                 
    		.setsockopt     = tcp_setsockopt,                                               
    		.getsockopt     = tcp_getsockopt,                                               
    		.recvmsg        = tcp_recvmsg,                                                  
    		.sendmsg        = tcp_sendmsg,                                                  
    		.sendpage       = tcp_sendpage,                                                 
    		.backlog_rcv        = tcp_v4_do_rcv,                                            
    		.release_cb     = tcp_release_cb,                                               
    		.hash           = inet_hash,                                                    
    		.unhash         = inet_unhash,                                                  
    		.get_port       = inet_csk_get_port,                                            
    		.enter_memory_pressure  = tcp_enter_memory_pressure,                            
    		.stream_memory_free = tcp_stream_memory_free,                                   
    		.sockets_allocated  = &tcp_sockets_allocated,                                   
    		.orphan_count       = &tcp_orphan_count,                                        
    		.memory_allocated   = &tcp_memory_allocated,                                    
    		.memory_pressure    = &tcp_memory_pressure,                                 
    		.sysctl_mem     = sysctl_tcp_mem,                                           
    		.sysctl_wmem        = sysctl_tcp_wmem,                                      
    		.sysctl_rmem        = sysctl_tcp_rmem,                                      
    		.max_header     = MAX_TCP_HEADER,                                           
    		.obj_size       = sizeof(struct tcp_sock),                                  
    		.slab_flags     = SLAB_DESTROY_BY_RCU,                                      
    		.twsk_prot      = &tcp_timewait_sock_ops,                                   
    		.rsk_prot       = &tcp_request_sock_ops,                                    
    		.h.hashinfo     = &tcp_hashinfo,                                            
    		.no_autobind        = true,                                                 
		#ifdef CONFIG_COMPAT                                                            
    		.compat_setsockopt  = compat_tcp_setsockopt,                                
    		.compat_getsockopt  = compat_tcp_getsockopt,                                
		#endif                                                                          
		#ifdef CONFIG_MEMCG_KMEM                                                        
    		.init_cgroup        = tcp_init_cgroup,                                      
		    .destroy_cgroup     = tcp_destroy_cgroup,                                   
    		.proto_cgroup       = tcp_proto_cgroup,                                     
		#endif                                                                          
		};                                                                              
		EXPORT_SYMBOL(tcp_prot); 
		// ***********************************************************
	    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)            
	        inet_register_protosw(q);                                               
	    
		// 各个协议模块的初始化 
	    /*                                                                          
	     *  Set the ARP module up                                                   
	     */                                                                         
	                                                                                
	    arp_init();                                                                 
	                                                                                
	    /*                                                                          
	     *  Set the IP module up                                                    
	     */                                                                         
	                                                                                
	    ip_init();                                                                  
	                                                                                
	    tcp_v4_init();                                                              
	                                                                                
	    /* Setup TCP slab cache for open requests. */                               
	    tcp_init();                                                                 
                                                                                
	    /* Setup UDP memory threshold */                                            
	    udp_init();                                                                 
	                                                                                
	    /* Add UDP-Lite (RFC 3828) */                                               
	    udplite4_register();                                                        
                                                                          
	    ping_init();                                                                
	                                                                                
	    /*                                                                          
	     *  Set the ICMP layer up                                                   
	     */                                                                         
                                                                                
	    if (icmp_init() < 0)                                                        
	        panic("Failed to create the ICMP control socket.\n");                   
	                                                                                
	    /*                                                                          
	     *  Initialise the multicast router                                         
	     */                                                                         
	#if defined(CONFIG_IP_MROUTE)                                                   
	    if (ip_mr_init())                                                           
	        pr_crit("%s: Cannot init ipv4 mroute\n", __func__);                     
	#endif                                                                          
	                                                                                
	    if (init_inet_pernet_ops())                                                 
	        pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);            
	    /*                                                                          
	     *  Initialise per-cpu ipv4 mibs                                            
	     */                                                                         
                                                                                
	    if (init_ipv4_mibs())                                                       
	        pr_crit("%s: Cannot init ipv4 mibs\n", __func__);                       
	                                                                                
	    ipv4_proc_init();                                                           
	                                                                                
	    ipfrag_init();                                                              
	                                                                                
	    dev_add_pack(&ip_packet_type);                                              
	                                                                                
	    ip_tunnel_core_init();                                                      
                                                                                
	    rc = 0;                                                                     
	out:                                                                            
	    return rc;                                                                  
	out_unregister_raw_proto:                                                       
	    proto_unregister(&raw_prot);                                                
	out_unregister_udp_proto:                                                       
	    proto_unregister(&udp_prot);                                                
	out_unregister_tcp_proto:                                                       
	    proto_unregister(&tcp_prot);                                                
	    goto out;                                                                   
	}                                                                               

	// Hands inet_init to the fs_initcall() macro.
	// NOTE(review): presumably this makes inet_init run during kernel boot at the "fs" initcall level; confirm against include/linux/init.h.
	fs_initcall(inet_init);
  • 粗略地看完协议初始化那一部分之后,我们回到 __sock_create()

	// net/socket.c
	// Excerpt: the address family's create callback is invoked here; a negative
	// return value is treated as failure and jumps to the out_module_put label.
	    err = pf->create(net, sock, protocol, kern);                                
	    if (err < 0)                                                                
 	       goto out_module_put; 
	// First, a look at the inet_protosw structure.
	// include/net/protocol.h
	/*
	 * Describes one socket interface registered for an IP protocol.
	 * The (type, protocol) pair is the key used when looking an entry up.
	 */
	struct inet_protosw {
		/* Linkage into a list of entries. */
		struct list_head list;

		/* Lookup key, first half: the 2nd argument to socket(2). */
		unsigned short type;
		/* Lookup key, second half: the L4 protocol number. */
		unsigned short protocol;

		/* Protocol descriptor and socket-level operations for this entry. */
		struct proto *prot;
		const struct proto_ops *ops;

		/* Flag bits; see the INET_PROTOSW_* definitions. */
		unsigned char flags;
	};
		
	// The create callback invoked above corresponds to inet_create() in net/ipv4/af_inet.c.
	// net/ipv4/af_inet.c
	//
	// inet_create - set up the protocol-level state (struct sock) for a new socket.
	//   net:      network namespace; used for the capability check, sk_alloc() and the PMTU sysctl
	//   sock:     socket being initialised; sock->type selects which inetsw[] list is searched
	//   protocol: requested protocol number; IPPROTO_IP acts as a wildcard
	//   kern:     when non-zero, the CAP_NET_RAW check for raw sockets is skipped
	// Returns 0 on success, otherwise a negative errno: -EINVAL, -ESOCKTNOSUPPORT,
	// -EPROTONOSUPPORT, -EPERM, -ENOBUFS, or the value returned by sk_prot->init().
	static int inet_create(struct net *net, struct socket *sock, int protocol,      
               int kern)                                                        
	{                                                                               
	    struct sock *sk;                                                            
	    struct inet_protosw *answer;                                                
	    struct inet_sock *inet;                                                     
	    struct proto *answer_prot;                                                  
	    unsigned char answer_flags;                                                 
	    int try_loading_module = 0;                                                 
	    int err;                                                                    
	    // Reject protocol numbers outside [0, IPPROTO_MAX).
	    if (protocol < 0 || protocol >= IPPROTO_MAX)                                
	        return -EINVAL;                                                         
	    // A freshly created socket starts out unconnected.
	    sock->state = SS_UNCONNECTED;                                               
	                                                                                
	    /* Look for the requested type/protocol pair. */ 
	// Search for an entry matching the requested protocol; this label is also the
	// retry point after a module load attempt.
	lookup_protocol:                                                                
	    err = -ESOCKTNOSUPPORT;                                                     
	    rcu_read_lock(); 
		// Walk the list stored at inetsw[sock->type]; if the list is empty, err keeps
		// the -ESOCKTNOSUPPORT value set above.
	    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { 
	                                                                                
	        err = 0;   
			// Compare this entry's protocol with the request; err stays 0 only if we break out.
	        /* Check the non-wild match. */ 
			// Exact match: if the requested protocol is not the IPPROTO_IP wildcard,
			// the entry is accepted and the loop ends.
	        if (protocol == answer->protocol) {                                     
	            if (protocol != IPPROTO_IP)                                         
	                break;                                                          
	        } else {                                                                
	            /* Check for the two wild cases. */                                 
	            if (IPPROTO_IP == protocol) {                                       
	                protocol = answer->protocol;                                    
	                break;                                                          
	            }                                                                   
	            if (IPPROTO_IP == answer->protocol)                                 
	                break;                                                          
	        } 
			// This entry did not match; if no later entry matches either, this is the error reported.
	        err = -EPROTONOSUPPORT;
	    }                                                                           
	    // Lookup failed: ask for a module to be loaded and retry, at most twice
	    // (first with the specific alias, then the generic one); after that give up.
	    if (unlikely(err)) {                                                        
	        if (try_loading_module < 2) {                                           
	            rcu_read_unlock();                                                  
	            /*                                                                  
	             * Be more specific, e.g. net-pf-2-proto-132-type-1                 
	             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)             
	             */                                                                 
	            if (++try_loading_module == 1)                                      
	                request_module("net-pf-%d-proto-%d-type-%d",                    
	                           PF_INET, protocol, sock->type);                      
	            /*                                                                  
	             * Fall back to generic, e.g. net-pf-2-proto-132                    
	             * (net-pf-PF_INET-proto-IPPROTO_SCTP)                              
	             */                                                                 
	            else                                                                
	                request_module("net-pf-%d-proto-%d",                            
	                           PF_INET, protocol);                                  
	            goto lookup_protocol;                                               
	        } else                                                                  
	            goto out_rcu_unlock;                                                
	    }                                                                           
	                                                                                
	    err = -EPERM;  
		// Permission check: a raw socket not created on behalf of the kernel (!kern)
		// requires CAP_NET_RAW in the namespace's user namespace.
	    if (sock->type == SOCK_RAW && !kern &&                                      
	        !ns_capable(net->user_ns, CAP_NET_RAW))                                 
		        goto out_rcu_unlock;   
                                                 
	    // Attach the matched entry's operations to the socket and copy out what is
	    // still needed from the entry before the RCU read lock is dropped.
	    sock->ops = answer->ops;                                                    
	    answer_prot = answer->prot;                                                 
	    answer_flags = answer->flags;                                               
	    rcu_read_unlock();                                                          
	                                                                                
	    WARN_ON(!answer_prot->slab);                                                
	                                                                                
	    err = -ENOBUFS;
		/* Allocate the struct sock for this socket via sk_alloc(); on failure the
		 * -ENOBUFS set above is returned.
		 * NOTE(review): per the article author, sk_alloc() records the proto and
		 * creator on the sock, allocates through sk_prot_alloc() from the proto's slab
		 * cache when one exists (otherwise from the general allocator), initialises the
		 * sock lock via sock_lock_init(), records the namespace with sock_net_set() and
		 * bumps a per-net counter. None of that is visible in this excerpt; verify
		 * against net/core/sock.c. */ 
	    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);                 
	    if (!sk)                                                                    
	        goto out;                                                               
	                                                                                
	    err = 0;                                                                    
	    if (INET_PROTOSW_REUSE & answer_flags)                                      
	        sk->sk_reuse = SK_CAN_REUSE;                                            
	    
		// Obtain the struct inet_sock pointer for this sk.
	    inet = inet_sk(sk);     
		// Record whether the matched entry carries the INET_PROTOSW_ICSK flag
		// (presumably marking connection-oriented sockets; confirm).
	    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;                    
	                                                                                
	    inet->nodefrag = 0;                                                         
	    
		// Raw sockets: store the protocol number in inet_num; for IPPROTO_RAW also set
		// hdrincl (presumably meaning the sender supplies the IP header itself; confirm).
	    if (SOCK_RAW == sock->type) {                                               
	        inet->inet_num = protocol;                                              
	        if (IPPROTO_RAW == protocol)                                            
	            inet->hdrincl = 1;                                                  
	    }                                                                           
	    // Choose the path-MTU discovery mode from the namespace's sysctl_ip_no_pmtu_disc setting.
	    if (net->ipv4.sysctl_ip_no_pmtu_disc)                                       
	        inet->pmtudisc = IP_PMTUDISC_DONT;                                      
	    else                                                                        
	        inet->pmtudisc = IP_PMTUDISC_WANT;                                      
	                                                                                
	    inet->inet_id = 0;
                                                          
	    // Continue initialising sk (struct sock).
		// NOTE(review): per the article author, sock_init_data() sets up the receive, send
		// and error queues (sk_buff_head doubly linked lists of sk_buff), the send timer and
		// assorted fields, mostly function pointers. Not visible here; verify against net/core/sock.c.
	    sock_init_data(sock, sk); 
	    sk->sk_destruct    = inet_sock_destruct;                                    
	    sk->sk_protocol    = protocol;                                              
	    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;                              
	                                                                                
	    inet->uc_ttl    = -1;                                                       
	    inet->mc_loop   = 1;                                                        
	    inet->mc_ttl    = 1;                                                        
	    inet->mc_all    = 1;                                                        
	    inet->mc_index  = 0;                                                        
	    inet->mc_list   = NULL;                                                     
	    inet->rcv_tos   = 0;                                                        
	                                                                                
	    sk_refcnt_debug_inc(sk);                                                    
	                                                                                
	    if (inet->inet_num) {                                                       
	        /* It assumes that any protocol which allows                            
	         * the user to assign a number at socket                                
	         * creation time automatically                                          
	         * shares.                                                              
	         */                                                                     
	        inet->inet_sport = htons(inet->inet_num);                               
	        /* Add to protocol hash chains. */                                      
	        sk->sk_prot->hash(sk);                                                  
	    }                                                                           
	    
		// Run the protocol's own init hook when one is set (per the article, for TCP this
		// is tcp_v4_init_sock); if it fails, the sock is released and the error returned.
	    if (sk->sk_prot->init) {                                                    
	        err = sk->sk_prot->init(sk);                                            
	        if (err)                                                                
	            sk_common_release(sk);                                              
	    }                                                                           
	out:                                                                            
	    return err;                                                                 
	out_rcu_unlock:                                                                 
	    rcu_read_unlock();                                                          
	    goto out;                                                                   
	}                                                                                                                                  
  • tcp_v4_init_sock 函数

	// Per-socket init hook for TCP over IPv4: runs the common TCP initialisation,
	// then installs the IPv4-specific operation tables. Always returns 0.
	static int tcp_v4_init_sock(struct sock *sk)
	{
	    struct inet_connection_sock *conn_sk;

	    // View the sock as an inet_connection_sock.
	    conn_sk = inet_csk(sk);

	    // Common TCP setup; its details are not followed any further here.
	    tcp_init_sock(sk);

	    // Address-family operations specific to IPv4.
	    conn_sk->icsk_af_ops = &ipv4_specific;

	#if defined(CONFIG_TCP_MD5SIG)
	    tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	#endif

	    return 0;
	}
  • 到此, sock_create 分析完毕

  • 最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

	// net/socket.c  
	// Excerpt from the socket syscall body. sock_create() is the call analysed above.
	retval = sock_create(family, type, protocol, &sock);                        
    	if (retval < 0)                                                             
        	goto out;
	// Map the new socket to a file descriptor; only the O_CLOEXEC and O_NONBLOCK
	// bits of flags are passed on.
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));               
    	if (retval < 0)                                                             
    	    goto out_release;                                          
	// net/socket.c
	// Binds a socket to a newly reserved file descriptor.
	// Returns the descriptor on success; otherwise the negative value from
	// get_unused_fd_flags(), or PTR_ERR() of the failed file allocation.
	static int sock_map_fd(struct socket *sock, int flags)
	{
	    struct file *sock_file;
	    int fd;

	    fd = get_unused_fd_flags(flags);
	    if (unlikely(fd < 0))
	        return fd;

	    // Allocate the file object that represents this socket.
	    sock_file = sock_alloc_file(sock, flags, NULL);
	    if (unlikely(IS_ERR(sock_file))) {
	        // Nothing to install: hand the reserved descriptor back.
	        put_unused_fd(fd);
	        return PTR_ERR(sock_file);
	    }

	    fd_install(fd, sock_file);
	    return fd;
	}
	// The point being made: the socket is handled as a file node, so it can be driven with open, read, write, ioctl, etc.
posted @ 2017-06-01 08:27  陈富林  阅读(4141)  评论(0编辑  收藏  举报