Socket listen 简要分析

#include <sys/types.h> /* See NOTES */
#include <sys/socket.h>
int listen(int sockfd, int backlog);

· 参数 int sockfd :成功创建的 TCP 套接字。
·int backlog :定义 TCP 未处理连接的队列长度。该队列中保存的是已经完成三次握手、但服务器端还没有执行 accept 的连接。 APUE 中说, backlog 只是一个提示,具体的数值实际上是由系统来决定的。

The behavior of the backlog argument on TCP sockets changed with Linux 2.2. Now it specifies the queue length for completely established sockets waiting to be accepted, instead of the number of incomplete connection requests. The maximum length of the queue
for incomplete sockets can be set using the tcp_max_syn_backlog sysctl.

全连接队列的最大长度:backlog保存的是完成三次握手、等待accept的全连接,而不是半连接

min(backlog, somaxconn),net.core.somaxconn默认为128。

这个值最终存储于sk->sk_max_ack_backlog

半连接队列的最大长度:不同内核版本中不一样
tcp_max_syn_backlog默认值为256(用于未完成的连接,即半连接)。新内核保留并继续使用这个参数,主要是为了兼容以前版本的逻辑。

/* include/net/sock.h */
struct sock {
    ...
    u32 sk_ack_backlog; /* Number of entries currently in the full-connection (accept) queue. */
    u32 sk_max_ack_backlog; /* Maximum queue length; used to bound both the half-open and the full-connection queue. */
    ...
}
// 半连接数据保存在哈希表 inet_hashinfo.ehash(类型为 struct inet_ehash_bucket)中;这是新内核的处理方式,和老内核不一样。

/* net/ipv4/tcp_ipv4.c */
/* Instance of struct inet_hashinfo used for TCP.
 * NOTE(review): presumably the single set of hash tables shared by all
 * TCP sockets -- confirm against the kernel tree. */
struct inet_hashinfo tcp_hashinfo;

/* include/net/inet_hashtables.h */
struct inet_hashinfo {
    /* Holds connections with TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE. */
    struct inet_ehash_bucket *ehash;
    spinlock_t               *ehash_locks;
    unsigned int             ehash_mask;
    unsigned int             ehash_locks_mask;
    ...
};

 
当使用SYN Cookie时,tcp_max_syn_backlog这个参数变为无效。
半连接队列的最大长度为backlog、somaxconn、tcp_max_syn_backlog的最小值

 
/*
 *	listen(2) entry point.  The requested backlog is capped at the
 *	somaxconn sysctl value, the security hook is consulted, and only
 *	then is the protocol's own listen operation invoked.
 *
 *	Returns the error reported by the lookup, by the security hook, or
 *	by the protocol's listen operation (0 when all of them succeed).
 */

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	struct socket *sock;
	int err, fput_needed;
	int max_backlog;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;	/* presumably filled in by the failed lookup */

	/*
	 * The comparison is done in unsigned arithmetic, so a negative
	 * backlog is seen as a huge value and gets capped as well.
	 */
	max_backlog = sock_net(sock->sk)->core.sysctl_somaxconn;
	if ((unsigned int)backlog > max_backlog)
		backlog = max_backlog;

	err = security_socket_listen(sock, backlog);
	if (err == 0)
		err = sock->ops->listen(sock, backlog);

	/* Release the file reference taken by the lookup above. */
	fput_light(sock->file, fput_needed);
	return err;
}

 

/*
 *	Put a socket into the listening state, or, if it is already
 *	listening, just update its backlog limit.
 *
 *	Returns 0 on success, -EINVAL when the socket is not an unconnected
 *	stream socket in CLOSE or LISTEN state, or the error returned by
 *	inet_csk_listen_start().
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char cur_state;
	int err = -EINVAL;

	lock_sock(sk);

	/* The socket must be unconnected and of type SOCK_STREAM. */
	if (sock->state != SS_UNCONNECTED)
		goto out;
	if (sock->type != SOCK_STREAM)
		goto out;

	/* The connection itself must currently be in CLOSE or LISTEN. */
	cur_state = sk->sk_state;
	if (((1 << cur_state) & (TCPF_CLOSE | TCPF_LISTEN)) == 0)
		goto out;

	/*
	 * A socket that is already listening only gets its backlog
	 * adjusted; any other socket has to start listening first.
	 */
	err = (cur_state == TCP_LISTEN) ? 0 : inet_csk_listen_start(sk, backlog);
	if (err != 0)
		goto out;

	/* Maximum length of the full-connection (accept) queue. */
	sk->sk_max_ack_backlog = backlog;

out:
	release_sock(sk);
	return err;
}

 

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
///* 初始化全连接队列,创建半连接队列的实例 */ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); if (rc != 0) return rc; sk->sk_max_ack_backlog = 0;/* 最大的backlog,最大全连接队列长度 初始化为0*/ sk->sk_ack_backlog = 0;/* 当前的backlog,当前全连接队列长度 先设置为0*/ inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ sk->sk_state = TCP_LISTEN; //检查端口是否可用,防止bind()后其它进程修改了端口信息 if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk);//clear 路由 sk->sk_prot->hash(sk);//把sock链接进入监听哈希表listening_hash中。 return 0; } sk->sk_state = TCP_CLOSE; /* 如果端口不可用,则释放半连接队列 */ __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; }

 

/* Full definition of struct inet_hashinfo: the established-connection hash
 * (ehash), the local-binding hash (bhash) and the listening hash.
 */
struct inet_hashinfo {
    /* This is for sockets with full identity only.  Sockets here will
     * always be without wildcards and will have the following invariant:
     *
     *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
     *
     */
    struct inet_ehash_bucket    *ehash;
    /* NOTE(review): presumably the locks guarding ehash buckets, with the
     * two *_mask members used to index ehash and ehash_locks -- confirm.
     */
    spinlock_t            *ehash_locks;
    unsigned int            ehash_mask;
    unsigned int            ehash_locks_mask;

    /* Ok, let's try this, I give up, we do need a local binding
     * TCP hash as well as the others for fast bind/connect.
     */
    struct inet_bind_hashbucket    *bhash;

    /* NOTE(review): presumably the number of buckets in bhash -- confirm. */
    unsigned int            bhash_size;
    /* 4 bytes hole on 64 bit */

    /* NOTE(review): presumably the slab cache for bind buckets -- confirm. */
    struct kmem_cache        *bind_bucket_cachep;

    /* All the above members are written once at bootup and
     * never written again _or_ are predominantly read-access.
     *
     * Now align to a new cache line as all the following members
     * might be often dirty.
     */
    /* All sockets in TCP_LISTEN state will be in here.  This is the only
     * table where wildcard'd TCP sockets can exist.  Hash function here
     * is just local port number.
     */
    struct inet_listen_hashbucket    listening_hash[INET_LHTABLE_SIZE]
                    ____cacheline_aligned_in_smp;
};

 

 

sysctl is an API. So you can just read the Linux kernel documentation for appropriate version:

tcp_max_syn_backlog - INTEGER
    Maximal number of remembered connection requests, which have not
    received an acknowledgment from connecting client.
    The minimal value is 128 for low memory machines, and it will
    increase in proportion to the memory of machine.
    If server suffers from overload, try increasing this number.

somaxconn - INTEGER
    Limit of socket listen() backlog, known in userspace as SOMAXCONN.
    Defaults to 128.  See also tcp_max_syn_backlog for additional tuning
    for TCP sockets.

Let's consider a TCP handshake. tcp_max_syn_backlog represents the maximal number of connections in the SYN_RECV queue, i.e. connections for which your server has received a SYN and sent a SYN-ACK but has not yet received the ACK. This is a separate queue of so-called "request sockets" (reqsk in the code). Request sockets are not fully-fledged sockets and occupy less memory: in this state the kernel can save memory by not allocating a full socket yet, because the connection may never be completed if the ACK does not arrive. The size of this queue is affected by listen()'s backlog argument and limited by tcp_max_syn_backlog in the kernel.

somaxconn represents the maximal size of ESTABLISHED queue. This is another queue.
Recall the previously mentioned SYN_RECV queue - your server is waiting for ACK from client. When the ACK arrives the kernel roughly speaking makes the big full-fledged socket from "request socket" and moves it to ESTABLISHED queue. Then you can do accept() on this socket. This queue is also affected by listen()'s backlog argument and limited by somaxconn in kernel.

 

listen_sock结构用于保存SYN_RECV状态的连接请求块,所以也叫半连接队列。

 

posted @ 2019-06-27 20:53  codestacklinuxer  阅读(1415)  评论(0编辑  收藏  举报