惊群效应学习

惊群效应:

举一个很简单的例子,当你往一群鸽子中间扔一块食物,虽然最终只有一个鸽子抢到食物,但所有鸽子都会被惊动来争夺,没有抢到食物的鸽子只好回去继续睡觉, 等待下一块食物到来。这样,每扔一块食物,都会惊动所有的鸽子,即为惊群。对于操作系统来说,多个进程/线程在等待同一资源时,也会产生类似的效果,其结果就是每当资源可用,所有的进程/线程都来竞争资源,造成不必要的上下文切换和系统资源浪费。
 
惊群导致问题:

1、上下文切换(context  switch)过高会导致cpu像个搬运工,频繁地在寄存器和运行队列之间奔波,更多的时间花在了进程(线程)切换,而不是在真正工作的进程(线程)上面。直接的消耗包括cpu寄存器要保存和加载(例如程序计数器)、系统调度器的代码需要执行。间接的消耗在于多核cache之间的共享数据。

2、通过锁机制解决惊群效应是一种方法,在任意时刻只让一个进程(线程)处理等待的事件。但是锁机制也会造成cpu等资源的消耗和性能损耗

1) accept惊群
主进程创建了socket、bind、listen之后,fork()出来多个进程,每个子进程都开始循环处理(accept)这个listen_fd。每个进程都阻塞在accept上,当一个新的连接到来时候,所有的进程都会被唤醒,但是其中只有一个进程会接受成功,其余皆失败,重新休眠
这个程序模拟上面的场景,当我们用telnet连接该服务器程序时,会看到只返回一个进程pid,即只有一个进程被唤醒
  1 #include<stdio.h>
  2 #include<sys/types.h>
  3 #include<sys/socket.h>
  4 #include<unistd.h>
  5 #include<sys/epoll.h>
  6 #include<netdb.h>
  7 #include<stdlib.h>
  8 #include<fcntl.h>
  9 #include<sys/wait.h>
 10 #include<errno.h>
 11 #define PROCESS_NUM 10
 12 #define MAXEVENTS 64
 13 //socket创建和绑定
 14 int sock_creat_bind(char * port){
 15     int sock_fd = socket(AF_INET, SOCK_STREAM, 0);
 16     struct sockaddr_in serveraddr;
 17     serveraddr.sin_family = AF_INET;
 18     serveraddr.sin_port = htons(atoi(port));
 19     serveraddr.sin_addr.s_addr = htonl(INADDR_ANY);
 20  
 21     bind(sock_fd, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
 22     return sock_fd;
 23 }
 24 //利用fcntl设置文件或者函数调用的状态标志
 25 int make_nonblocking(int fd){
 26     int val = fcntl(fd, F_GETFL);
 27     val |= O_NONBLOCK;
 28     if(fcntl(fd, F_SETFL, val) < 0){
 29         perror("fcntl set");
 30         return -1;
 31     }
 32     return 0;
 33 }
 34  
 35 int main(int argc, char *argv[])
 36 {
 37     int sock_fd, epoll_fd;
 38     struct epoll_event event;
 39     struct epoll_event *events;
 40         
 41     if(argc < 2){
 42         printf("usage: [port] %s", argv[1]);
 43         exit(1);
 44     }
 45      if((sock_fd = sock_creat_bind(argv[1])) < 0){
 46         perror("socket and bind");
 47         exit(1);
 48     }
 49     if(make_nonblocking(sock_fd) < 0){
 50         perror("make non blocking");
 51         exit(1);
 52     }
 53     if(listen(sock_fd, SOMAXCONN) < 0){
 54         perror("listen");
 55         exit(1);
 56     }
 57     if((epoll_fd = epoll_create(MAXEVENTS))< 0){
 58         perror("epoll_create");
 59         exit(1);
 60     }
 61     event.data.fd = sock_fd;
 62     event.events = EPOLLIN;
 63     if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock_fd, &event) < 0){
 64         perror("epoll_ctl");
 65         exit(1);
 66     }
 67     /*buffer where events are returned*/
 68     events = calloc(MAXEVENTS, sizeof(event));
 69     int i;
 70     for(i = 0; i < PROCESS_NUM; ++i){
 71         int pid = fork();
 72         if(pid == 0){
 73             while(1){
 74                 int num, j;
 75                 num = epoll_wait(epoll_fd, events, MAXEVENTS, -1);
 76                 printf("process %d returnt from epoll_wait\n", getpid());
 77                 sleep(2);
 78                 for(i = 0; i < num; ++i){
 79                     if((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) || (!(events[i].events & EPOLLIN))){
 80                         fprintf(stderr, "epoll error\n");
 81                         close(events[i].data.fd);
 82                         continue;
 83                     }else if(sock_fd == events[i].data.fd){
 84                         //收到关于监听套接字的通知,意味着一盒或者多个传入连接
 85                         struct sockaddr in_addr;
 86                         socklen_t in_len = sizeof(in_addr);
 87                         if(accept(sock_fd, &in_addr, &in_len) < 0){
 88                             printf("process %d accept failed!\n", getpid());
 89                         }else{
 90                             printf("process %d accept successful!\n", getpid());
 91                         }
 92                     }
 93                 }
 94             }
 95         }
 96     }
 97     wait(0);
 98     free(events);
 99     close(sock_fd);
100     return 0;
101 }

fly@G480:~/fly/learn/test$ strace -f ./fork
execve("./fork", ["./fork"], 0x7fffd0d489d8 /* 61 vars */) = 0
brk(NULL) = 0x55e33c728000
arch_prctl(0x3001 /* ARCH_??? */, 0x7fff1f212060) = -1 EINVAL (Invalid argument)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=102598, ...}) = 0
mmap(NULL, 102598, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f29c9075000
close(3) = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360r\2\0\0\0\0\0"..., 832) = 832
lseek(3, 64, SEEK_SET) = 64
read(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784) = 784
lseek(3, 848, SEEK_SET) = 848
read(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32) = 32
lseek(3, 880, SEEK_SET) = 880
read(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0u\343\342\331Yj\256%\0230\256~\363\371\32\204"..., 68) = 68
fstat(3, {st_mode=S_IFREG|0755, st_size=2025032, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f29c9073000
lseek(3, 64, SEEK_SET) = 64
read(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784) = 784
lseek(3, 848, SEEK_SET) = 848
read(3, "\4\0\0\0\20\0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0", 32) = 32
lseek(3, 880, SEEK_SET) = 880
read(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0u\343\342\331Yj\256%\0230\256~\363\371\32\204"..., 68) = 68
mmap(NULL, 2032984, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f29c8e82000
mmap(0x7f29c8ea7000, 1540096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x25000) = 0x7f29c8ea7000
mmap(0x7f29c901f000, 303104, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19d000) = 0x7f29c901f000
mmap(0x7f29c9069000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e6000) = 0x7f29c9069000
mmap(0x7f29c906f000, 13656, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f29c906f000
close(3) = 0
arch_prctl(ARCH_SET_FS, 0x7f29c9074540) = 0
mprotect(0x7f29c9069000, 12288, PROT_READ) = 0
mprotect(0x55e33bbdf000, 4096, PROT_READ) = 0
mprotect(0x7f29c90bb000, 4096, PROT_READ) = 0
munmap(0x7f29c9075000, 102598) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(1234), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
listen(3, 1024) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5528
strace: Process 5528 attached
[pid 5527] clone( <unfinished ...>
[pid 5528] accept(3, NULL, NULLstrace: Process 5529 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5529
[pid 5527] clone( <unfinished ...>
[pid 5529] accept(3, NULL, NULLstrace: Process 5530 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5530
[pid 5527] clone( <unfinished ...>
[pid 5530] accept(3, NULL, NULLstrace: Process 5531 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5531
[pid 5527] clone( <unfinished ...>
[pid 5531] accept(3, NULL, NULLstrace: Process 5532 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5532
[pid 5527] clone( <unfinished ...>
[pid 5532] accept(3, NULL, NULLstrace: Process 5533 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5533
[pid 5527] clone( <unfinished ...>
[pid 5533] accept(3, NULL, NULLstrace: Process 5534 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5534
[pid 5527] clone( <unfinished ...>
[pid 5534] accept(3, NULL, NULLstrace: Process 5535 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5535
[pid 5527] clone( <unfinished ...>
[pid 5535] accept(3, NULL, NULL <unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5536
strace: Process 5536 attached
[pid 5527] clone( <unfinished ...>
[pid 5536] accept(3, NULL, NULLstrace: Process 5537 attached
<unfinished ...>
[pid 5527] <... clone resumed> child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f29c9074810) = 5537
[pid 5527] wait4(-1, <unfinished ...>
[pid 5537] accept(3, NULL, NULL


这里我们首先看到系统创建了十个进程。从上面的strace输出可以看出,这十个进程都阻塞在accept这个系统调用上面:

接下来在另一个终端执行telnet 127.0.0.1 1234:

 1 [pid  5528] <... accept resumed> )      = 4
 2 [pid  5528] getpid()                    = 5528
 3 [pid  5528] fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x1), ...}) = 0
 4 [pid  5528] brk(NULL)                   = 0x55e33c728000
 5 [pid  5528] brk(0x55e33c749000)         = 0x55e33c749000
 6 [pid  5528] write(1, "process 5528 accept a connection"..., 49process 5528 accept a connection failed: Success
 7 ) = 49
 8 [pid  5528] close(4)                    = 0
 9 [pid  5528] accept(3, NULL, NULL^C <unfinished ...>
10 [pid  5537] <... accept resumed> )      = ? ERESTARTSYS (To be restarted if SA_RESTART is set)

很明显,当telnet连接的时候只有一个进程accept成功。你会不会和我有同样的疑问:会不会内核其实唤醒了所有的进程,只是其余进程竞争资源失败后又重新睡眠,使得惊群被“隐藏”了?

在内核2.6及之后,解决了惊群,在内核中增加了一个互斥等待变量。一个互斥等待的行为与睡眠基本类似,主要的不同点在于:
        1)当一个等待队列入口有 WQ_FLAG_EXCLUSIVE 标志置位, 它被添加到等待队列的尾部. 没有这个标志的入口项, 相反, 添加到开始.
        2)当 wake_up 被在一个等待队列上调用时, 它在唤醒第一个有 WQ_FLAG_EXCLUSIVE 标志的进程后停止。
        对于互斥等待的行为,比如对一个listen后的socket描述符,多线程阻塞accept时,系统内核只会唤醒等待队列中的第一个进程,队列中的其他进程则继续等待下一次事件的发生,这样就避免了多个线程同时监听同一个socket描述符时的惊群问题

 

下面分析一下内核源码, 如何解决的?

1 我们要解决如下几个问题:
1:accept()函数的实现,包括从全队列中取出sock。
2:accept()函数如何被唤醒
3:accept()函数如何解决惊群
4:多个进程accept(),优先唤醒哪个进程

accept()函数的实现
  accept()函数实现逻辑相对比较简单
  如果没有完成建立的TCP会话,阻塞情况下,则阻塞,非阻塞情况下,则返回-EAGAIN。
  
  所以总结来说需要考虑这么几种情况:
  1、当前全队列中有socket,则accept()直接返回对应的fd。
  2、如果当前全队列中没有socket,则如果当前socket是阻塞的,直接睡眠。
  3、如果当前全队列中没有socket,如果非阻塞,就直接返回-EAGAIN。
  4、如果是阻塞的listenfd,需要将当前进程挂在listenfd对应socket的等待队列里面,当前进程让出cpu,并且等待唤醒

sys_accept->sys_accept4->inet_accept->inet_csk_accept

其中 inet_csk_accept是核心处理逻辑,其处理了上述1、3两种情况

 1 /*
 2  * This will accept the next outstanding connection.
 3  */
 4 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 5 {
 6     struct inet_connection_sock *icsk = inet_csk(sk);
 7     struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 8     struct sock *newsk;
 9     struct request_sock *req;
10     int error;
11 
12     lock_sock(sk);
13 
14     /* We need to make sure that this socket is listening,
15      * and that it has something pending.
16      */
17     
18     //only a socket in TCP_LISTEN state may call accept
19     error = -EINVAL;
20     if (sk->sk_state != TCP_LISTEN)
21         goto out_err;
22 
23     /* Find already established connection */
24     
25     //if the accept (full) queue already holds a connection whose 3-way handshake completed, skip this if and fall through to dequeue it
26     if (reqsk_queue_empty(queue)) {
27         long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
28 
29         /* If this is a non blocking socket don't sleep */
30         //non-blocking socket: return -EAGAIN immediately
31         error = -EAGAIN;
32         if (!timeo)
33             goto out_err;
34 
35         //blocking socket: call inet_csk_wait_for_connect (discussed below)
36         error = inet_csk_wait_for_connect(sk, timeo);
37         
38         if (error)
39             goto out_err;
40     }
41     
42     //reaching here means the accept queue is non-empty: dequeue one socket
43     req = reqsk_queue_remove(queue);
44     newsk = req->sk;
45 
46     sk_acceptq_removed(sk);
47     if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
48         spin_lock_bh(&queue->fastopenq->lock);
49         if (tcp_rsk(req)->listener) {
50             /* We are still waiting for the final ACK from 3WHS
51              * so can't free req now. Instead, we set req->sk to
52              * NULL to signify that the child socket is taken
53              * so reqsk_fastopen_remove() will free the req
54              * when 3WHS finishes (or is aborted).
55              */
56             req->sk = NULL;
57             req = NULL;
58         }
59         spin_unlock_bh(&queue->fastopenq->lock);
60     }
61 out:
62     release_sock(sk);
63     if (req)
64         __reqsk_free(req);
65     return newsk;
66 out_err:
67     newsk = NULL;
68     req = NULL;
69     *err = error;
70     goto out;
71 }

inet_csk_wait_for_connect函数处理了2、4两种情况

 1 /*
 2  * Wait for an incoming connection, avoid race conditions. This must be called
 3  * with the socket locked.
 4  */
 5 static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 6 {
 7     struct inet_connection_sock *icsk = inet_csk(sk);
 8     DEFINE_WAIT(wait);
 9     int err;
10 
11     /*
12      * True wake-one mechanism for incoming connections: only
13      * one process gets woken up, not the 'whole herd'.
14      * Since we do not 'race & poll' for established sockets
15      * anymore, the common case will execute the loop only once.
16      *
17      * Subtle issue: "add_wait_queue_exclusive()" will be added
18      * after any current non-exclusive waiters, and we know that
19      * it will always _stay_ after any new non-exclusive waiters
20      * because all non-exclusive waiters are added at the
21      * beginning of the wait-queue. As such, it's ok to "drop"
22      * our exclusiveness temporarily when we get woken up without
23      * having to remove and re-insert us on the wait queue.
24      */
25     for (;;) {
26         //prepare_to_wait_exclusive is crucial: it hangs `wait` on sk's wait queue as an exclusive waiter.
27         prepare_to_wait_exclusive(sk_sleep(sk), &wait,
28                       TASK_INTERRUPTIBLE);
29         release_sock(sk);
30         //icsk_accept_queue is the accept (full) queue
31         if (reqsk_queue_empty(&icsk->icsk_accept_queue))
32             timeo = schedule_timeout(timeo);//in the blocking case, execution resumes only after this process is explicitly woken up.
33         lock_sock(sk);
34         err = 0;
35         
36         //returning from schedule_timeout when blocking and not timed out implies the accept queue now has entries.
37         if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
38             break;//this break is the normal exit path every caller eventually takes
39         err = -EINVAL;
40         if (sk->sk_state != TCP_LISTEN)
41             break;
42         err = sock_intr_errno(timeo);
43         
44         //a pending signal or an expired timeout ends the loop; otherwise keep sleeping.
45         if (signal_pending(current))
46             break;
47         err = -EAGAIN;
48         if (!timeo)
49             break;
50     }
51     finish_wait(sk_sleep(sk), &wait);
52     return err;
53 }

首先,为什么循环?这是历史原因,考虑有这么一种情况,就是睡眠时间没有睡满,那么 schedule_timeout返回的值大于0,那么什么情况下,睡眠没有睡满呢?一种情况就是进程收到信号,

另一种就是listenfd对应的socket的全队列有数据了,不考虑信号的情况,假设全队列有数据了,历史上,Linux的accept是惊群的,全队列有值后,所有进程都唤醒,那么必然存在某些进程读取到了全队列socket,而某些没有读取到,这些没有读取到的进程,肯定是睡眠没睡满,所以需要接着睡。
但是本文分析的Linux内核版本是3.10,全队列有数据时,只会唤醒一个进程,故而,此for循环只会跑一次

prepare_to_wait_exclusive函数很重要,把当前上下文加到listenfd对应的socket等待队列里面,如果是多进程,那么listenfd对应的socket等待队列里面会有
多个进程的上下文

 

多进程 accept 如何处理惊群????

多进程accept,不考虑reuseport,那么多进程accept只会出现在父子进程同时accept的情况。上文也说过,prepare_to_wait_exclusive函数会把当前进程的上下文加入到listenfd对应socket的等待队列里面,所以父子进程的上下文都会加入到该socket的等待队列里面。核心问题就是如何唤醒:我们可以认为,所谓的惊群,就是把等待队列里面的所有进程都唤醒。
我们此时来看看如何唤醒

 1 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 2 {
 3     struct sock *rsk;
 4 
 5     ......
 6     if (sk->sk_state == TCP_LISTEN) {
 7         struct sock *nsk = tcp_v4_hnd_req(sk, skb);
 8         if (!nsk)
 9             goto discard;
10 
11         if (nsk != sk) {
12             sock_rps_save_rxhash(nsk, skb);
13             //when the client's final ACK of the 3-way handshake arrives, processing goes through tcp_child_process here
14             if (tcp_child_process(sk, nsk, skb)) {
15                 rsk = nsk;
16                 goto reset;
17             }
18             return 0;
19         }
20     }
21     ......
22 }
 1 int tcp_child_process(struct sock *parent, struct sock *child,
 2               struct sk_buff *skb)
 3 {
 4     int ret = 0;
 5     int state = child->sk_state;
 6 
 7     if (!sock_owned_by_user(child)) {
 8         ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
 9                         skb->len);
10         /* Wakeup parent, send SIGIO */
11         if (state == TCP_SYN_RECV && child->sk_state != state)
12             parent->sk_data_ready(parent, 0);//wake up the process blocked in accept; this calls sock_def_readable
13     } else {
14         /* Alas, it is possible again, because we do lookup
15          * in main socket hash table and lock on listening
16          * socket does not protect us more.
17          */
18         __sk_add_backlog(child, skb);
19     }
20 
21     bh_unlock_sock(child);
22     sock_put(child);
23     return ret;
24 }
 1 static void sock_def_readable(struct sock *sk, int len)
 2 {
 3     struct socket_wq *wq;
 4 
 5     rcu_read_lock();
 6     wq = rcu_dereference(sk->sk_wq);
 7     //since accept joined the wait queue via `prepare_to_wait_exclusive`, the wakeup goes through wake_up_interruptible_sync_poll
 8     if (wq_has_sleeper(wq))
 9         wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
10                         POLLRDNORM | POLLRDBAND);
11     sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
12     rcu_read_unlock();
13 }
/* the third argument of __wake_up_sync_key (nr_exclusive) is 1: wake at most one exclusive waiter */
1 #define wake_up_interruptible_sync_poll(x, m)                \
2     __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))

注意,__wake_up_sync_key的第三个参数是1

所以多个进程accept的时候,内核只会唤醒1个等待的进程,且唤醒的逻辑是FIFO

 1 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 2             int nr_exclusive, int wake_flags, void *key)
 3 {
 4     wait_queue_t *curr, *next;
 5 
 6     list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 7         unsigned flags = curr->flags;
 8 
 9         //with prepare_to_wait_exclusive, flags has WQ_FLAG_EXCLUSIVE and nr_exclusive is 1, so the loop breaks after waking one process.
10         if (curr->func(curr, mode, wake_flags, key) &&
11                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
12             break;
13     }
14 }

在内核2.6及之后,解决了惊群,在内核中增加了一个互斥等待变量。一个互斥等待的行为与睡眠基本类似,主要的不同点在于:
        1)当一个等待队列入口有 WQ_FLAG_EXCLUSIVE 标志置位, 它被添加到等待队列的尾部. 没有这个标志的入口项, 相反, 添加到开始.
        2)当 wake_up 被在一个等待队列上调用时, 它在唤醒第一个有 WQ_FLAG_EXCLUSIVE 标志的进程后停止。
        对于互斥等待的行为,比如对一个listen后的socket描述符,多线程阻塞accept时,系统内核只会唤醒等待队列中的第一个进程,队列中的其他进程则继续等待下一次事件的发生,这样就避免了多个线程同时监听同一个socket描述符时的惊群问题

posted @ 2020-02-09 12:01  坚持,每天进步一点点  阅读(920)  评论(0编辑  收藏  举报