Linux Source Code Walkthrough (17): Red-Black Trees in the Kernel — epoll
1. A quick look at the background that led to epoll. Taking Java as an example, the most primitive server code looks like this:
ServerSocket ss = new ServerSocket(8888);   // bind once, outside the loop; re-creating it every iteration would throw "Address already in use"
System.out.println("Server started....");
while (true) {
    Socket s = ss.accept();                 // blocking point 1: waits until a client connects
    System.out.println("Client " + s.getInetAddress() + " connected to the server");
    BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream()));
    // read the message sent by the client
    String mess = br.readLine();            // blocking point 2: waits until the client sends a line
    System.out.println("Client: " + mess);
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(s.getOutputStream()));
    bw.write(mess + "\n");
    bw.flush();
}
In a single thread, an infinite loop keeps accepting client connections and reading their data. The code above has two choke points. The first is accept: the server listens on port 8888 for incoming clients and, if none arrives, blocks indefinitely, so execution cannot continue. The second is readLine: once a connection is established, the server waits for the client to send data, and if nothing arrives it blocks there as well. The flaw is obvious: a single thread can handle only one client's connection and data at a time, and while it sits blocked, connection requests from every other client go unserved.
Since the problem comes from being single-threaded, wouldn't switching to multithreading solve it? Each thread does its own accept and readLine, so blocking stalls only that one thread; whenever a new client connects, the server spawns a dedicated thread for it, clients don't interfere with each other, problem solved... right? ( ̄▽ ̄)" If it were that simple, epoll would never have existed. Multithreading has its own flaw: a client that connects will not necessarily keep sending data, yet the server still has to dedicate a thread to watch it and stay ready to receive. Every connection costs the server a socket (represented at the Linux level by an fd) plus a thread, which is a real drain on hardware resources. If a large number of clients connect but never send anything, the server ends up holding piles of threads and sockets, the CPU is burned on context switches between all those threads, and memory can be exhausted too (which is essentially a DDoS). At that point the whole server can no longer do useful work. What now?
2. Looking back at the problems above, they all boil down to accept and readLine blocking. Could a single thread plus non-blocking waiting solve them? This is exactly where epoll comes in.
(1) Following the usual approach, let's first look at the structures involved. eventpoll holds the root of the red-black tree and the ready list (rdllist). Since it holds the root node, an instance of this structure obviously has to be created before anything else.
struct eventpoll {
    ...
    /* root of the red-black tree; every event added to this epoll instance,
       i.e. everything this epoll monitors, is stored in this tree */
    struct rb_root rbr;
    /* the doubly linked list rdllist holds the ready events that satisfy the
       requested conditions and will be returned to user space by epoll_wait */
    struct list_head rdllist;
    ...
};
epitem is the node type the red-black tree is built from. The field that carries the epoll-specific payload is epoll_event:
struct epitem {
    ...
    /* red-black tree node */
    struct rb_node rbn;
    /* doubly linked list node */
    struct list_head rdllink;
    /* the file handle / fd information for this event */
    struct epoll_filefd ffd;
    /* the eventpoll instance this item belongs to */
    struct eventpoll *ep;
    /* the event types we are interested in */
    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
    /* the next epitem instance */
    struct epitem *next;
    ...
}; // one of these per monitored event, carrying all the information associated with it
As the English comment in the source points out, epoll_event has just two fields: the event mask and the data (typically the fd). Together they establish the mapping between an event and its fd, so an event never gets attributed to the wrong descriptor — it would be rather awkward if data arriving on process A's socket were mistakenly mapped to process B's socket.
struct epoll_event {
    __u32 events;
    __u64 data;
} EPOLL_PACKED;
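For comparison, the user-space view of the same structure makes the role of data explicit: it is an opaque value the kernel hands back untouched, and applications almost always store the fd (or a pointer to their own per-connection object) in it — exactly the event-to-fd mapping described above. A minimal sketch (the helper name watch_fd is just illustrative):

#include <sys/epoll.h>

/* glibc declares roughly:
 *   typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t;
 *   struct epoll_event { uint32_t events; epoll_data_t data; };
 */
void watch_fd(int epfd, int sockfd)
{
    struct epoll_event ev;
    ev.events  = EPOLLIN;   /* the event mask: interested in "readable" */
    ev.data.fd = sockfd;    /* opaque cookie returned verbatim by epoll_wait; storing the fd here is the usual convention */
    epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
}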
The relationships between these three structures are shown in the figure below. Three functions are marked on it, and it is precisely these three functions that build the red-black tree and the two doubly linked lists.
Note that the figure contains two linked lists. The struct that makes up the task wait list is shown below; the second and third fields are the crucial ones. When a process has no data to read, it is put on this wait_queue; when the NIC receives data, it notifies the CPU via an interrupt to fetch the data, and the third field, the callback func, is then executed.
struct __wait_queue {
    unsigned int flags;
    void *private;              // task_struct pointer of the waiting task
    wait_queue_func_t func;     // callback executed when the task is woken up
    struct list_head task_list;
};
The relationships between instances of these structures:
(2) Since the root of the red-black tree lives inside the eventpoll structure, an eventpoll instance has to be created first. This is done in epoll_create, shown below:
/*
 * Open an eventpoll file descriptor.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    /* Check the EPOLL_* constant for consistency.  */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    /*
     * Create the internal data structure ("struct eventpoll").
     * Allocates an eventpoll instance and initializes the red-black tree
     * root and the list heads inside it.
     */
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    /*
     * Creates all the items needed to setup an eventpoll file. That is,
     * a file structure and a free file descriptor.
     */
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                 O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}
Being a create function, its core job is to produce an instance of struct eventpoll, which it does by calling ep_alloc:
/* allocate and initialize an eventpoll instance */
static int ep_alloc(struct eventpoll **pep)
{
    int error;
    struct user_struct *user;
    struct eventpoll *ep;

    user = get_current_user();
    error = -ENOMEM;
    ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    if (unlikely(!ep))
        goto free_uid;

    spin_lock_init(&ep->lock);
    mutex_init(&ep->mtx);
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);
    INIT_LIST_HEAD(&ep->rdllist);   /* initialize the head of the ready list */
    ep->rbr = RB_ROOT;              /* empty red-black tree root */
    ep->ovflist = EP_UNACTIVE_PTR;
    ep->user = user;
    *pep = ep;

    return 0;

free_uid:
    free_uid(user);
    return error;
}
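The user-space counterpart is a single call. A minimal sketch that exercises the path above (ep_alloc plus the anonymous "[eventpoll]" file) and simply reports the fd it got back:

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
    int epfd = epoll_create1(0);    /* kernel side: ep_alloc() + anon_inode_getfile("[eventpoll]", ...) */
    if (epfd < 0) {
        perror("epoll_create1");
        return 1;
    }
    printf("eventpoll instance created, fd = %d\n", epfd);
    close(epfd);
    return 0;
}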
With the tree root and the list heads in place, the next step is to populate the tree and the lists, which is the job of epoll_ctl. First, the operation types epoll supports — there are three: add, delete and modify.
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
This is again a system call. The core is shown below (a lot of error-handling code at the top of the function is omitted so it doesn't obscure the main path): it is essentially a switch in which the three op values map to three different operations. Fundamentally, this is how the kernel learns which events the user cares about, and which callbacks to run when those events fire.
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error;
    int full_check = 0;
    struct fd f, tf;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;
    struct eventpoll *tep = NULL;
    ..........
    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     * Looks up the epitem for this fd in the red-black tree.
     */
    epi = ep_find(ep, tf.file, fd);

    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= POLLERR | POLLHUP;
            /* insert an epoll node: both the list node and the red-black tree node */
            error = ep_insert(ep, &epds, tf.file, fd, full_check);
        } else
            error = -EEXIST;
        if (full_check)
            clear_tfile_check_list();
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            /* remove an epoll node: both the list node and the red-black tree node */
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                epds.events |= POLLERR | POLLHUP;
                /* modify an epoll node: both the list node and the red-black tree node */
                error = ep_modify(ep, epi, &epds);
            }
        } else
            error = -ENOENT;
        break;
    }
    ..........
}
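Seen from user space, the three branches of the switch map one-to-one onto the three op values. A minimal sketch (stdin is used as the watched fd only so the snippet stays self-contained; any pollable fd works):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
    int epfd = epoll_create1(0);
    int fd = STDIN_FILENO;
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };

    if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev))    /* ep_find() misses -> ep_insert(): new rb-tree node    */
        perror("ADD");
    ev.events = EPOLLIN | EPOLLOUT;
    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev))    /* ep_find() hits   -> ep_modify(): update the epitem   */
        perror("MOD");
    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL))   /* ep_find() hits   -> ep_remove(): unlink from rb-tree */
        perror("DEL");

    close(epfd);
    return 0;
}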
As the code shows, the real work is done by ep_insert, ep_remove and ep_modify. Let's start with ep_insert (the error-handling code at the end is omitted to keep the focus on the main path). It does essentially three things:
- build the wait_queue (implemented as a linked list)
- build the red-black tree
- register the callback to run after the task is woken up
/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd, int full_check)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    long user_watches;
    struct epitem *epi;
    struct ep_pqueue epq;

    user_watches = atomic_long_read(&ep->user->epoll_watches);
    if (unlikely(user_watches >= max_user_watches))
        return -ENOSPC;
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;

    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);      /* initialize this item's ready-list link */
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    if (epi->event.events & EPOLLWAKEUP) {
        error = ep_create_wakeup_source(epi);
        if (error)
            goto error_create_wakeup_source;
    } else {
        RCU_INIT_POINTER(epi->ws, NULL);
    }

    /* Initialize the poll table using the queue callback */
    epq.epi = epi;
    /* register the callback function */
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    revents = ep_item_poll(epi, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;

    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_lock);
    list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);

    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     * This is where the epitem node is inserted into the red-black tree.
     */
    ep_rbtree_insert(ep, epi);

    /* now check if we've created too many backpaths */
    error = -EINVAL;
    if (full_check && reverse_path_check())
        goto error_remove_epi;

    /* We have to drop the new item inside our item list to keep track of it */
    spin_lock_irqsave(&ep->lock, flags);

    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);  /* append the current event to the tail of the ready list */
        ep_pm_stay_awake(epi);

        /* Notify waiting tasks that events are available.
           When the NIC has received data, the waiting tasks must be woken
           up and the callbacks registered earlier are executed. */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);        /* runs the callback registered for the woken task */
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    spin_unlock_irqrestore(&ep->lock, flags);

    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 0;
}
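The ordering key of that red-black tree is worth calling out: ep_rbtree_insert and ep_find order epitems by their epoll_filefd, comparing the struct file pointer first and the fd number second (the kernel helper is ep_cmp_ffd). Here is a simplified user-space sketch of that ordering rule, with a hypothetical stand-in struct, purely for illustration:

#include <stdio.h>

/* hypothetical stand-in for the kernel's struct epoll_filefd */
struct ffd_key {
    void *file;   /* plays the role of the struct file * pointer */
    int   fd;     /* the fd number */
};

/* ordering modelled on ep_cmp_ffd(): file pointer first, fd second */
static int ffd_cmp(const struct ffd_key *p1, const struct ffd_key *p2)
{
    if (p1->file != p2->file)
        return p1->file > p2->file ? 1 : -1;
    return p1->fd - p2->fd;
}

int main(void)
{
    struct ffd_key a = { (void *)0x1000, 3 };
    struct ffd_key b = { (void *)0x1000, 7 };
    /* negative result: a would sit in the left subtree relative to b */
    printf("ffd_cmp(a, b) = %d\n", ffd_cmp(&a, &b));
    return 0;
}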
Compared with insertion, removing a node is much simpler: it is deleted directly from the red-black tree and from the ready list.
/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
    unsigned long flags;
    struct file *file = epi->ffd.file;

    /*
     * Removes poll wait queue hooks. We _have_ to do this without holding
     * the "ep->lock" otherwise a deadlock might occur. This because of the
     * sequence of the lock acquisition. Here we do "ep->lock" then the wait
     * queue head lock when unregistering the wait queue. The wakeup callback
     * will run by holding the wait queue head lock and will call our callback
     * that will try to get "ep->lock".
     */
    ep_unregister_pollwait(ep, epi);

    /* Remove the current item from the list of epoll hooks */
    spin_lock(&file->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&file->f_lock);

    rb_erase(&epi->rbn, &ep->rbr);          /* remove from the red-black tree */

    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);       /* remove from the ready list */
    spin_unlock_irqrestore(&ep->lock, flags);

    wakeup_source_unregister(ep_wakeup_source(epi));
    /*
     * At this point it is safe to free the eventpoll item. Use the union
     * field epi->rcu, since we are trying to minimize the size of
     * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     * use of the rbn field.
     */
    call_rcu(&epi->rcu, epi_rcu_free);

    atomic_long_dec(&ep->user->epoll_watches);

    return 0;
}
Finally there is epoll_wait, also a system call. With the leading error-handling code omitted to highlight the main path, its core is simply a call to ep_poll:
/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
        int, maxevents, int, timeout)
{
    int error;
    struct fd f;
    struct eventpoll *ep;
    ..........
    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = f.file->private_data;

    /* Time to fish for events ... */
    error = ep_poll(ep, events, maxevents, timeout);
}
The logic of ep_poll is not complicated either: within the allowed timeout it checks the ready list; if the list is non-empty, some events are ready, and they are pushed up to user space.
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res = 0, eavail, timed_out = 0;
    unsigned long flags;
    u64 slack = 0;
    wait_queue_t wait;
    ktime_t expires, *to = NULL;

    if (timeout > 0) {
        struct timespec64 end_time = ep_set_mstimeout(timeout);

        slack = select_estimate_accuracy(&end_time);
        to = &expires;
        *to = timespec64_to_ktime(end_time);
    } else if (timeout == 0) {
        /*
         * Avoid the unnecessary trip to the wait queue loop, if the
         * caller specified a non blocking operation.
         */
        timed_out = 1;
        spin_lock_irqsave(&ep->lock, flags);
        goto check_events;
    }

fetch_events:
    spin_lock_irqsave(&ep->lock, flags);

    if (!ep_events_available(ep)) {
        /* If no event has occurred yet, give up the CPU; as soon as an event
         * arrives, this task is woken up by ep_poll_callback().
         * We don't have any available event to return to the caller.
         * We need to sleep here, and we will be wake up by
         * ep_poll_callback() when events will become available.
         */
        init_waitqueue_entry(&wait, current);   /* put the current task on the wait queue and register the wakeup callback */
        __add_wait_queue_exclusive(&ep->wq, &wait);

        for (;;) {
            /*
             * We don't want to sleep if the ep_poll_callback() sends us
             * a wakeup in between. That's why we set the task state
             * to TASK_INTERRUPTIBLE before doing the checks.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (ep_events_available(ep) || timed_out)   /* leave the loop if events are ready or the wait timed out */
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            spin_unlock_irqrestore(&ep->lock, flags);
            /* voluntarily give up the CPU and go to sleep; once an event occurs the task
               is scheduled back in, resumes the for loop and exits via the break above */
            if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
                timed_out = 1;

            spin_lock_irqsave(&ep->lock, flags);
        }

        __remove_wait_queue(&ep->wq, &wait);    /* remove ourselves from the wait queue */
        __set_current_state(TASK_RUNNING);      /* back to the running state after wakeup */
    }
check_events:
    /* Is it worth to try to dig for events ? */
    eavail = ep_events_available(ep);   /* check the ready list; if it is not empty, events have arrived */

    spin_unlock_irqrestore(&ep->lock, flags);

    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     * Copy the ready events to user space so the application can process them.
     */
    if (!res && eavail &&
        !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
        goto fetch_events;

    return res;
}
Although epoll_create, epoll_ctl and epoll_wait look like a lot of code above, using them is simple (a full example is at the end of this article): first create an epoll instance with epoll_create, then register the fd-to-event mappings with epoll_ctl. In my view the essence of epoll shows up in epoll_wait: you can give it a wait (i.e. blocking) duration, and it returns as soon as the ready list has events or the timeout expires, instead of sitting blocked indefinitely. Combined with a single-threaded loop, this neatly solves the multithreading-plus-blocking problem described at the beginning.
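A minimal sketch of that pattern — one thread, a bounded 500 ms wait, never stuck forever (stdin is watched only to keep the snippet self-contained; the names and the timeout are illustrative):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
    int epfd = epoll_create1(0);                         /* 1. create the eventpoll instance  */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
    epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev);   /* 2. register the fd -> event mapping */

    for (int i = 0; i < 10; i++) {
        struct epoll_event ready[8];
        int n = epoll_wait(epfd, ready, 8, 500);         /* 3. wait at most 500 ms            */
        if (n == 0) {                                    /*    timeout: free to do other work */
            printf("tick: no events, doing other work\n");
            continue;
        }
        for (int j = 0; j < n; j++)
            printf("fd %d is ready\n", ready[j].data.fd);
        break;
    }
    close(epfd);
    return 0;
}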
When the NIC receives data, i.e. when an event occurs, the tasks on the wait_queue are woken up one by one and each task's callback is executed; for epoll the registered callback is ep_poll_callback. The core code driving those calls is:
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 *
 * Runs, for each task on the wait queue, the callback registered for its wakeup.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;
        /* run the callback */
        if (curr->func(curr, mode, wake_flags, key) &&
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
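To make that mechanism concrete without dragging in kernel headers, here is a deliberately simplified user-space model of the same idea: each wait entry carries a function pointer, and "waking up" the queue just means walking the list and invoking every entry's callback, which is the shape of __wake_up_common above. This is only an illustration of the pattern, not kernel code:

#include <stdio.h>

/* hypothetical, simplified model of a wait queue entry: a callback plus a link */
struct wait_entry {
    int (*func)(struct wait_entry *self);   /* callback run on wakeup, like wait_queue_t::func */
    struct wait_entry *next;
};

/* walk the queue and run every entry's callback */
static void wake_up_all(struct wait_entry *head)
{
    for (struct wait_entry *cur = head; cur; cur = cur->next)
        cur->func(cur);
}

static int on_wake(struct wait_entry *self)
{
    printf("entry %p woken, running its callback\n", (void *)self);
    return 1;
}

int main(void)
{
    struct wait_entry b = { on_wake, NULL };
    struct wait_entry a = { on_wake, &b };
    wake_up_all(&a);    /* in epoll, ep_poll_callback is the func registered on the socket's wait queue */
    return 0;
}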
Here is what debugging this looks like on Windows with vmware + kali + qemu + vscode:
A breakpoint on the epoll callback: the upper-left pane shows the values of the variables, and the lower-left pane shows the call stack. This is especially useful in reverse engineering — for example, breaking on malloc reveals which function is allocating the memory that will later hold encrypted data.
3. A complete epoll demo, to make its behavior easier to grasp:
#include <iostream>
#include <string>
#include <cstring>
#include <cstdlib>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <strings.h>
#include <stdio.h>
#include <errno.h>

using namespace std;

#define MAXLINE 100
#define OPEN_MAX 100
#define LISTENQ 20
#define SERV_PORT 5000
#define INFTIM 1000

void setnonblocking(int sock)
{
    int opts;
    opts = fcntl(sock, F_GETFL);
    if (opts < 0)
    {
        perror("fcntl(sock,GETFL)");
        exit(1);
    }
    opts = opts | O_NONBLOCK;
    if (fcntl(sock, F_SETFL, opts) < 0)
    {
        perror("fcntl(sock,SETFL,opts)");
        exit(1);
    }
}

int main(int argc, char* argv[])
{
    int i, maxi, listenfd, connfd, sockfd, epfd, nfds, portnumber;
    ssize_t n;
    char line[MAXLINE];
    socklen_t clilen;
    string szTemp("");

    if (2 == argc)
    {
        if ((portnumber = atoi(argv[1])) < 0)
        {
            fprintf(stderr, "Usage:%s portnumber\a\n", argv[0]);
            return 1;
        }
    }
    else
    {
        fprintf(stderr, "Usage:%s portnumber\a\n", argv[0]);
        return 1;
    }

    // declare epoll_event variables: ev is used to register events,
    // the array receives the events that need to be handled
    struct epoll_event ev, events[20];

    // create an epoll handle; size tells the kernel roughly how many fds will be monitored
    epfd = epoll_create(256);

    // this epoll fd will, among other things, watch the listening socket for accept
    struct sockaddr_in clientaddr;
    struct sockaddr_in serveraddr;
    listenfd = socket(AF_INET, SOCK_STREAM, 0);

    // optionally make the socket non-blocking
    //setnonblocking(listenfd);

    // the fd to monitor
    ev.data.fd = listenfd;
    // the events to monitor: readable when data/connections arrive, edge-triggered
    ev.events = EPOLLIN | EPOLLET;
    // register the epoll event, i.e. the fd to monitor plus the events, passed in via ev;
    // under the hood this is an insert/delete/update on the red-black tree
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev);

    bzero(&serveraddr, sizeof(serveraddr));
    /* fill in the server socket address */
    serveraddr.sin_family = AF_INET;
    const char *local_addr = "127.0.0.1";
    inet_aton(local_addr, &(serveraddr.sin_addr));
    serveraddr.sin_port = htons(portnumber);
    bind(listenfd, (sockaddr *)&serveraddr, sizeof(serveraddr));
    listen(listenfd, LISTENQ);

    maxi = 0;
    for ( ; ; )
    {
        // wait for epoll events;
        // returns the number of events to handle (nfds); 0 means the call timed out
        nfds = epoll_wait(epfd, events, 20, 500);

        // handle every event that occurred
        for (i = 0; i < nfds; ++i)
        {
            // a new client connected to the listening port: establish the connection
            if (events[i].data.fd == listenfd)
            {
                // call accept only after a connection request has arrived, so it never blocks pointlessly
                clilen = sizeof(clientaddr);
                connfd = accept(listenfd, (sockaddr *)&clientaddr, &clilen);
                if (connfd < 0)
                {
                    perror("connfd < 0");
                    exit(1);
                }
                //setnonblocking(connfd);

                const char *str = inet_ntoa(clientaddr.sin_addr);
                cout << "accept a connection from " << str << endl;

                // fd used for read operations
                ev.data.fd = connfd;
                // register the read event
                ev.events = EPOLLIN | EPOLLET;
                // register ev
                epoll_ctl(epfd, EPOLL_CTL_ADD, connfd, &ev); /* the new fd also needs to be monitored
                                                                for incoming data, edge-triggered, etc. */
            }
            // an already-connected client sent data: read it
            else if (events[i].events & EPOLLIN)
            {
                cout << "EPOLLIN" << endl;
                if ((sockfd = events[i].data.fd) < 0)
                    continue;

                // call recv only after the NIC has received data, so copying to user space never blocks pointlessly
                if ((n = recv(sockfd, line, sizeof(line) - 1, 0)) < 0)
                {
                    // Connection reset: the peer has already closed the connection,
                    // yet we are still trying to read/write on the dead socket fd
                    if (errno == ECONNRESET)
                    {
                        close(sockfd);
                        events[i].data.fd = -1;
                    }
                    else
                        std::cout << "readline error" << std::endl;
                    continue;
                }
                else if (n == 0) // the peer closed the connection, nothing left to read
                {
                    close(sockfd);
                    events[i].data.fd = -1;
                    continue;
                }
                line[n] = '\0';
                szTemp = "";
                szTemp += line;
                szTemp = szTemp.substr(0, szTemp.find('\r')); /* remove the enter key */
                memset(line, 0, MAXLINE);                     /* clear the buffer */
                cout << "Readin: " << szTemp << endl;

                // fd used for write operations
                ev.data.fd = sockfd;
                // register the write event
                ev.events = EPOLLOUT | EPOLLET;
                // change the monitored event on sockfd to EPOLLOUT, i.e. the fd is writable and data can be sent
                epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); /* update the red-black tree */
            }
            else if (events[i].events & EPOLLOUT) // there is data to send back
            {
                sockfd = events[i].data.fd;
                szTemp = "Server:" + szTemp + "\n";
                send(sockfd, szTemp.c_str(), szTemp.size(), 0);

                // fd used for read operations
                ev.data.fd = sockfd;
                // register the read event
                ev.events = EPOLLIN | EPOLLET;
                // change the monitored event on sockfd back to EPOLLIN, i.e. the fd can be read again
                epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev); /* update the red-black tree */
            }
        } // end: handle all events that occurred
    } // end: wait for epoll events
    close(epfd);
    return 0;
}
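To try the demo: build it with a plain g++ invocation (e.g. g++ epoll_demo.cpp -o epoll_demo, filename assumed here), start it with a port number argument such as ./epoll_demo 5000, and connect with a line-based client like nc or telnet to 127.0.0.1 on that port; every line sent is echoed back prefixed with "Server:". Running it under strace also makes the epoll_create / epoll_ctl / epoll_wait sequence discussed above directly visible.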
Summary:
1. The role of the red-black tree: when an event occurs, the corresponding epitem can be looked up quickly by fd (the epitems found are linked into a list and handed to user space for further processing), which is far faster than walking a linked list.
2. Where linked lists fit in the kernel: as queues or stacks, where every stored node is going to be processed anyway (i.e. the list will be traversed in full) and there is no need for keyed lookup.
3. At the lowest level, epoll events are ultimately driven by interrupts: when the NIC receives data, it notifies the OS via an interrupt to fetch the data, and that in turn triggers the epoll event.