[转] epoll 实现

转自: http://hi.baidu.com/rwen2012/blog/item/0f2f8c13eb7f3621dd5401a8.html

/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
   /* Protects access to this structure (taken with IRQs saved by its users) */
   rwlock_t lock;

   /*
   * This semaphore is used to ensure that files are not removed
   * while epoll is using them. It is read-held during the event
   * collection loop and it is write-held during the file cleanup
   * path, the epoll file exit code and the ctl operations.
   */
   struct rw_semaphore sem;

   /* Wait queue used by sys_epoll_wait(); sleepers are queued by ep_poll() */
   wait_queue_head_t wq;

   /* Wait queue used by file->poll() on the eventpoll file itself */
   wait_queue_head_t poll_wait;

   /* List of ready file descriptors (items are linked through epitem.rdllink) */
   struct list_head rdllist;

   /* RB-Tree root used to store monitored fd structs */
   struct rb_root rbr;
};

/*
* Kernel side of the userspace epoll_create(2): builds a new
* "struct eventpoll" and hands back a file descriptor that refers to it.
*
* "size" is only a hint about how many descriptors the caller expects
* to store. Apart from the "must be positive" sanity check it is not
* used here, and it does not limit how many descriptors can be added
* later on.
*
* Returns the new file descriptor on success, -EINVAL when size <= 0,
* or the error reported by ep_alloc() / ep_getfd().
*/
asmlinkage long sys_epoll_create(int size)
{
   int newfd = -1;
   int rc;
   struct eventpoll *ep;
   struct inode *ep_inode;
   struct file *ep_file;

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
             current, size));

   /* Reject a non-positive size hint before allocating anything */
   if (size <= 0) {
       rc = -EINVAL;
       goto out_fail;
   }

   /* Allocate and initialize the internal "struct eventpoll" */
   rc = ep_alloc(&ep);
   if (rc != 0)
       goto out_fail;

   /*
   * Create everything needed to expose the eventpoll as a file: a file
   * structure, an inode and a free file descriptor. ep_getfd() is
   * expected to attach ep to the new file (sys_epoll_ctl() and
   * sys_epoll_wait() read it back from file->private_data).
   */
   rc = ep_getfd(&newfd, &ep_inode, &ep_file, ep);
   if (rc != 0) {
       /* No file owns ep yet, so tear it down here */
       ep_free(ep);
       kfree(ep);
       goto out_fail;
   }

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, newfd));

   return newfd;

out_fail:
   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, rc));
   return rc;
}



/*
* Event description exchanged with user space: copied in by
* sys_epoll_ctl() and written back field by field by ep_send_events().
*/
struct epoll_event {
   __u32 events;   /* Mask of event bits (interest set on input, ready set on output) */
   __u64 data;     /* Caller-supplied value, handed back unchanged with each event */
} EPOLL_PACKED;

/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked into the eventpoll RB-tree.
*/
struct epitem {
   /* RB-Tree node used to link this structure to the eventpoll rb-tree */
   struct rb_node rbn;

   /* List header used to link this structure to the eventpoll ready list */
   struct list_head rdllink;

   /* The file descriptor information this item refers to */
   struct epoll_filefd ffd;

   /*
   * Number of active wait queues attached to poll operations.
   * ep_ptable_queue_proc() sets it to -1 to signal an allocation failure.
   */
   int nwait;

   /* List containing poll wait queues */
   struct list_head pwqlist;

   /* The "container" of this item */
   struct eventpoll *ep;

   /* The structure that describes the interested events and the source fd */
   struct epoll_event event;

   /*
   * Used to keep track of the usage count of the structure. This prevents
   * the structure from disappearing from underneath our processing.
   */
   atomic_t usecnt;

   /* List header used to link this item to the "struct file" items list */
   struct list_head fllink;

   /* List header used to link the item to the transfer list */
   struct list_head txlink;

   /*
   * Event bits used during the collection/transfer of events to
   * userspace. ep_collect_ready_items() seeds it with event.events and
   * ep_send_events() narrows it to the bits poll actually reported.
   */
   unsigned int revents;
};


/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set. It represents
* the kernel part of the user space epoll_ctl(2).
*
* epfd  - descriptor of the eventpoll file
* op    - EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
* fd    - descriptor of the target file
* event - user space event description; it is only copied in when
*         ep_op_hash_event(op) says the operation carries one
*
* Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
*   -EFAULT  the event could not be copied from user space
*   -EBADF   epfd or fd does not resolve to an open file
*   -EPERM   the target file has no poll operation
*   -EINVAL  epfd is not an eventpoll file, epfd and fd are the same
*            file, or op is not one of the three known operations
*   -EEXIST  ADD of an item that is already present
*   -ENOENT  DEL/MOD of an item that is not present
*/
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
   int error;
   struct file *file, *tfile;
   struct eventpoll *ep;
   struct epitem *epi; /* one epitem per monitored fd; it is what ADD/DEL/MOD operate on */
   struct epoll_event epds;

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));

   error = -EFAULT;
   if (ep_op_hash_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
       goto eexit_1;

   /* Get the "struct file *" for the eventpoll file */
   error = -EBADF;
   file = fget(epfd); /* file structure of the eventpoll itself */
   if (!file)
       goto eexit_1;

   /* Get the "struct file *" for the target file */
   tfile = fget(fd); /* file structure of the descriptor being operated on */
   if (!tfile)
       goto eexit_2;

   /* The target file descriptor must support poll */
   error = -EPERM;
   if (!tfile->f_op || !tfile->f_op->poll) /* the target file must implement poll */
       goto eexit_3;

   /*
   * We have to check that the file structure underneath the file descriptor
   * the user passed to us _is_ an eventpoll file. And also we do not permit
   * adding an epoll file descriptor inside itself.
   */
   error = -EINVAL;
   /*
   * An eventpoll file cannot be added to itself; adding it to a different
   * epoll/poll/select set is not rejected here.
   */
   if (file == tfile || !is_file_epoll(file))
       goto eexit_3;

   /*
   * At this point it is safe to assume that the "private_data" contains
   * our own data structure.
   */
   ep = file->private_data; /* fetch the eventpoll structure */

   down_write(&ep->sem);

   /*
   * Try to look up the (tfile, fd) pair among the items already tracked
   * by ep. On a hit ep_find() returns the item with its usage count
   * incremented.
   */
   epi = ep_find(ep, tfile, fd);

   /* An unrecognized op falls through the switch and reports -EINVAL */
   error = -EINVAL;
   switch (op) {
   case EPOLL_CTL_ADD: /* insert */
       if (!epi) {
           /* POLLERR and POLLHUP are always monitored */
           epds.events |= POLLERR | POLLHUP;

           error = ep_insert(ep, &epds, tfile, fd);
       } else
           error = -EEXIST; /* already present: the insert fails */
       break;
   case EPOLL_CTL_DEL: /* remove */
       if (epi)
           error = ep_remove(ep, epi);
       else
           error = -ENOENT;
       break;
   case EPOLL_CTL_MOD: /* modify */
       if (epi) {
           epds.events |= POLLERR | POLLHUP;
           error = ep_modify(ep, epi, &epds);
       } else
           error = -ENOENT;
       break;
   }

   /*
   * The function ep_find() increments the usage count of the structure
   * so, if this is not NULL, we need to release it.
   */
   if (epi)
       ep_release_epitem(epi); /* drops that reference; presumably frees epi once the count reaches zero */


   up_write(&ep->sem);

   /* Unwind in reverse order of acquisition */
eexit_3:
   fput(tfile);
eexit_2:
   fput(file);
eexit_1:
   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, error));

   return error;
}


/*
* Wrapper struct used by poll queueing: it pairs the poll_table handed to
* f_op->poll() with the epitem being registered, so that the queueing
* callback can get back to the item via ep_item_from_epqueue().
*/
struct ep_pqueue {
   poll_table pt;        /* table passed to the target file's poll method */
   struct epitem *epi;   /* item on whose behalf the poll hooks are installed */
};

/*
* Create a new epitem for (tfile, fd), hook it into the target file's
* wait queues and link it into the eventpoll "ep".
*
* ep    - eventpoll the item is added to
* event - kernel copy of the user's event description
* tfile - target file (the caller holds a reference to it)
* fd    - target file descriptor number
*
* Returns 0 on success, or -ENOMEM when either the item itself or one of
* its poll wait queue entries could not be allocated.
*/
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd)
{
   int error, revents, pwake = 0;
   unsigned long flags;
   struct epitem *epi;
   struct ep_pqueue epq;

   error = -ENOMEM; /* stays -ENOMEM for both failure exits below */
   if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
       goto eexit_1;

   /* Item initialization follows here ... */
   ep_rb_initnode(&epi->rbn);
   INIT_LIST_HEAD(&epi->rdllink);
   INIT_LIST_HEAD(&epi->fllink);
   INIT_LIST_HEAD(&epi->txlink);
   INIT_LIST_HEAD(&epi->pwqlist);
   epi->ep = ep;   /* owning eventpoll */
   ep_set_ffd(&epi->ffd, tfile, fd); /* the file/descriptor being monitored */
   epi->event = *event; /* events the user is interested in */
   atomic_set(&epi->usecnt, 1);
   epi->nwait = 0;


   /*
   * Note the initialization below: the poll_table is set up with
   * ep_ptable_queue_proc() as its queueing callback. Per the original
   * annotation this mirrors how poll() installs __pollwait(), and the
   * callback is reached through poll_wait(), which drivers and
   * filesystems call from their file_operations->poll method.
   */

   /* Initialize the poll table using the queue callback */
   epq.epi = epi;
   init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

   /*
   * Attach the item to the poll hooks and get current event bits.
   * We can safely use the file* here because its usage count has
   * been increased by the caller of this function.

   */

 

   revents = tfile->f_op->poll(tfile, &epq.pt);

 


   /*
   * We have to check if something went wrong during the poll wait queue
   * install process. Namely an allocation for a wait queue failed due
   * high memory pressure.
   */
   if (epi->nwait < 0)
       goto eexit_2;

   /* Add the current item to the list of active epoll hook for this file */
   spin_lock(&tfile->f_ep_lock);
   list_add_tail(&epi->fllink, &tfile->f_ep_links);
   spin_unlock(&tfile->f_ep_lock);

   /* We have to drop the new item inside our item list to keep track of it */
   write_lock_irqsave(&ep->lock, flags);

   /* Add the current item to the rb-tree */
   ep_rbtree_insert(ep, epi); /* insert into the tree */


   /*
   * If the file is already "ready" we drop it inside the ready list,
   * unless a callback has already linked it there.
   */
   if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
       list_add_tail(&epi->rdllink, &ep->rdllist);

       /* Notify waiting tasks that events are available */
       if (waitqueue_active(&ep->wq)) /* someone sleeps in epoll_wait: wake them */
           __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
       if (waitqueue_active(&ep->poll_wait))
           pwake++;
   }

   write_unlock_irqrestore(&ep->lock, flags);

   /* We have to call this outside the lock */
   if (pwake)
       ep_poll_safewake(&psw, &ep->poll_wait); /* psw is defined outside this excerpt */

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
             current, ep, tfile, fd));

   return 0;

eexit_2:
   /* Undo the wait queue hooks installed while polling the target file */
   ep_unregister_pollwait(ep, epi);

   /*
   * We need to do this because an event could have arrived on some
   * allocated wait queue.
   */
   write_lock_irqsave(&ep->lock, flags);
   if (ep_is_linked(&epi->rdllink))
       ep_list_del(&epi->rdllink);
   write_unlock_irqrestore(&ep->lock, flags);

   kmem_cache_free(epi_cache, epi);
eexit_1:
   return error;
}


/* Install "qproc" as the queueing callback of the poll table "pt". */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
   (*pt).qproc = qproc;
}
/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists. ep_insert() installs it in the poll table
* it passes to the target file's poll method.
*
* What gets queued on "whead" is not a sleeping task but a wait entry
* whose wakeup function is ep_poll_callback(), so a wakeup on the target
* file ends up running that callback for the item.
*
* Failure is reported through the item: nwait is set to -1 when the
* entry cannot be allocated, and once it is negative every later call
* for the same item is a no-op that keeps it at -1.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
               poll_table *pt)
{
   struct epitem *item = ep_item_from_epqueue(pt);
   struct eppoll_entry *entry;

   if (item->nwait < 0 ||
        !(entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
       /* We have to signal that an error occurred */
       item->nwait = -1;
       return;
   }

   /* Wakeups on this entry are routed to ep_poll_callback() */
   init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
   entry->whead = whead;
   entry->base = item;

   /* Hook the entry onto the target file's wait queue */
   add_wait_queue(whead, &entry->wait);

   /*
   * Remember the entry on the item (pwqlist is what
   * ep_unregister_pollwait() needs to detach it again later).
   */
   list_add_tail(&entry->llink, &item->pwqlist);

   item->nwait++;
}


/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*
* This is where epoll differs from poll/select: instead of scanning
* every monitored descriptor, the target file's wake_up() runs this
* callback, which puts just the signalling item on the ready list.
* Waiters therefore only have to look at the ready list.
*
* Always returns 1.
*/
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
   struct epitem *item = ep_item_from_wait(wait);
   struct eventpoll *owner = item->ep;
   unsigned long irqflags;
   int notify_pollers = 0;

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
             current, item->ffd.file, item, owner));

   write_lock_irqsave(&owner->lock, irqflags);

   /*
   * If the event mask does not contain any poll(2) event, we consider the
   * descriptor to be disabled. This condition is likely the effect of the
   * EPOLLONESHOT bit that disables the descriptor when an event is received,
   * until the next EPOLL_CTL_MOD will be issued.
   */
   if (item->event.events & ~EP_PRIVATE_BITS) {
       /* Queue the item as ready unless it is already on the ready list */
       if (!ep_is_linked(&item->rdllink))
           list_add_tail(&item->rdllink, &owner->rdllist);

       /*
       * Wake up ( if active ) both the eventpoll wait list and the
       * ->poll() wait list. The latter is only flagged here and woken
       * after the lock has been dropped.
       */
       if (waitqueue_active(&owner->wq))
           __wake_up_locked(&owner->wq, TASK_UNINTERRUPTIBLE |
                   TASK_INTERRUPTIBLE);
       if (waitqueue_active(&owner->poll_wait))
           notify_pollers = 1;
   }

   write_unlock_irqrestore(&owner->lock, irqflags);

   /* We have to call this outside the lock */
   if (notify_pollers)
       ep_poll_safewake(&psw, &owner->poll_wait);

   return 1;
}



======================================================================

/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*
* Returns whatever ep_poll() returns for a valid request, or:
*   -EINVAL  maxevents is not in 1..EP_MAX_EVENTS, or epfd is not an
*            eventpoll file
*   -EFAULT  the user buffer fails the write access check
*   -EBADF   epfd does not resolve to an open file
*/
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                   int maxevents, int timeout)
{
   int rc;
   struct file *epfile;

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

   /* The maximum number of event must be greater than zero */
   if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
       return -EINVAL;

   /* Verify that the area passed by the user is writeable */
   if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
       rc = -EFAULT;
       goto out_log;
   }

   /* Get the "struct file *" for the eventpoll file */
   epfile = fget(epfd);
   if (!epfile) {
       rc = -EBADF;
       goto out_log;
   }

   /*
   * The file underneath the fd must be an eventpoll file; only then is
   * it safe to treat "private_data" as our own data structure and go
   * fishing for events.
   */
   if (is_file_epoll(epfile))
       rc = ep_poll(epfile->private_data, events, maxevents, timeout);
   else
       rc = -EINVAL;

   fput(epfile);
out_log:
   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, rc));

   return rc;
}


/*
* Wait until the eventpoll "ep" has ready items (or the timeout expires,
* or a signal is pending) and then try to deliver them to user space.
*
* ep        - eventpoll to wait on
* events    - user space array receiving the events
* maxevents - capacity of that array
* timeout   - milliseconds to wait; a negative value (or one at or above
*             EP_MAX_MSTIMEO) waits without a time limit, 0 does not sleep
*
* Returns the value of ep_events_transfer() when there was something to
* transfer, 0 when the wait ended with nothing ready, or -EINTR when a
* signal was pending while waiting.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
   int res, eavail;
   unsigned long flags;
   long jtimeout;
   wait_queue_t wait;

   /*
   * Calculate the timeout by checking for the "infinite" value ( -1 )
   * and the overflow condition. The passed timeout is in milliseconds,
   * hence the conversion to jiffies, rounded up: (t * HZ + 999) / 1000.
   */
   jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
       MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
   write_lock_irqsave(&ep->lock, flags);

   res = 0;
   /*
   * With no ready descriptors we sleep until woken. The wakeup comes
   * from ep_poll_callback(), which was hooked onto the target files'
   * wait queues when the items were inserted.
   */
   if (list_empty(&ep->rdllist)) {
       /*
       * We don't have any available event to return to the caller.
       * We need to sleep here, and we will be woken up by
       * ep_poll_callback() when events become available.
       */
       init_waitqueue_entry(&wait, current);
       __add_wait_queue(&ep->wq, &wait); /* epoll's own wait queue, not the target file's */

       for (;;) {
           /*
           * We don't want to sleep if the ep_poll_callback() sends us
           * a wakeup in between. That's why we set the task state
           * to TASK_INTERRUPTIBLE before doing the checks.
           */
           set_current_state(TASK_INTERRUPTIBLE);
           if (!list_empty(&ep->rdllist) || !jtimeout)
               break;
           if (signal_pending(current)) {
               res = -EINTR;
               break;
           }

           /* Drop the lock while sleeping; keep the remaining time for the next round */
           write_unlock_irqrestore(&ep->lock, flags);
           jtimeout = schedule_timeout(jtimeout);
           write_lock_irqsave(&ep->lock, flags);
       }
       __remove_wait_queue(&ep->wq, &wait);

       set_current_state(TASK_RUNNING);
   }

   /* Is it worth to try to dig for events ? */
   eavail = !list_empty(&ep->rdllist);

   write_unlock_irqrestore(&ep->lock, flags);

   /*
   * Try to transfer events to user space. In case we get 0 events and
   * there's still timeout left over, we go trying again in search of
   * more luck.
   */
   if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
       goto retry;

   return res;
}


/*
* Perform the transfer of events to user space.
*
* Ready items are first moved to a private list, then reported to the
* caller's buffer and finally handed to ep_reinject_items().
*
* Returns the value of ep_send_events(), or 0 when nothing was collected.
*/
static int ep_events_transfer(struct eventpoll *ep,
                  struct epoll_event __user *events, int maxevents)
{
   int nsent = 0;
   struct list_head pending;

   INIT_LIST_HEAD(&pending);

   /*
   * We need to lock this because we could be hit by
   * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
   */
   down_read(&ep->sem);

   /* Collect/extract ready items; nothing collected means nothing to send */
   if (ep_collect_ready_items(ep, &pending, maxevents) <= 0)
       goto unlock;

   /* Build result set in userspace */
   nsent = ep_send_events(ep, &pending, events);

   /* Reinject ready items into the ready list */
   ep_reinject_items(ep, &pending);

unlock:
   up_read(&ep->sem);

   return nsent;
}


/*
* Since we have to release the lock during the __copy_to_user() operation and
* during the f_op->poll() call, we try to collect the maximum number of items
* by reducing the irqlock/irqunlock switching rate.
*
* Moves up to "maxevents" items from ep->rdllist onto "txlist" and
* returns how many were moved.
*/
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
   int collected = 0;
   unsigned long irqflags;
   struct list_head *ready = &ep->rdllist;
   struct list_head *pos;
   struct epitem *item;

   write_lock_irqsave(&ep->lock, irqflags);

   pos = ready->next;
   while (pos != ready && collected < maxevents) {
       item = list_entry(pos, struct epitem, rdllink);

       /* Step forward first: the item may be unlinked just below */
       pos = pos->next;

       /* Leave alone any item that is already on a transfer list */
       if (ep_is_linked(&item->txlink))
           continue;

       /*
       * This is initialized in this way so that the default
       * behaviour of the reinjecting code will be to push back
       * the item inside the ready list.
       */
       item->revents = item->event.events;

       /* Link the ready item into the transfer list */
       list_add(&item->txlink, txlist);
       collected++;

       /* Unlink the item from the ready list */
       ep_list_del(&item->rdllink);
   }

   write_unlock_irqrestore(&ep->lock, irqflags);

   return collected;
}


/*
* This function is called without holding the "ep->lock" since the call to
* __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
* because of the way poll() is traditionally implemented in Linux.
*
* Re-polls every item on "txlist" and writes those that still report an
* interesting event into the user array "events".
*
* Returns the number of events written, or -EFAULT as soon as a write to
* user space fails.
*/
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
              struct epoll_event __user *events)
{
   int nsent = 0;
   unsigned int ready;
   struct list_head *pos;
   struct epitem *item;
   struct file *target;

   /*
   * We can loop without lock because this is a task private list.
   * The test done during the collection loop will guarantee us that
   * another task will not try to collect this file. Also, items
   * cannot vanish during the loop because we are holding "sem".
   */
   list_for_each(pos, txlist) {
       item = list_entry(pos, struct epitem, txlink);
       target = item->ffd.file;

       /*
       * Get the ready file event set. We can safely use the file
       * because we are holding the "sem" in read and this will
       * guarantee that both the file and the item will not vanish.
       */
       ready = target->f_op->poll(target, NULL);

       /*
       * Set the return event set for the current file descriptor,
       * keeping only the bits the user asked for. Note that only the
       * task that was successfully able to link the item to its
       * "txlist" will write this field.
       */
       item->revents = ready & item->event.events;
       if (!item->revents)
           continue;

       /* Copy the event out to the next free slot of the user array */
       if (__put_user(item->revents, &events[nsent].events) ||
            __put_user(item->event.data, &events[nsent].data))
           return -EFAULT;

       /* One-shot items keep only their private bits once reported */
       if (item->event.events & EPOLLONESHOT)
           item->event.events &= EP_PRIVATE_BITS;
       nsent++;
   }
   return nsent;
}



/*
* File callbacks that implement the eventpoll file behaviour.
*
* ep_eventpoll_poll() is an ordinary file_operations->poll method, so it
* also serves as an example of what a driver or filesystem poll
* implementation looks like. Per the original annotation, giving the
* eventpoll file its own operations table serves two purposes:
*  - sys_epoll_wait() can recognize an eventpoll descriptor
*    (is_file_epoll() reportedly compares f->f_op with &eventpoll_fops);
*  - the eventpoll descriptor can itself be added to another
*    epoll/poll/select descriptor set.
*/
static const struct file_operations eventpoll_fops = {
   .poll       = ep_eventpoll_poll,
   .release   = ep_eventpoll_close,
};


/*
* poll method of the eventpoll file itself: registers the caller on
* ep->poll_wait and reports the file as readable (POLLIN | POLLRDNORM)
* whenever the ready list is not empty, 0 otherwise.
*/
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
   struct eventpoll *ep = file->private_data;
   unsigned long irqflags;
   unsigned int mask;

   /* Insert inside our poll wait queue */
   poll_wait(file, &ep->poll_wait, wait);

   /* Check our condition under the read side of the lock */
   read_lock_irqsave(&ep->lock, irqflags);
   mask = list_empty(&ep->rdllist) ? 0 : (POLLIN | POLLRDNORM);
   read_unlock_irqrestore(&ep->lock, irqflags);

   return mask;
}
posted @ 2010-07-19 23:10  napoleon_liu  阅读(738)  评论(0编辑  收藏  举报