Linux epoll源码注释

Linux系统运行源码剖析-epoll代码注释

理解了中断、等待队列、调度，你就能懂Linux的80%。

--老子

转发的话，请注明出处哦：http://www.cnblogs.com/stonehat/
Linux系统内核提供了三个系统调用:

include/linux/syscalls.h

// epoll_create: creates an epoll descriptor
asmlinkage long sys_epoll_create(int size);
// epoll_ctl: operates on an epoll descriptor (add / delete / modify a watched fd)
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
				struct epoll_event __user *event);
// epoll_wait: waits on an epoll descriptor and hands events back through the user buffer
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
				int maxevents, int timeout);

其函数实现在fs/eventpoll.c

eventpoll 本身也是一个支持poll操作的文件，所以可以把eventpoll组成一个树形关系。

下面分别按照sys_epoll_create，sys_epoll_ctl，sys_epoll_wait的顺序分析三个系统调用。

重要的结构体

// struct eventpoll: per-epoll-instance state, reached through the epoll file's private_data
struct eventpoll {
	/* Protects access to this structure */
	rwlock_t lock;

	/*
	 * Kernel read/write semaphore used for synchronization
	 * (sys_epoll_ctl holds it for writing around its operation).
	 */
	struct rw_semaphore sem;

	/*
	 * Wait queue used by epoll_wait(): the calling thread is queued here.
	 */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll(); for epoll that function is ep_eventpoll_poll */
	wait_queue_head_t poll_wait;

	/* List of ready items */
	struct list_head rdllist;

	/* Red-black tree root; ep_insert() adds every epitem to it */
	struct rb_root rbr;
};

// struct file: the kernel's representation of an open file
struct file {
	struct list_head	f_list;
	struct dentry		*f_dentry;
	struct vfsmount         *f_vfsmnt;
	// Pointer to the file-operations table
	struct file_operations	*f_op;
	atomic_t		f_count;
	unsigned int 		f_flags;
	mode_t			f_mode;
	int			f_error;
	loff_t			f_pos;
	struct fown_struct	f_owner;
	unsigned int		f_uid, f_gid;
	struct file_ra_state	f_ra;

	unsigned long		f_version;
	void			*f_security;

	/* Private, owner-defined data attached to the file (epoll stores its struct eventpoll here) */
	void			*private_data;

#ifdef CONFIG_EPOLL
	/* Used by fs/eventpoll.c to link all the hooks to this file */
	struct list_head	f_ep_links;
	/* Taken by ep_insert() around updates of f_ep_links */
	spinlock_t		f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
	struct address_space	*f_mapping;
};

/* struct file_operations: table of function pointers implementing the operations on a file */
struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
	int (*readdir) (struct file *, void *, filldir_t);
	/*
	 * Non-blocking check of the file's state (readable, writable, ...).
	 * If the condition is not met, pt is used to register on a wait queue.
	 * (This is the usual logic; the actual behaviour is up to the device driver.)
	 */
	unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);
	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, struct dentry *, int datasync);
	int (*aio_fsync) (struct kiocb *, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*dir_notify)(struct file *filp, unsigned long arg);
	int (*flock) (struct file *, int, struct file_lock *);
};

概念与关系

文件描述符fd：进程打开的文件的数字代表形式，是文件指针的索引。
struct file：在内核中表示进程打开的文件。task.files[fd]=file
struct inode：静态的文件表示。

一. sys_epoll_create

代码如下：为了方便理解原理，无关紧要的代码逻辑和异常处理删掉了


/*
 * sys_epoll_create - abridged listing: the "....." lines stand for code
 * (argument checks, error handling) that the article removed, which is why
 * the values assigned to error are never tested here.
 * Returns the new file descriptor.
 */
asmlinkage long sys_epoll_create(int size)
{
	int error, fd;
	struct inode *inode;
	struct file *file;
	
	.....
      
	/*
	 * Create a new file and inode and obtain the fd for that file;
	 * the file is installed into the current process's open-file table.
	 */
	error = ep_getfd(&fd, &inode, &file);
	/* Create the struct eventpoll and hang it on the file's private_data pointer */
	error = ep_file_init(file);
	.....
	return fd;
}

ep_getfd简单流程



/*
 * ep_getfd - set up the file, inode and descriptor backing a new epoll instance.
 *
 * On success the descriptor, inode and file are stored through efd, einode
 * and efile and 0 is returned.  On failure the error code is returned and the
 * eexit_* labels release, in reverse order, whatever had been acquired.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
	struct qstr this;
	char name[32];
	struct dentry *dentry;
	struct inode *inode;
	struct file *file;
	int error, fd;

	/* Get a ready-to-use file */
	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto eexit_1;

	/* Allocates an inode from the eventpoll file system */
	inode = ep_eventpoll_inode();
	error = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto eexit_2;

	/* Allocates a free descriptor to plug the file onto */
	error = get_unused_fd();
	if (error < 0)
		goto eexit_3;
	fd = error;

	/*
	 * Link the inode to a directory entry by creating a unique name
	 * using the inode number.
	 */
	error = -ENOMEM;
	sprintf(name, "[%lu]", inode->i_ino);
	this.name = name;
	this.len = strlen(name);
	this.hash = inode->i_ino;
	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto eexit_4;
	dentry->d_op = &eventpollfs_dentry_operations;
	d_add(dentry, inode);
	file->f_vfsmnt = mntget(eventpoll_mnt);
	file->f_dentry = dentry;
	file->f_mapping = inode->i_mapping;

	/* Read-only file whose operations are the epoll ones; private_data is filled in later */
	file->f_pos = 0;
	file->f_flags = O_RDONLY;
	file->f_op = &eventpoll_fops;
	file->f_mode = FMODE_READ;
	file->f_version = 0;
	file->private_data = NULL;

	/* Install the new setup file into the allocated fd. */
	fd_install(fd, file);

	*efd = fd;
	*einode = inode;
	*efile = file;
	return 0;

	/* Error unwinding: each label undoes one acquisition and falls through to the next */
eexit_4:
	put_unused_fd(fd);
eexit_3:
	iput(inode);
eexit_2:
	put_filp(file);
eexit_1:
	return error;
}

查找一个没有用的文件描述符。记为fd
创建一个空文件file结构体。记为epfile
在epoll的文件系统中创建一个inode
epfile和inode做关联。
epfile的f_op成员（文件操作指针）和epoll的自定义函数组eventpoll_fops做关联。比较重要的一点是eventpoll_fops有一个自定义的poll函数，这个函数很重要，是实现epoll级联模型的关键。后面可以通过比较f_op是否等于eventpoll_fops来判断file是不是epoll file。

   /* File operations of an epoll file: only release and poll are provided. */
   static struct file_operations eventpoll_fops = {
   	.release	= ep_eventpoll_close,
   	.poll		= ep_eventpoll_poll
   };

将epfile放到进程的打开文件列表中管理，用fd做索引。
初始化eventpoll结构，初始化等待队列和就绪队列等。
将epfile的private_data指向eventpoll结构。方便后面取eventpoll的数据。
返回给调用线程fd。

二、sys_epoll_ctl

sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int error;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	error = -EFAULT;
    // 1. 从用户空间拷贝event数据。
	if (EP_OP_HASH_EVENT(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto eexit_1;

	/* 2. 根据epollfile的文件描述符获得对应的file结构体，内核中fd和file是有一个映射关系的*/
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/* 3. 获得要操作的描述符的file指针，例如socket描述符 */
	tfile = fget(fd);
	if (!tfile)
		goto eexit_2;

	/* 4. 校验tfile是否支持poll操作，必须支持poll才能使用epoll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto eexit_3;

	/*
	 * 5. 校验是否是epoll的file指针
	 */
	error = -EINVAL;
	if (file == tfile || !IS_FILE_EPOLL(file))
		goto eexit_3;

	/*
	 * 6. 取eventpoll，从创建时，我们知道epoll把自己的eventpoll结构体放在file->private_data了里面。
	 */
	ep = file->private_data;
	
	down_write(&ep->sem);

	/* Try to lookup the file inside our hash table */
	epi = ep_find(ep, tfile, fd);
	// 7. 具体的逻辑操作
	error = -EINVAL;
	switch (op) {
    // 添加
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;

			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
    // 删除
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
    // 修改
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}

	/*
	 * The function ep_find() increments the usage count of the structure
	 * so, if this is not NULL, we need to release it.
	 */
	if (epi)
		ep_release_epitem(epi);

	up_write(&ep->sem);

eexit_3:
	fput(tfile);
eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}

上面的逻辑很简单

验证输入有效性

逻辑上，只需要了解添加即可。epoll的添加是理解整个流程的关键

epoll添加

/*
 * ep_insert - add the file tfile (descriptor fd) to the epoll instance ep.
 *
 * Allocates and initializes an epitem, polls the target file once so that
 * ep_ptable_queue_proc() can hook the item onto the file's wait queue(s),
 * links the item to the file and to ep's rb-tree and, if the file is already
 * ready, queues the item on the ready list and wakes the waiters.
 *
 * Returns 0 on success, -ENOMEM if the item or a wait-queue hook could not
 * be allocated.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
    
	struct epitem *epi;
	struct ep_pqueue epq;

	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	EP_RB_INITNODE(&epi->rbn);
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/*
	 * Initialize the poll table: when poll is invoked with it,
	 * ep_ptable_queue_proc() gets called and adds the item to the wait queue.
	 */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll tfile passing epq.pt; the poll implementation is expected to end
	 * up calling ep_ptable_queue_proc().
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);
 
	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Link this item into tfile's list of epoll hooks. */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);

	/*
	 * If the file is already ready for a requested event, queue the item on
	 * the ready list and wake the epoll_wait and poll wait queues.
	 */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * We have to call this outside the lock.
	 * NOTE(review): psw is not declared in this excerpt - presumably
	 * file-scope state; confirm.
	 */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

eexit_2:
	/* Remove the wait-queue hooks installed so far */
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}

整理一下，向epoll添加一个描述符主要步骤如下：

构建epitem，epitem之后会加入到eventpoll.rbr中。
调用init_poll_funcptr，将ep_ptable_queue_proc函数指针赋值给poll_table的qproc，poll_table记为epq.pt，在file的poll函数中，可以传入poll_table作为参数，poll函数会主动调用poll_table的qproc函数。

poll_table的结构体如下：

   /**
    * Callback type used by poll implementations to register on a wait queue.
    * @param f     the file being polled
    * @param whead wait queue head of f
    * @param pt    the poll table that carries this callback
    */
   typedef void (*poll_queue_proc)(struct file *f, wait_queue_head_t *whead, struct poll_table_struct *pt);

   /* Poll table: holds the registration callback invoked through poll_wait() */
   typedef struct poll_table_struct {
   	poll_queue_proc qproc;
   } poll_table;

poll函数原型

   // When the caller passes in a pt structure, the driver's poll function should call
   // poll_table_struct.qproc to register on its wait queue.
   unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);

调用待监控的文件的poll函数，按第2步所说，poll函数规范的实现应该最终会调用到ep_ptable_queue_proc函数，ep_ptable_queue_proc主要是初始化一个等待队列项（以ep_poll_callback为回调函数），然后将等待队列项塞到驱动的等待队列中。等待队列项的结构体和ep_ptable_queue_proc的注释如下：

   /* One entry on a wait queue */
   struct __wait_queue {
       
   	unsigned int flags;
   #define WQ_FLAG_EXCLUSIVE	0x01
       // Task pointer; must be set when func is the default wake-up function.
   	struct task_struct * task;
       // Function executed when the wait queue is woken up
   	wait_queue_func_t func;
       
   	struct list_head task_list;
   };
   typedef struct __wait_queue wait_queue_t;

   /*
    * ep_ptable_queue_proc - poll-table callback installed by ep_insert().
    * Hooks the epitem behind pt onto the wait queue whead with
    * ep_poll_callback as wake-up function.  On allocation failure (or if an
    * earlier call already failed) epi->nwait is set to -1 as error marker.
    */
   static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
   				 poll_table *pt)
   {
       // pt is embedded in a struct ep_pqueue next to the epitem pointer, so the
       // macro can recover the epitem from pt's address.
   	struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
   	struct eppoll_entry *pwq;
   	
   	if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
           // Initialize a wait-queue entry whose wake-up function is ep_poll_callback.
           // This is the key step: ep_poll_callback runs when the wait queue is woken.
   		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
   		pwq->whead = whead;
   		pwq->base = epi;
           // Add the freshly created entry to the wait queue.
   		add_wait_queue(whead, &pwq->wait);
   		list_add_tail(&pwq->llink, &epi->pwqlist);
   		epi->nwait++;
   	} else {
   		/* We have to signal that an error occurred */
   		epi->nwait = -1;
   	}
   }

   /*
    * Prepare the wait-queue entry q so that func is run on wake-up:
    * no flags, no task attached, func stored as the wake-up function.
    */
   static inline void init_waitqueue_func_entry(wait_queue_t *q,
   					wait_queue_func_t func)
   {
   	q->func = func;
   	q->task = NULL;
   	q->flags = 0;
   }

至此，添加一个文件描述符到epoll监控内的流程完成了。总的来讲，就是在对应file的等待队列中挂上一个等待队列项，等待其回调函数ep_poll_callback被触发。至于对应的file用什么机制来确保文件异步就绪，epoll不管。不过一般是通过中断来实现的。

epoll模型的poll函数实现：

/*
 * structures and helpers for f_op->poll implementations
 */
/* Callback a poll implementation invokes (via poll_wait) to register on a wait queue */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/* Poll table: carries the registration callback */
typedef struct poll_table_struct {
	poll_queue_proc qproc;
} poll_table;
/*
 * poll_wait - forward to the poll table's qproc callback (for epoll that is
 * ep_ptable_queue_proc).  Does nothing when either the poll table or the
 * wait queue head is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	p->qproc(filp, wait_address, p);
}
/*
 * ep_eventpoll_poll - poll operation of the epoll file itself.
 * Registers the caller on ep->poll_wait, then reports POLLIN | POLLRDNORM
 * when the ready list is non-empty and 0 otherwise.  Driver poll functions
 * generally follow this same shape.
 */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	unsigned long irqflags;
	unsigned int mask;

	/* Step 1: hook onto the wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/* Step 2: sample the ready list under the read lock */
	read_lock_irqsave(&ep->lock, irqflags);
	mask = list_empty(&ep->rdllist) ? 0 : (POLLIN | POLLRDNORM);
	read_unlock_irqrestore(&ep->lock, irqflags);

	return mask;
}

三、sys_epoll_wait

了解了ep_insert的话，这个其实就很容易理解了：

/* File operations of an epoll file: only release and poll are provided. */
static struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_close,
	.poll		= ep_eventpoll_poll
};

/*
 * sys_epoll_wait - entry point of epoll_wait (argument validation omitted
 * in this listing).
 *
 * Resolves epfd to its file, checks that it is an epoll file and delegates
 * to ep_poll().  Returns -EBADF if epfd has no file, -EINVAL if the file is
 * not an epoll file, otherwise whatever ep_poll() returns.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	struct file *epfile;
	int ret = -EBADF;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* Look up the file behind epfd */
	epfile = fget(epfd);
	if (epfile) {
		if (IS_FILE_EPOLL(epfile)) {
			/* The eventpoll lives in private_data; ep_poll does the real work */
			struct eventpoll *ep = epfile->private_data;

			ret = ep_poll(ep, events, maxevents, timeout);
		} else {
			/* Not an epoll file */
			ret = -EINVAL;
		}
		fput(epfile);
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, ret));

	return ret;
}

epoll_wait最终调用ep_poll来实现核心功能。

/*
 * ep_poll - core of epoll_wait.
 *
 * Sleeps on ep->wq until the ready list is non-empty, the timeout runs out
 * or a signal is pending, then passes the ready events to
 * ep_events_transfer().  timeout is in milliseconds; -1 means the maximum
 * schedule timeout.
 *
 * Returns -EINTR if a signal was pending, otherwise 0 or the result of
 * ep_events_transfer().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * 1. The kernel keeps time in ticks, so convert the millisecond timeout
	 *    to ticks (rounding up).
	 *    NOTE(review): a negative timeout other than -1 takes the arithmetic
	 *    branch instead of being treated as "infinite" - confirm intended.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	/* 2. If the ready list is empty, wait. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * 3. Put the thread calling epoll_wait on the wq wait queue;
		 *    ep_poll_callback() wakes this queue up.
		 *    current denotes the running thread.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);
		/* Loop until there is a reason to stop waiting. */
		for (;;) {
			/*
			 * 4. Mark ourselves interruptible (so signals can be handled)
			 *    BEFORE re-checking the condition.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			/* 5. Bail out if a signal is pending. */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			write_unlock_irqrestore(&ep->lock, flags);
			/*
			 * Works like a sleep and returns the remaining time.  The CPU
			 * switches to another task here, so the next line does not run
			 * until this task is scheduled again.
			 */
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		/* Take the calling thread off the wait queue. */
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Sample readiness while still holding the lock */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Send the events back to user space.  If nothing was transferred and
	 * time is left, go back and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

ep_poll的步骤如下：

转换超时时间为cpu滴答计数。
查询就绪队列是否就绪，如果有就绪的，就直接返回给上层。
如果没有就绪的，就等待。

a. 把调用线程添加到eventpoll.wq队列中。

b. 设置自身为可打断状态

c. 检查现在是否有就绪，有的话就直接返给上层。

d. 处理信号。

e. 发起调度，将自身切换为阻塞状态。等待被唤醒。唤醒的方式有：ep_poll_callback唤醒eventpoll.wq队列或者其他中断唤醒。ep_poll_callback是sys_epoll_ctl添加epoll监听的时候设置的等待队列回调。其实现为：


/*
 * ep_poll_callback - wake-up function of the wait-queue entries installed
 * by ep_ptable_queue_proc().
 *
 * Abridged listing: the "...." line stands for removed code, which is also
 * where the jumps to the is_linked / is_disabled labels lived.
 * Queues the item on the ready list and wakes the epoll_wait and poll
 * waiters.  Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	/*
	 * 1. wait is embedded in a struct eppoll_entry, which also holds the
	 *    epitem pointer; the macro recovers the epitem from wait's address.
	 */
	struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
	/* 2. The eventpoll this item belongs to */
	struct eventpoll *ep = epi->ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
		     current, epi->file, epi, ep));

	write_lock_irqsave(&ep->lock, flags);
	....
	/* 3. Put the ready item on the ready list */
	list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
	/*
	 * 4. Wake the wq wait queue (i.e. the threads blocked in epoll_wait)
	 */
	if (waitqueue_active(&ep->wq))
		wake_up(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

is_disabled:
	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * We have to call this outside the lock.
	 * NOTE(review): psw is not declared in this excerpt - presumably
	 * file-scope state; confirm.
	 */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 1;
}

posted @ 2018-03-20 23:06 stonehat 阅读(1287) 评论(0) 收藏举报

刷新页面返回顶部

给你科普技术