select源码分析(linux2.6.11)
本文以tcp poll为例子来分析select的源码,下面是函数调用顺序。
select ---> sys_select ---> do_select ---> sock_poll ---> tcp_poll
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) { fd_set_bits fds; char *bits; long timeout; int ret, size, max_fdset; timeout = MAX_SCHEDULE_TIMEOUT; if (tvp) { time_t sec, usec; if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) || (ret = __get_user(sec, &tvp->tv_sec)) || (ret = __get_user(usec, &tvp->tv_usec))) goto out_nofds; ret = -EINVAL; if (sec < 0 || usec < 0) goto out_nofds; if ((unsigned long) sec < MAX_SELECT_SECONDS) { timeout = ROUND_UP(usec, 1000000/HZ); timeout += sec * (unsigned long) HZ; } } ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fdset can increase, so grab it once to avoid race */ max_fdset = current->files->max_fdset; if (n > max_fdset) n = max_fdset; ret = -ENOMEM; size = FDS_BYTES(n); bits = select_bits_alloc(size); if (!bits) goto out_nofds; fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); /* 将所有关心的fd的读、写、异常位从用户态复制到内核态 */ if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); /* 主要函数 */ ret = do_select(n, &fds, &timeout); if (tvp && !(current->personality & STICKY_TIMEOUTS)) { time_t sec = 0, usec = 0; if (timeout) { sec = timeout / HZ; usec = timeout % HZ; usec *= (1000000/HZ); } put_user(sec, &tvp->tv_sec); put_user(usec, &tvp->tv_usec); } if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: select_bits_free(bits, size); out_nofds: return ret; }
/*
 * do_select - core loop of select(): poll every requested descriptor until
 * at least one event is found, the timeout is exhausted, a signal is
 * pending, or the poll table records an error.
 *
 * @n:       number of descriptors to consider; replaced by the value
 *           max_select_fd() returns.
 * @fds:     in/out/ex hold the requested bits; res_in/res_out/res_ex
 *           receive the bits found ready.
 * @timeout: on entry the value later passed to schedule_timeout() (0 means
 *           never sleep); on return the value left over.
 *
 * Returns the number of (descriptor, event) hits counted, a negative value
 * from max_select_fd(), or table.error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	/*
	 * Hold the files lock around max_select_fd(). The "&current" here had
	 * been mangled into a currency sign by an HTML-entity decode.
	 */
	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* A zero timeout never sleeps, so no wait-queue registration is needed. */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		/*
		 * Mark the task interruptible-sleeping now; it is not scheduled
		 * out until schedule_timeout() below.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per bitmap word. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			/* Skip a whole word when none of its bits were requested. */
			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			/* Walk every requested bit of this word. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * Ask the file for its current event mask. While
					 * nothing has been found yet, the poll table is
					 * passed so the callee can queue this task for
					 * wake-up; once retval is non-zero NULL is passed
					 * instead. (For sockets the poll op is sock_poll.)
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* Readable and read was requested. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					/* Writable and write was requested. */
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					/* Exceptional condition and it was requested. */
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
				/* Give other tasks a chance to run if needed. */
				cond_resched();
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}

		/* Later passes must not register on the wait queues again. */
		wait = NULL;
		/* Stop on any hit, on a zero timeout, or on a pending signal. */
		if (retval || !__timeout || signal_pending(current))
			break;
		if(table.error) {
			retval = table.error;
			break;
		}
		/*
		 * Nothing ready: sleep until a registered wake-up fires or the
		 * timeout elapses, then re-scan all requested bits.
		 */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);

	poll_freewait(&table);

	/*
	 * Up-to-date the caller timeout.
	 */
	*timeout = __timeout;
	return retval;
}
/*
 * sock_poll - poll entry point for socket files.
 *
 * Looks up the socket behind @file's inode and forwards the request to the
 * poll operation of that socket's ops table (tcp_poll is one example).
 * Errors cannot be reported through poll, so the result is just the mask
 * the protocol handler returns.
 *
 * No kernel lock held - perfect
 */
static unsigned int sock_poll(struct file *file, poll_table * wait)
{
	struct socket *target = SOCKET_I(file->f_dentry->d_inode);

	return target->ops->poll(file, target, wait);
}
/*
 * tcp_poll - compute the poll event mask for a TCP socket.
 *
 * @file: file the poll request came through; forwarded to poll_wait().
 * @sock: socket being polled; its sk member supplies all state read here.
 * @wait: poll table forwarded to poll_wait() and tcp_listen_poll().
 *
 * Returns a bitmask built from POLLERR, POLLHUP, POLLIN | POLLRDNORM,
 * POLLOUT | POLLWRNORM and POLLPRI, or the result of tcp_listen_poll() for
 * a socket in TCP_LISTEN.
 *
 * NOTE(review): the statement order below is deliberate (see the race
 * comments); do not reorder the tests.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	/*
	 * Hand the socket's wait queue head to poll_wait(); per the original
	 * annotation this is where the polling task gets queued together with
	 * its wake-up function.
	 */
	poll_wait(file, sk->sk_sleep, wait);
	/* Listening sockets are handled by a dedicated helper. */
	if (sk->sk_state == TCP_LISTEN)
		return tcp_listen_poll(sk, wait);

	mask = 0;
	/* A recorded socket error is reported as POLLERR. */
	if (sk->sk_err)
		mask = POLLERR;

	/* Fully shut down, or closed: report hang-up. */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	/* Receive side shut down: report readable. */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM;

	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states. */
		/*
		 * Readable when rcv_nxt differs from copied_seq, unless the
		 * urgent-sequence checks in the second clause all fail.
		 */
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq + 1 ||
		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		/* Writability is only considered while the send side is open. */
		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		/* Valid urgent data is reported as POLLPRI. */
		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
/*
 * __pollwait - where the waiting is actually set up; runs once per polled
 * wait queue.
 *
 * @filp:         file being polled; a reference is taken and stored.
 * @wait_address: wait queue head the current task is added to.
 * @_p:           poll_table embedded in the caller's struct poll_wqueues.
 *
 * Takes the next free entry from the poll table (allocating another page
 * when there is none or the current one is full) and queues the current
 * task on @wait_address. If the page allocation fails, -ENOMEM is stored in
 * the poll_wqueues error field, the task state is set back to TASK_RUNNING,
 * and nothing is queued.
 */
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
	struct poll_wqueues *pwq = container_of(_p, struct poll_wqueues, pt);
	struct poll_table_page *page = pwq->table;
	struct poll_table_entry *slot;

	if (!page || POLL_TABLE_FULL(page)) {
		struct poll_table_page *fresh;

		fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!fresh) {
			pwq->error = -ENOMEM;
			__set_current_state(TASK_RUNNING);
			return;
		}
		/* Push the new, empty page onto the front of the page list. */
		fresh->entry = fresh->entries;
		fresh->next = page;
		pwq->table = fresh;
		page = fresh;
	}

	/* Claim the next entry and fill it in. */
	slot = page->entry++;
	get_file(filp);
	slot->filp = filp;
	slot->wait_address = wait_address;

	/* Queue the current task; the wait entry carries its wake-up function. */
	init_waitqueue_entry(&slot->wait, current);
	add_wait_queue(wait_address, &slot->wait);
}
/**
 * default_wake_function - wake-up callback used for non-exclusive waiters.
 *
 * A thin wrapper around try_to_wake_up(): the task stored in @curr is woken
 * with the given @mode and @sync. @key is not used.
 *
 * Returns whatever try_to_wake_up() returns.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
	return try_to_wake_up(curr->task, mode, sync);
}
/**
 * try_to_wake_up - wake a sleeping or stopped task by setting its state to
 * TASK_RUNNING and, if it is not already on one, inserting it into a run
 * queue.
 *
 * @p:     descriptor of the task to wake.
 * @state: mask of task states that are allowed to be woken.
 * @sync:  flag that keeps the woken task from preempting the task
 *         currently running on the local CPU.
 *
 * Returns 1 if the task was activated by this call, 0 otherwise.
 */
static int try_to_wake_up(task_t * p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	runqueue_t *rq;
#ifdef CONFIG_SMP
	unsigned long load, this_load;
	struct sched_domain *sd;
	int new_cpu;
#endif

	/**
	 * task_rq_lock disables interrupts and takes the lock of the run queue
	 * of the CPU the task belongs to (which may differ from the current
	 * CPU's run queue, and the task may not be on the queue at all).
	 */
	rq = task_rq_lock(p, &flags);
	schedstat_inc(rq, ttwu_cnt);
	old_state = p->state;
	/**
	 * Only tasks whose state is in @state are woken; otherwise leave at
	 * once and this wake-up has no effect.
	 * Example: a signal does not wake a TASK_UNINTERRUPTIBLE task.
	 */
	if (!(old_state & state))
		goto out;

	/**
	 * The task already belongs to a run queue: jump to out_running, which
	 * only sets its state to TASK_RUNNING.
	 */
	if (p->array)
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	/**
	 * On SMP, decide whether the task should migrate from the run queue
	 * of the CPU it last ran on to another CPU's run queue.
	 */
	/**
	 * The task is currently running on a CPU: no migration to consider.
	 */
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	/**
	 * Default choice: keep the task on its own CPU.
	 */
	new_cpu = cpu;

	/**
	 * The task's CPU is the current CPU, or the task is not allowed to
	 * run on the current CPU: go straight to out_set_cpu.
	 */
	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	load = source_load(cpu);
	this_load = target_load(this_cpu);

	/*
	 * If sync wakeup then subtract the (maximum possible) effect of
	 * the currently running task from the load of the current CPU:
	 */
	if (sync)
		this_load -= SCHED_LOAD_SCALE;

	/* Don't pull the task off an idle CPU to a busy one */
	/**
	 * The task's CPU is below half of SCHED_LOAD_SCALE while this CPU is
	 * above it: also go to out_set_cpu (new_cpu is still cpu).
	 */
	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
		goto out_set_cpu;

	/**
	 * Try to migrate the task to the local CPU.
	 */
	new_cpu = this_cpu; /* Wake to this CPU if we can */

	/*
	 * Scan domains for affine wakeup and passive balancing
	 * possibilities.
	 */
	for_each_domain(this_cpu, sd) {
		unsigned int imbalance;
		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

		if ((sd->flags & SD_WAKE_AFFINE) &&
				!task_hot(p, rq->timestamp_last_tick, sd)) {
			/*
			 * This domain has SD_WAKE_AFFINE and p is cache cold
			 * in this domain.
			 */
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_wake_affine);
				goto out_set_cpu;
			}
		} else if ((sd->flags & SD_WAKE_BALANCE) &&
				imbalance*this_load <= 100*load) {
			/*
			 * This domain has SD_WAKE_BALANCE and there is
			 * an imbalance.
			 */
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_wake_balance);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	schedstat_inc(rq, ttwu_attempts);
	new_cpu = wake_idle(new_cpu, p);
	if (new_cpu != cpu) {
		schedstat_inc(rq, ttwu_moved);
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
		/* might preempt at this point */
		rq = task_rq_lock(p, &flags);
		/* The lock was dropped, so repeat the state and queue checks. */
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (p->array)
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

out_activate:
#endif /* CONFIG_SMP */
	/**
	 * For TASK_UNINTERRUPTIBLE, decrement nr_uninterruptible and set
	 * activated to -1 to record that the task was woken from
	 * TASK_UNINTERRUPTIBLE.
	 */
	if (old_state == TASK_UNINTERRUPTIBLE) {
		rq->nr_uninterruptible--;
		/*
		 * Tasks on involuntary sleep don't earn
		 * sleep_avg beyond just interactive state.
		 */
		p->activated = -1;
	}

	/*
	 * Sync wakeups (i.e. those types of wakeups where the waker
	 * has indicated that it will leave the CPU in short order)
	 * don't trigger a preemption, if the woken up task will run on
	 * this cpu. (in this case the 'I will reschedule' promise of
	 * the waker guarantees that the freshly woken up task is going
	 * to be considered on this CPU.)
	 */
	/**
	 * According to the original annotation, activate_task performs these
	 * steps in order (its body is not shown here):
	 * 1: call sched_clock for the current timestamp; if the target CPU is
	 *    not the local one, compensate for the timer-interrupt skew.
	 * 2: call recalc_task_prio to compute the dynamic priority.
	 * 3: set activated as appropriate.
	 * 4: set the task's timestamp.
	 * 5: insert the task into the set of runnable tasks.
	 */
	activate_task(p, rq, cpu == this_cpu);
	/**
	 * If the target CPU is not the local CPU, or the wake-up is not sync,
	 * check whether the woken task should preempt the run queue's current
	 * task.
	 */
	if (!sync || cpu != this_cpu) {
		if (TASK_PREEMPTS_CURR(p, rq)) /* higher priority than the queue's current task: preempt */
			/**
			 * resched_task requests the preemption.
			 * Per the original annotation: on a uniprocessor it only
			 * sets TIF_NEED_RESCHED; on SMP it may send an IPI to
			 * force the other CPU to reschedule.
			 */
			resched_task(rq->curr);
	}
	success = 1;

out_running:
	/**
	 * Set the task state to TASK_RUNNING; two paths arrive here (the
	 * already-queued jump and the fall-through after activation).
	 */
	p->state = TASK_RUNNING;
out:
	/**
	 * Release the run-queue lock and restore interrupts.
	 */
	task_rq_unlock(rq, &flags);

	/**
	 * 0: the task was not woken by this call; 1: it was.
	 */
	return success;
}
当底层驱动收到数据后,会产生中断,并最终调用 default_wake_function 函数来唤醒对应的进程;进程被唤醒后从 schedule_timeout 返回,继续在 do_select 的循环中重新检查所关心的 bit 位。至于驱动具体是如何通知上层的,还需要进一步学习与分析。