/* tcp.c */
/*
 * tcp_poll() - compute the current poll/epoll event mask for a TCP socket.
 *
 * Returns the __poll_t bitmask (EPOLLIN/EPOLLOUT/EPOLLHUP/...) describing
 * which events are currently pending on @sock, and registers @wait so the
 * caller is woken when the state changes.
 *
 * Note that we don't need to lock the socket, as the upper poll layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	/* Hook the fd into the wait queue: associates the poll callback so
	 * we get notified of future events on this socket. */
	sock_poll_wait(file, sock, wait);

	/* Snapshot the socket state; a listening socket has its own,
	 * much simpler poll logic (readable == pending connections). */
	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why EPOLLHUP is incompatible with EPOLLOUT. --ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		/* Minimum number of bytes that must be readable before we
		 * report EPOLLIN (SO_RCVLOWAT). */
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		/* An out-of-band byte at the current read position that is
		 * not delivered inline must not count toward the low-water
		 * mark, so raise the target by one. */
		if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (sk_stream_is_writeable(sk))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() (i.e. when this epoll fd is
	 * itself monitored by another epoll instance) */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	/* The file backing this epoll instance */
	struct file *file;

	/* used to optimize loop detection check */
	int visited;
	struct list_head visited_list_link;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif
};
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queue attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describe the interested events and the source fd */
	struct epoll_event event;
};
/* poll.h
* Do not touch the structure directly, use the access functions
* poll_does_not_wait() and poll_requested_events() instead.
*/typedefstructpoll_table_struct{poll_queue_proc_qproc;__poll_t_key;}poll_table;/*
* structures and helpers for f_op->poll implementations
*/typedefvoid(*poll_queue_proc)(structfile*,wait_queue_head_t*,structpoll_table_struct*);
| 成员   | 描述                                                   |
| ------ | ------------------------------------------------------ |
| _qproc | 处理函数,可以指向 ep_ptable_queue_proc 函数,或者为空。 |
| _key   | 事件组合。                                             |

## 6.6. ep_pqueue

包装就绪事件处理结构,关联 epitem。
/* Wrapper struct used by poll queueing: carries the poll_table handed to
 * f_op->poll() together with the epitem it is being queued on behalf of. */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
/*
 * eventpoll_init() - one-time epoll subsystem initialisation, registered
 * via fs_initcall().  Creates the slab caches backing epoll's per-watch
 * allocations.  (Portions of the original function are elided in this
 * excerpt, marked "...".)
 */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	... /* elided in this excerpt */

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry),
			0, SLAB_PANIC | SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
/* epoll_wait(2) entry point: thin wrapper around do_epoll_wait(). */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	...
	/* Block for up to @timeout and hand ready events to userspace. */
	error = ep_poll(ep, events, maxevents, timeout);
	...
}

/*
 * ep_poll() - core wait loop of epoll_wait().
 *
 * Sleeps (interruptibly, with an optional hrtimer deadline) until the
 * ready list is non-empty, a signal arrives, or the timeout expires, then
 * transfers up to @maxevents events to @events.  Returns the number of
 * events delivered, 0 on timeout, or -EINTR.  (Parts of the original are
 * elided, marked "...".)
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	bool waiter = false;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	/* Elided: compute the sleep deadline from @timeout; a zero timeout
	 * or already-ready events short-circuit straight to delivery. */
	...

fetch_events:
	if (!ep_events_available(ep))
		/* Busy-poll the NAPI contexts while empty: amortises NIC
		 * interrupts and lets data accumulate before we sleep. */
		ep_busy_loop(ep, timed_out);

	/* Re-check the ready list after busy polling. */
	eavail = ep_events_available(ep);
	if (eavail)
		/* Events already pending - skip the sleep entirely. */
		goto send_events;
	...

	/* Nothing ready: enqueue ourselves on ep->wq and go to sleep. */
	if (!waiter) {
		waiter = true;
		/* Wait entry tied to the current task. */
		init_waitqueue_entry(&wait, current);
		spin_lock_irq(&ep->wq.lock);
		/* Exclusive add so one event wakes only one waiter,
		 * avoiding the thundering-herd problem (see how
		 * __wake_up_common honours WQ_FLAG_EXCLUSIVE). */
		__add_wait_queue_exclusive(&ep->wq, &wait);
		spin_unlock_irq(&ep->wq.lock);
	}

	for (;;) {
		/*
		 * We don't want to sleep if the ep_poll_callback() sends us
		 * a wakeup in between. That's why we set the task state
		 * to TASK_INTERRUPTIBLE before doing the checks.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Always short-circuit for fatal signals to allow
		 * threads to make a timely exit without the chance of
		 * finding more events available and fetching
		 * repeatedly.
		 */
		if (fatal_signal_pending(current)) {
			res = -EINTR;
			break;
		}

		/* Ready list filled while we were arming the wait? */
		eavail = ep_events_available(ep);
		if (eavail)
			break;

		/* Non-fatal pending signal: bail out before sleeping. */
		if (signal_pending(current)) {
			res = -EINTR;
			break;
		}

		/* Actually sleep; returns 0 when the hrtimer expired. */
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1;
			break;
		}
	}

	/* Woken up (or timed out): back to runnable state. */
	__set_current_state(TASK_RUNNING);

send_events:
	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	/* Remove our wait entry from the wait queue, if we added one. */
	if (waiter) {
		spin_lock_irq(&ep->wq.lock);
		__remove_wait_queue(&ep->wq, &wait);
		spin_unlock_irq(&ep->wq.lock);
	}

	return res;
}

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;
};

/*
 * ep_send_events() - walk the ready list and copy events to userspace via
 * the ep_send_events_proc callback; returns the number delivered (or a
 * negative error stored by the callback).
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events;
	/* Iterate the ready list, sending each ready event to userspace. */
	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
	return esed.res;
}
/*
 * ep_send_events_proc() - ep_scan_ready_list() callback that copies ready
 * events to userspace.
 *
 * @head is the snapshot (txlist) of the former ep->rdllist.  For each item
 * the current event mask is re-polled; items that report events are copied
 * out, and level-triggered items are put back on ep->rdllist so the next
 * epoll_wait() re-checks them.  The delivered count (or -EFAULT) is stored
 * in esed->res.  (Parts of the original are elided, marked "...".)
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep,
				    struct list_head *head, void *priv)
{
	struct ep_send_events_data *esed = priv;
	__poll_t revents;
	struct epitem *epi, *tmp;
	struct epoll_event __user *uevent = esed->events;
	struct wakeup_source *ws;
	poll_table pt;

	/* NULL _qproc: we only want the mask, no wait-queue registration. */
	init_poll_funcptr(&pt, NULL);
	...

	/* Walk the snapshot of the ready list, copying each event out. */
	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (esed->res >= esed->maxevents)
			break;
		...

		/* Unlink first; an LT item with remaining events is
		 * re-added below. */
		list_del_init(&epi->rdllink);

		/* Re-poll the fd for its currently pending events. */
		revents = ep_item_poll(epi, &pt, 1);

		/* Nothing actually pending: leave it off the ready list. */
		if (!revents)
			continue;

		/* Copy mask and user data to userspace; __put_user()
		 * returns 0 on success. */
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			/* Copy failed: keep the item on the ready list so
			 * it is not lost. */
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}

		/* One more fd successfully delivered. */
		esed->res++;
		uevent++;
		if (epi->event.events & EPOLLONESHOT)
			/* One-shot: strip everything except the private
			 * control bits.
			 * #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) */
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			/*
			 * If this file has been added with Level
			 * Trigger mode, we need to insert back inside
			 * the ready list, so that the next call to
			 * epoll_wait() will check again the events
			 * availability. At this point, no one can insert
			 * into ep->rdllist besides us. The epoll_ctl()
			 * callers are locked out by
			 * ep_scan_ready_list() holding "mtx" and the
			 * poll callback will queue them in ep->ovflist.
			 *
			 * (While we traverse here, new ready items cannot
			 * be written to ep->rdllist directly; the poll
			 * callback parks them on ep->ovflist instead.)
			 */
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}

	return 0;
}
/*
 * ep_poll_callback() - wait-queue callback invoked by the monitored file
 * when an event fires.
 *
 * Queues the epitem on the ready list (or on ovflist while a userspace
 * transfer is in flight) and wakes epoll_wait() sleepers.  Returns @ewake,
 * which tells __wake_up_common whether this counted as a delivered wakeup
 * for EPOLLEXCLUSIVE purposes.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	/* Take the read lock with local interrupts disabled. */
	read_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/* If this fd watches nothing beyond the private control bits
	 * (#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)),
	 * e.g. a fired one-shot entry, there is nothing to deliver. */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/* The reported event is not one the user asked for: ignore it. */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		if (epi->next == EP_UNACTIVE_PTR && chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
		goto out_unlock;
	}

	/* Add to the ready list, unless the epitem is already linked. */
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}

	/* A watched event fired: wake up any task sleeping in epoll_wait()
	 * (ep->wq is armed by ep_poll() when no events were ready). */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
		    !(pollflags & POLLFREE)) {
			/* For EPOLLEXCLUSIVE, only count this wakeup when
			 * the fired direction matches the interest set.
			 * #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) */
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq);
	}
	/* ep->poll_wait is the queue used when this epoll fd is itself
	 * watched by another epoll instance; it needs waking too. */
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	read_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * If we race with ep_remove_wait_queue() it can miss
		 * ->whead = NULL and do another remove_wait_queue() after
		 * us, so we can't use __remove_wait_queue().
		 */
		list_del_init(&wait->entry);
		/*
		 * ->whead != NULL protects us from the race with ep_free()
		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
		 * held by the caller. Once we nullify it, nothing protects
		 * ep/epi or even wait.
		 */
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具