先转载一部分:
昨天被人问及epoll和select的差别,吞吞吐吐说了个大概,囧,基础不牢害死人啊
再次查找了资料,内容如下
select和poll:
int select(int n, fd_set *rd_fds, fd_set *wr_fds, fd_set *ex_fds, struct timeval *timeout); select用到了fd_set结构,fd_set的容量由FD_SETSIZE决定。FD_SETSIZE是编译期常量(Linux上默认为1024);ulimit -n或者setrlimit函数修改的是进程可打开的文件描述符数量上限,并不能改变FD_SETSIZE。
int poll(struct pollfd *ufds, unsigned int nfds, int timeout);
作为select的替代品,poll的参数用struct pollfd数组(第一个参数)来取代fd_set,数组大小自己定义,这样的话避免了FD_SETSIZE给程序带来的麻烦。
每次的 select/poll操作,都需要建立当前线程的关心事件列表,并挂起此线程到等待队列中 直到事件触发或者timeout结束,同时select/poll返回后也需要对传入的句柄列表做一次扫描来dispatch。随着连接数增加,select和poll的性能是严重非线性下降。
epoll(linux), kqueue(freebsd), /dev/poll(solaris):
作为针对select和poll的升级(可以这么理解:)),主要它们做了两件事情
1. 避免了每次调用select/poll时kernel分析参数建立事件等待结构的开销,kernel维护一个长期的事件关注列表,应用程序通过句柄修改这个列表和捕获I/O事件。
2. 避免了select/poll返回后,应用程序扫描整个句柄表的开销,Kernel直接返回具体的事件列表给应用程序。
同时还有两种触发机制:
水平触发(level-triggered,也被称为条件触发)LT: 只要满足条件,就触发一个事件(只要有数据没有被获取,内核就不断通知你)
边缘触发(edge-triggered)ET: 每当状态变化时,触发一个事件
“举个读socket的例子,假定经过长时间的沉默后,现在来了100个字节,这时无论边缘触发和条件触发都会产生一个read ready notification通知应用程序可读。应用程序读了50个字节,然后重新调用api等待io事件。这时条件触发的api会因为还有50个字节可读从而立即返回用户一个read ready notification。而边缘触发的api会因为可读这个状态没有发生变化而陷入长期等待。 因此在使用边缘触发的api时,要注意每次都要读到socket返回EWOULDBLOCK为止,否则这个socket就算废了。而使用条件触发的api时,如果应用程序不需要写就不要关注socket可写的事件,否则就会无限次的立即返回一个write ready notification。”
select属于LT,
kqueue属于ET
====================================
最后,附上两个参考链接:
1、最有名的C10k problem http://www.kegel.com/c10k.html
2、http://blog.chinaunix.net/u/10449/showart_323252.html
在IPTV项目中,为了增加TCP连接的数量,必须使用epoll而不是select。以下是epoll编程中的几点经验以及用到的代码:
1. epoll和select同时使用似乎有问题:在select的循环里会莫名crash,所以把部分select替换成了epoll。替换的部分是作为web server的http控制部分。
2. 错误处理很重要,能够帮助发现问题,例如把MaxEvent设置成了0的情况。
3. timeout不能设置得很小,否则总是在空循环,造成CPU使用率过高。
4. 当信号中断了epoll的wait函数使其提前返回时,这一点也会在错误代码(errno为EINTR)中体现出来。
/*
******************************************************************************
*
* TCP_ExServerSocketManager :: Initialize
*
* (Re)creates the epoll instance and a zero-filled epoll event buffer, then
* delegates to the base-class Initialize().
*
* InitialRecvBufferSize - passed through unchanged to the base class.
*
* Returns GENERIC_ERROR_MEMORY_ALLOCATION_FAILED when the event buffer cannot
* be allocated; otherwise whatever TCP_ServerSocketManager::Initialize()
* returns.
*
* NOTE(review): the result of epoll_create() is not checked, and 0 is used
* as the "no epoll handle yet" value although 0 is a valid descriptor -
* confirm how m_epollFd is initialised outside this function.
*
******************************************************************************
*/
int TCP_ExServerSocketManager :: Initialize( unsigned short InitialRecvBufferSize )
{
    // Release the epoll handle left behind by an earlier Initialize() call.
    if ( m_epollFd != 0 ) {
        close( m_epollFd );
    }

    // The size argument is only a hint to the kernel about how to dimension
    // its internal structures, not a hard limit. Since Linux 2.6.8 it is
    // unused.
    m_epollFd = epoll_create( /*size*/ 2048 );

    // Discard any previous event buffer; free() accepts NULL.
    free( m_pEvents );
    m_pEvents = NULL;

    // Allocate a fresh, zero-initialised buffer for epoll_wait() results.
    m_epollMaxEvents = 2048;
    m_pEvents = ( struct epoll_event * ) calloc( m_epollMaxEvents,
                                                 sizeof ( struct epoll_event ) );
    if ( m_pEvents == NULL ) {
        return GENERIC_ERROR_MEMORY_ALLOCATION_FAILED;
    }

    return TCP_ServerSocketManager :: Initialize( InitialRecvBufferSize );
}
/*
******************************************************************************
*
* TCP_ExServerSocketManager :: Initialize
*
* (Re)creates the epoll instance and a zero-filled epoll event buffer, then
* delegates to the base-class Initialize() overload that takes the same
* arguments.
*
* Description, ListeningSocketID, PortNumber, AdapterAddress and
* InitialRecvBufferSize are passed through unchanged to the base class.
*
* Returns GENERIC_ERROR_MEMORY_ALLOCATION_FAILED when the event buffer cannot
* be allocated; otherwise whatever TCP_ServerSocketManager::Initialize()
* returns.
*
* NOTE(review): the result of epoll_create() is not checked, and 0 is used
* as the "no epoll handle yet" value although 0 is a valid descriptor -
* confirm how m_epollFd is initialised outside this function.
*
******************************************************************************
*/
int TCP_ExServerSocketManager :: Initialize(
    const char *     Description,
    unsigned long &  ListeningSocketID,
    unsigned short   PortNumber,
    unsigned long    AdapterAddress,
    unsigned short   InitialRecvBufferSize )
{
    // Release the epoll handle left behind by an earlier Initialize() call.
    if ( m_epollFd != 0 ) {
        close( m_epollFd );
    }

    // The size argument is only a hint to the kernel about how to dimension
    // its internal structures, not a hard limit. Since Linux 2.6.8 it is
    // unused.
    m_epollFd = epoll_create( /*size*/ 10 );

    // Discard any previous event buffer; free() accepts NULL.
    free( m_pEvents );
    m_pEvents = NULL;

    // Allocate a fresh, zero-initialised buffer for epoll_wait() results.
    m_epollMaxEvents = 2048;
    m_pEvents = ( struct epoll_event * ) calloc( m_epollMaxEvents,
                                                 sizeof ( struct epoll_event ) );
    if ( m_pEvents == NULL ) {
        return GENERIC_ERROR_MEMORY_ALLOCATION_FAILED;
    }

    return TCP_ServerSocketManager :: Initialize( Description,
                                                  ListeningSocketID,
                                                  PortNumber,
                                                  AdapterAddress,
                                                  InitialRecvBufferSize );
}
/*
******************************************************************************
*
* TCP_ExServerSocketManager :: Execute
*
* Event loop of the socket manager. Until m_bDone is set, each iteration:
*   1. blocks SIGUSR1 and refreshes the epoll registrations
*      (ManageNewSockets / SetSocketEvents),
*   2. restores the original signal mask and waits in epoll_wait(),
*   3. dispatches the returned events, or reports a genuine epoll_wait()
*      failure,
*   4. processes pending thread messages.
*
* A timeout (epoll_wait() returning 0) is a normal outcome and is not
* reported as an error.
*
******************************************************************************
*/
void TCP_ExServerSocketManager :: Execute( void )
{
    sigset_t DefaultSignalMask;
    sigset_t DisableUserInterrupt;
    long     Timeout = -1;

    // Build a mask containing only SIGUSR1. Blocking and immediately
    // restoring it is done solely to capture the thread's original mask.
    sigemptyset( &DisableUserInterrupt );
    sigaddset( &DisableUserInterrupt, SIGUSR1 );
    pthread_sigmask( SIG_BLOCK, &DisableUserInterrupt, &DefaultSignalMask );
    pthread_sigmask( SIG_SETMASK, &DefaultSignalMask, NULL );

    while ( !m_bDone ) {
        Timeout = -1;

        // Keep SIGUSR1 blocked while the epoll registrations are updated.
        pthread_sigmask( SIG_BLOCK, &DisableUserInterrupt, NULL );

        // Add new sockets to the list of managed sockets.
        ManageNewSockets();

        // Register the events each socket expects and purge closed sockets.
        // Pending client reconnects are also attempted in there.
        int MaxEvents = SetSocketEvents( Timeout );
        if ( MaxEvents == 0 ) {
            // epoll_wait() rejects maxevents <= 0, so ask for one event and
            // poll again shortly instead of blocking forever.
            MaxEvents = 1;
            Timeout   = 500;
            GetDiagPortal()->SendFormattedDiagText(
                DIAG_DEBUG,
                "MaxEvents is zero, delay 500ms");
        }

        // Restore the original mask before waiting so that a signal can
        // interrupt epoll_wait().
        pthread_sigmask( SIG_SETMASK, &DefaultSignalMask, NULL );

        errno = 0;
        int EventCount = epoll_wait( m_epollFd,
                                     m_pEvents,
                                     MaxEvents,
                                     (int) Timeout );
        // Capture errno right away, before any other call can overwrite it.
        const int errsv = errno;

        // Do not process any events if shutdown was requested.
        if ( m_bDone )
            break;

        if ( EventCount > 0 ) {
            ProcessSocketEvents( EventCount );
        }
        else if ( EventCount < 0 ) {
            // Genuine failure. The value is cast to unsigned long so that
            // the argument matches the "%lu" conversion in the messages.
            switch ( errsv ) {
            case EBADF:
                GetDiagPortal()->SendFormattedDiagText(
                    DIAG_ERROR,
                    "epfd is not a valid file descriptor, errno: %lu",
                    (unsigned long) errsv );
                break;

            case EFAULT:
                GetDiagPortal()->SendFormattedDiagText(
                    DIAG_ERROR,
                    "The memory area pointed to by events is not accessible with write permissions., errno: %lu",
                    (unsigned long) errsv );
                break;

            case EINTR:
                // Interrupted by a signal before any event occurred or the
                // timeout expired; this is expected, so nothing is logged.
                break;

            case EINVAL:
                GetDiagPortal()->SendFormattedDiagText(
                    DIAG_ERROR,
                    " EINVAL epfd is not an epoll file descriptor, or maxevents is less than or equal to zero., errno: %lu",
                    (unsigned long) errsv );
                break;

            default:
                GetDiagPortal()->SendFormattedDiagText(
                    DIAG_DEBUG,
                    "ePoll call failed, errno: %lu",
                    (unsigned long) errsv );
                break;
            }
        }
        // EventCount == 0: the wait simply timed out; nothing to report.

        // Process any pending messages.
        ProcessThreadMessages();
    }
}
/*
******************************************************************************
*
* TCP_ExServerSocketManager :: SetSocketEvents
*
* Walks the socket map under m_SocketMapLock and, for every socket:
*   - retries Connect() on TCP client sockets in Socket_ConnectPending,
*   - erases and deletes closed/unallocated sockets that have no references,
*   - registers (first time) or updates the edge-triggered epoll interest
*     set of every socket that has a valid handle.
* Afterwards the epoll event buffer is grown if it is too small for the
* number of registered sockets.
*
* Timeout - output. Set to -1, or to CONNECT_SELECT_TIMEOUT when a reconnect
*           was attempted or a closed socket is still referenced.
*
* Returns the number of sockets with a valid handle, clamped to the capacity
* of the event buffer so that it can be passed to epoll_wait() as maxevents.
*
******************************************************************************
*/
int TCP_ExServerSocketManager :: SetSocketEvents( long& Timeout )
{
    SocketMap :: iterator Curr;
    SocketMap :: iterator Next;
    int MaxFd = 0;

    Timeout = -1;
    m_SocketMapLock.Lock();

    // Next is advanced before Curr is used, so erasing Curr is safe.
    Next = m_SocketMap.begin();
    while ( Next != m_SocketMap.end() ) {
        Curr = Next++;
        BaseSocket * pSocket = Curr->second;

        // Check for TCP client socket reconnect.
        if ( pSocket->GetTypeCode() == SocketType_TCP_Client ) {
            TCP_ClientSocket * pClientSocket = (TCP_ClientSocket *)pSocket;
            // Reconnect to the host.
            if ( pClientSocket->GetState() == Socket_ConnectPending ) {
                pClientSocket->Connect();
                Timeout = CONNECT_SELECT_TIMEOUT;
            }
        }

        SocketStates State = pSocket->GetState();
        int Socket = pSocket->GetSocketHandle();

        // Socket needs to be removed from the list.
        if (( State == Socket_NotAllocated ) || ( State == Socket_Closed )) {
            if ( pSocket->GetRefCount() == 0 ) {
                // Closing an fd removes it from all epoll sets
                // automatically; the explicit delete covers a handle that
                // is still open. Its result is deliberately ignored.
                if ( Socket != -1 ) {
                    struct epoll_event epv = {0, {0}};
                    epoll_ctl( m_epollFd, EPOLL_CTL_DEL, Socket, &epv );
                }
                m_SocketMap.erase( Curr );
                delete pSocket;
                continue;
            }
            // Still referenced: recheck the reference count shortly so that
            // the memory can be reclaimed.
            Timeout = CONNECT_SELECT_TIMEOUT;
        }

        // Specify the events that the socket expects. Do not process if
        // the socket handle is invalid.
        if ( Socket != -1 ) {
            int EventTypes = pSocket->GetEventTypes();

            // The first registration uses the stored operation; every
            // later pass modifies the existing registration.
            int Op = pSocket->GetEpollStatus();
            if ( Op == EPOLL_CTL_ADD ) {
                pSocket->SetEpollStatus( EPOLL_CTL_MOD );
            }

            // EPOLLET occupies the top bit, so the mask is kept unsigned.
            unsigned int events = EPOLLET;
            if ( EventTypes & ( FD_READ | FD_ACCEPT | FD_CLOSE | FD_CONNECT ))
                events |= EPOLLIN;
            if ( EventTypes & ( FD_WRITE | FD_CONNECT ))
                events |= EPOLLOUT;

            // epoll_event::data is a union: only the socket ID is stored,
            // because that is what ProcessSocketEvents() reads back.
            epoll_event ev = { 0, {0} };
            ev.events  = events;
            ev.data.fd = pSocket->GetSocketID();
            epoll_ctl( m_epollFd, Op, Socket, &ev );
            MaxFd ++;
        }
    }

    // Grow the event buffer when more sockets are registered than it holds.
    if ( MaxFd > m_epollMaxEvents ) {
        GetDiagPortal()->SendFormattedDiagText(
            DIAG_ERROR,
            "Exceed epoll event queue size: %lu",
            (unsigned long) MaxFd );

        const int NewMaxEvents =
            ( 2 * m_epollMaxEvents > MaxFd ) ? (int)( 2 * m_epollMaxEvents )
                                             : 2 * MaxFd;
        const size_t NewBytes = NewMaxEvents * sizeof ( struct epoll_event );

        // realloc() into a temporary: on failure the old buffer remains
        // valid and must not be lost.
        struct epoll_event * pNewEvents =
            ( struct epoll_event * ) realloc( m_pEvents, NewBytes );
        if ( pNewEvents == NULL ) {
            GetDiagPortal()->SendFormattedDiagText(
                DIAG_ERROR,
                "Allocate epoll event queue failed, size: %lu",
                (unsigned long) NewBytes );
            // Keep the old buffer and never report more events than fit.
            MaxFd = (int) m_epollMaxEvents;
        }
        else {
            m_pEvents        = pNewEvents;
            m_epollMaxEvents = NewMaxEvents;
            memset( m_pEvents, 0, NewBytes );
        }
    }

    m_SocketMapLock.Unlock();
    return MaxFd;
}
/*
******************************************************************************
*
* TCP_ExServerSocketManager :: ProcessSocketEvents
*
******************************************************************************
*/
void TCP_ExServerSocketManager :: ProcessSocketEvents( int MaxEvents )
{
m_SocketMapLock.Lock();
for ( int i = 0; i < MaxEvents; i++ ) {
struct epoll_event * pEvent = m_pEvents + i;
BaseSocket * pSocket = GetSocketByID( pEvent->data.fd );
if ( pSocket == NULL )
continue;
// Process socket events
pSocket->DoEventProcessing( ( pEvent->events & EPOLLIN ),
( pEvent->events & EPOLLOUT ),
0 );
pSocket->ReleaseRef();
}
m_SocketMapLock.Unlock();
}