TCP拥塞控制算法内核实现剖析(一)

内核版本:2.6.37

主要源文件:linux-2.6.37/net/ipv4/tcp_cong.c

本文主要分析RENO及TCP拥塞控制基础的实现

======================================================================================================

struct sock *sk 和 struct tcp_sock *tp 的转换

在include/linux/tcp.h中,
/*
 * tcp_sk - reinterpret a generic socket pointer as a TCP socket pointer.
 * @sk: socket pointer to convert
 *
 * Returns the very same address as @sk, retyped as struct tcp_sock *.
 * The cast also drops the const qualifier; no checking is performed.
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
        struct tcp_sock *tp = (struct tcp_sock *)sk;

        return tp;
}

给出struct sock *sk,
/* Obtain the TCP-specific view of sk through tcp_sk(). */
struct tcp_sock *tp = tcp_sk(sk) ;

 

tcp_sock结构

/*
 * Excerpt of struct tcp_sock: only the window and congestion-control
 * members discussed in this article are listed; "..." marks omitted members.
 */
struct tcp_sock
{
       ...
 u32 window_clamp ; /* Maximal window to advertise */
 u32 rcv_ssthresh ; /* Current window clamp */
 u32 rcv_wnd ; /* Current receiver window */
       ...
 /* snd_wll records the first sequence number of the segment that caused
  * the most recent send-window update. It is mainly used the next time
  * we decide whether the send window needs to be updated.
  */
 u32 snd_wll ; /* Sequence for window update */ 
 u32 snd_wnd ; /* Send window size, taken directly from the TCP header of segments received from the peer */
 /* max_window: maximal window ever seen from peer (largest window the peer has advertised) */
 /* snd_una: first byte we want an ack for (left edge of the send window) */
 u32 max_window ;  u32 snd_una ; 
        ...
 /*
  * Slow start and congestion control
  */
 u32 snd_ssthresh ; /* Slow start size threshold */
 u32 snd_cwnd ; /* Sending congestion window */
 /* NOTE(review): the original note describes snd_cwnd_cnt as the number of
  * segments already sent within the current congestion window. In the code
  * shown in this article it is a counter that tcp_cong_avoid_ai() and
  * tcp_slow_start() accumulate before growing snd_cwnd - confirm.
  */
 u32 snd_cwnd_cnt ; /* Linear increase counter */ 
 u32 snd_cwnd_clamp ; /* Do not allow snd_cwnd to grow above this */
        ...
 u32 mss_cache ; /* cached effective mss , not including SACKS */
 u32 bytes_acked ; /* Appropriate Byte Counting - RFC3465 */
       ...
}


拥塞避免算法关键部分

/*
 * tcp_cong_avoid_ai - additive-increase step of congestion avoidance.
 * @tp: TCP socket whose congestion window is adjusted
 * @w:  number of calls to accumulate before snd_cwnd grows by one
 *
 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w).
 * In practice snd_cwnd_cnt is bumped on each call; once it has reached @w
 * it is reset and snd_cwnd is incremented, unless snd_cwnd already sits at
 * snd_cwnd_clamp.
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
        /* Still accumulating: count this call and leave cwnd alone. */
        if (tp->snd_cwnd_cnt < w) {
                tp->snd_cwnd_cnt++;
                return;
        }

        /* Threshold reached: restart the counter, grow cwnd if allowed. */
        tp->snd_cwnd_cnt = 0;
        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                tp->snd_cwnd++;
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);


慢启动算法

/*
 * tcp_slow_start - grow the congestion window during slow start.
 * @tp: TCP socket whose congestion window is adjusted
 *
 * Computes an increment (cnt), adds it to tp->snd_cwnd_cnt, and then
 * converts every full tp->snd_cwnd worth of accumulated count into one
 * extra segment of congestion window, never exceeding tp->snd_cwnd_clamp.
 * tp->bytes_acked is reset once the ACK has been accounted for.
 */
void tcp_slow_start(struct tcp_sock *tp)
{
        int cnt; /* increase in packets */

        /* RFC3465: ABC slow start
         * Increase only after a full MSS of bytes is acked
         *
         * TCP sender SHOULD increase cwnd by the number of
         * previously unacknowledged bytes ACKed by each incoming
         * acknowledgment, provided the increase is not more than L
         */
        /* Less than one MSS has been acked so far: nothing to do yet. */
        if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
                return;

        /*
         * The original note asks whether we should be in congestion
         * avoidance here. This branch is still slow start, but "limited":
         * once snd_cwnd exceeds sysctl_tcp_max_ssthresh the per-ACK
         * increment is fixed at max_ssthresh / 2 instead of snd_cwnd, so
         * the window grows more slowly than exponentially.
         */
        if (sysctl_tcp_max_ssthresh > 0 &&
            tp->snd_cwnd > sysctl_tcp_max_ssthresh)
                cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
        else
                cnt = tp->snd_cwnd; /* exponential increase */

        /* RFC3465: ABC
         * We MAY increase by 2 if discovered delayed ack
         */
        /* With delayed ACKs on the receiver, one ACK covers two MSS of data. */
        if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
                cnt <<= 1;

        tp->bytes_acked = 0;
        /* snd_cwnd_cnt grows by cnt (snd_cwnd or 2 * snd_cwnd when not limited). */
        tp->snd_cwnd_cnt += cnt;

        /* Each full snd_cwnd of accumulated count buys one more segment. */
        while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                tp->snd_cwnd_cnt -= tp->snd_cwnd;
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                        tp->snd_cwnd++;
        }
}
EXPORT_SYMBOL_GPL(tcp_slow_start);


代表拥塞算法的结构体

#define TCP_CA_NAME_MAX 16struct tcp_congestion_ops {
        struct list_head list ;
        unsigned long flags ;
        /* initialize private data (optional) */
        void (*init) (struct sock *sk) ;
        /* cleanup private data (optional) */ 
        void (*release) (struct sock *sk) ;
        /* return slow start threshold (required) */
        u32 (*ssthresh) (struct sock *sk) ;
        /* lower bound for congestion window (optional) */
        u32 (*min_cwnd) (const struct sock *sk) ;
        /* do new cwnd calculation (required) */
        void (*cong_avoid) (struct sock *sk , u32 ack , u32 in_flight ) ;
        /* call before changing ca_state (optional) */
        void (*set_state) (struct sock *sk , u8 new_state) ;
        /* call when cwnd event occurs (optional) */
        void (*cwnd_event) (struct sock *sk , enum tcp_ca_event ev) ;
        /* new value of cwnd after loss (optional) */
        u32 (*undo_cwnd) (struct sock *sk) ;
        /* hook for packet ack accounting (optional) */
        void (*pkts_acked) (struct sock *sk , u32 num_acked , s32 rtt_us) ;
        /* get info for inet_diag (optional) */
        void (*get_info) (struct sock *sk , u32 ext , struct sk_buff *skb) ;
        char name[TCP_CA_NAME_MAX] ;
        struct module *owner ;
}

 

在tcp_cong.c中,有全局变量:

/* Limited slow start threshold read by tcp_slow_start(); the limit only applies when this is > 0. */
int sysctl_tcp_max_ssthresh = 0 ;

/* define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) */

/* Taken by tcp_register_congestion_control() around its lookup and list insertion. */
static DEFINE_SPINLOCK( tcp_cong_list_lock ) ;

static LIST_HEAD( tcp_cong_list ) ; // list of TCP congestion control algorithms; its elements are struct tcp_congestion_ops

/*

 BUG_ON( ) ; 如果BUG_ON中的条件为真就调用BUG,它输出一些信息,然后调用panic函数挂起系统。

 char *strncpy( char *dest , const char *src , size_t n ) ;

 它与strcpy的不同之处在于最多只复制n个字符,而不是一直复制到src结尾的'\0'为止。

  当src的长度小于n时,dest内未复制的空间用'\0'填充;否则只复制n个字符到dest,不会追加'\0',这时需要注意自行在dest结尾处补'\0'。

 rcu_read_lock() // 读者在读取由RCU保护的共享数据时使用该函数标记它进入读端临界区。

 rcu_read_unlock() // 该函数与rcu_read_lock配对使用,用以标记读者退出读端临界区。

*/

 

对拥塞控制算法的一些操作(读写增减注册等)

/*
 * Report the current default congestion control algorithm.
 * @name: destination buffer; up to TCP_CA_NAME_MAX bytes are written
 *
 * The default is whichever algorithm sits first in tcp_cong_list. Its name
 * is copied with strncpy(), so @name is zero-padded when the name is
 * shorter than TCP_CA_NAME_MAX and left unterminated when it is not.
 */
void tcp_get_default_congestion_control(char *name)
{
        struct tcp_congestion_ops *first_ca;

        /* reno is always present, so an empty list is a bug */
        BUG_ON(list_empty(&tcp_cong_list));

        rcu_read_lock();
        first_ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
        strncpy(name, first_ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

 

struct sock——representation of sockets

 struct inet_sock——representation of INET sockets

 struct inet_connection_sock——INET connection oriented sockets

 struct tcp_sock——tcp sockets

 以上几种socket越分越细,比如inet_connection_sock是在inet_sock上的扩展,具有自己特有的属性。

 tcp_sock是TCP协议专用的一个socket表示,它是在struct inet_connection_sock基础上进行扩展,主要是增加了滑动窗口协议、拥塞避免算法等一些TCP专有属性。

 

/*
 * Excerpt of struct inet_connection_sock: only the congestion-control
 * related members are shown; "..." marks omitted members.
 */
struct inet_connection_sock {
        ...
        // Pluggable congestion control hook
        const struct tcp_congestion_ops *icsk_ca_ops ; 
        ...

        /* 16 u32 words; presumably private storage for the attached congestion control algorithm - confirm */
        u32 icsk_ca_priv[16] ;
/* Size of icsk_ca_priv in bytes. */
#define ICSK_CA_PRIV_SIZE (16*sizeof(u32))
}

 

举例://有一个初始化了的struct sock *sk

/* Example: print the congestion control algorithm attached to a socket. */
struct inet_connection_sock *icsk = inet_csk( sk ) ;

printk(KERN_INFO "%s" , icsk->icsk_ca_ops->name) ; // name of the congestion control algorithm used by this connection

 

/*
 * Excerpt of struct inet_sock: the address/port members used to match
 * incoming packets to a socket; "..." marks omitted members.
 */
struct inet_sock {
        ...
        /* Socket demultiplex comparisons on incoming packets */
        __be32 inet_daddr ;
        __be16 inet_dport ;
        __be32 inet_saddr ;
        __be16 inet_sport ;
        /* NOTE(review): upstream appears to declare inet_num as __u16 (host byte order), not __be16 - confirm against include/net/inet_sock.h */
        __be16 inet_num ; // local port
        __be32 inet_rcv_saddr ; // Bound local IPv4 addr
        ...
}


 

/*
 * Build the list of non-restricted congestion control algorithms.
 * @buf:    destination buffer; receives the names separated by single spaces
 * @maxlen: size of @buf in bytes
 *
 * Only algorithms whose flags include TCP_CONG_NON_RESTRICTED are listed
 * (presumably the ones usable without extra privilege - confirm). Output
 * that does not fit in @maxlen bytes is truncated.
 */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        /*
         * This is needed: when no algorithm carries the flag, snprintf()
         * is never called and buf would otherwise be left unterminated.
         */
        *buf = '\0';

        rcu_read_lock();
        /* Reader-side walk of an RCU-protected list needs the _rcu iterator. */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                /* skip restricted algorithms */
                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;
                offs += snprintf(buf + offs, maxlen - offs, "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
                /*
                 * snprintf() returns the length it wanted to write, so
                 * offs can pass maxlen on truncation; stop before
                 * maxlen - offs wraps around on the next iteration.
                 */
                if (offs >= maxlen)
                        break;
        }
        rcu_read_unlock();
}


 

/*
 * Look up a registered congestion control algorithm by name.
 * @name: NUL-terminated algorithm name to search for
 *
 * Plain linear scan of tcp_cong_list; the list is not expected to hold
 * many entries. Returns the first entry whose name compares equal, or
 * NULL when there is none.
 */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *pos;

        list_for_each_entry_rcu(pos, &tcp_cong_list, list) {
                if (strcmp(pos->name, name) != 0)
                        continue;
                return pos;
        }

        return NULL;
}


 

/*
 * Add a congestion control algorithm to the list of available options.
 * @ca: algorithm descriptor to register
 *
 * Returns 0 on success, -EINVAL when @ca lacks the mandatory ssthresh or
 * cong_avoid hook, and -EEXIST when an algorithm with the same name is
 * already on the list.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int err;

        /* every algorithm has to provide both ssthresh and cong_avoid */
        if (!(ca->ssthresh && ca->cong_avoid)) {
                printk(KERN_ERR "TCP %s does not implement required ops\n",
                       ca->name);
                return -EINVAL;
        }

        spin_lock(&tcp_cong_list_lock);
        if (!tcp_ca_find(ca->name)) {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                printk(KERN_INFO "TCP %s registered\n", ca->name);
                err = 0;
        } else {
                printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
                /* do not return from here: the lock is still held and must be released below */
                err = -EEXIST;
        }
        spin_unlock(&tcp_cong_list_lock);

        return err;
}


======================================================================================================

 

posted on 2011-12-05 17:10  张大大123  阅读(547)  评论(0)  编辑  收藏  举报

导航