TCP拥塞控制算法内核实现剖析(四)

 
内核版本:2.6.37 

主要源文件:linux-2.6.37/ net/ ipv4/ tcp_cubic.c

本文主要分析cubic的实现


======================================================================================================


全局变量和宏定义

/* Reno-friendliness scale factor; computed from beta in cubictcp_register() */
static u32 beta_scale  __read_mostly ;

/* Cubic constant c expressed as bic_scale / 2^10 (41 / 1024 ~= 0.04) */
static int bic_scale  __read_mostly = 41 ;

/* Scale applied before taking the cube root for K; computed in cubictcp_register() */
static u64 cube_factor  __read_mostly ;

/* c / rtt scaled by 2^10; computed in cubictcp_register() */
static u32 cube_rtt_scale __read_mostly ;

static int tcp_friendliness __read_mostly = 1 ; /* enable the TCP-friendliness check */

static int beta __read_mostly = 717 ; /* = 717/1024 (BICTCP_BETA_SCALE) */

#define BICTCP_HZ 10 /* bictcp_HZ = 2^10 = 1024 */

/* Scale factor beta calculation 

 * max_cwnd = snd_cwnd * beta

 */

#define BICTCP_BETA_SCALE 1024 

 

beta的改变(在BIC中为819)会导致在bictcp_recalc_ssthresh中,如果启用fast convergence,

那么last_max_cwnd = 0.85*snd_cwnd ,而慢启动阈值=0.7*snd_cwnd 。这样会导致更早的

到达平衡值,对snd_cwnd有很大的影响。

 

do_div函数

unsigned long long x , result ; /* x: 64-bit dividend, overwritten with the quotient */

unsigned int y ; /* divisor: do_div() takes a 32-bit divisor, not a 64-bit one */

unsigned long mod ;

/* do_div() divides x in place and returns the remainder */
mod = do_div( x , y ) ;

result = x ;

64 bit division 结果保存在x中,余数保存在返回值中。

 

部分变量的初始化

/*
 * Module init function: precompute the fixed-point scaling factors used on
 * the per-packet path (all based on a constant SRTT of 100ms) and register
 * the cubictcp congestion control.
 *
 * Returns whatever tcp_register_congestion_control() returns.
 */
static int __init cubictcp_register(void)
{
        /* 1024 * c / rtt, where c = bic_scale >> 10 = 41 / 2^10 ~= 0.04 and
         * rtt = 100ms = 0.1s.  That gives bic_scale * 10 = 410, i.e.
         * c / rtt ~= 0.4 in 2^10 fixed point.
         */
        const int c_over_rtt = bic_scale * 10 ;

        /* numerator of beta_scale: 8 * (1024 + 717) with the default beta */
        const int beta_num = 8 * ( BICTCP_BETA_SCALE + beta ) ;

        /* build-time check: struct bictcp must not exceed ICSK_CA_PRIV_SIZE */
        BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE) ;

        /* beta_scale = 8 * (1024 + 717) / 3 / (1024 - 717), roughly 15 */
        beta_scale = beta_num / 3 / ( BICTCP_BETA_SCALE - beta ) ;

        cube_rtt_scale = c_over_rtt ;

        /* calculate the "K" for (Wmax - cwnd) = c / rtt * K^3
         * so K = cubic_root( (Wmax - cwnd) * rtt / c )
         * the unit of K is bictcp_HZ = 2^10 , not HZ
         * c = bic_scale >> 10
         * rtt = 100ms
         * the following code has been designed and tested for
         * cwnd < 1 million packets
         * RTT < 100 seconds
         * HZ < 1,000,00 (the upstream comment says "corresponding to
         * 10 nano-second"; a tick at that rate would be 10 microseconds --
         * NOTE(review): confirm against upstream)
         */

        /* cube_factor = 2^(10 + 3*BICTCP_HZ) / (bic_scale * 10):
         * 2^10 undoes the scaling of c / rtt, and 2^(3*BICTCP_HZ) converts
         * K^3 into bictcp_HZ units, so cube_factor = 2^30 * rtt / c.
         */
        cube_factor = 1ull << (10+3*BICTCP_HZ) ; /* 2^40 */

        /* divide by bic_scale and by constant Srtt(100ms) */
        do_div(cube_factor , c_over_rtt ) ;

        return tcp_register_congestion_control(&cubictcp) ;
}

 

cubictcp_register是模块初始化函数,主要用来注册cubic算法、初始化部分全局变量,这些全局变量主要用于计算。

==========================================================================================================

bictcp结构体

/* Cubic TCP Parameters: per-connection state of the CUBIC algorithm */
struct bictcp {
        u32 cnt ; /* increase cwnd by 1 after this many ACKs; paces snd_cwnd growth */
        u32 last_max_cwnd ; /* last maximum snd_cwnd */
        u32 loss_cwnd ; /* congestion window at last loss */
        u32 last_cwnd ; /* the last snd_cwnd */
        u32 last_time ; /* time when updated last_cwnd */

        /* origin point of bic function, i.e. the new Wmax:
         * the larger of last_max_cwnd and the current cwnd
         */
        u32 bic_origin_point ; 

        /* time to origin point from the beginning of the current epoch,
         * i.e. the time K for which W(K) = Wmax
         */
        u32 bic_K ; 

        u32 delay_min ; /* min delay (smallest RTT sample, stored as jiffies << 3) */
        u32 epoch_start ; /* beginning of an epoch; 0 means a new epoch starts on the next update */
        u32 ack_cnt ; /* number of acks */

        /* estimated tcp cwnd: the window a Reno-style flow would have */
        u32 tcp_cwnd ;
#define ACK_RATIO_SHIFT 4
        /* estimate the ratio of Packets / ACKs << 4 */
        u16 delayed_ack ; 

        u8 sample_cnt ; /* number of samples to decide curr_rtt */
        u8 found ; /* the exit point is found? bit mask of HYSTART_* flags, 0 if not found */
        u32 round_start ; /* beginning of each round (one round per RTT) */
        u32 end_seq ; /* end seq of the round, used to delimit each RTT */
        u32 last_jiffies ; /* last time when the ACK spacing is close (gaps over 2ms break the train) */
        u32 curr_rtt ; /* the minimum rtt of current round, i.e. the smallest of its samples */
} ;


 

Cubic计算关键

/*
 * Compute congestion window to use.
 *
 * Updates ca->cnt (the number of ACKs needed before cwnd grows by one) from
 * the cubic growth function W(t) = C * (t - K)^3 + Wmax, then caps it with
 * the TCP-friendliness estimate when that is enabled.
 *
 * @ca:   per-connection CUBIC state; ca->cnt is the result
 * @cwnd: current congestion window
 */
static inline void bictcp_update( struct bictcp *ca , u32 cwnd )
{
        u64 offs ; /* time distance | t - K |, in bictcp_HZ units */

        /* delta: window difference; bic_target: predicted cwnd; t: time predicted for */
        u32 delta , t , bic_target , max_cnt ;

        ca->ack_cnt ++ ; /* count the number of ACKs */

        /* cwnd unchanged and last update at most HZ / 32 ticks ago: keep cnt */
        if ( ca->last_cwnd == cwnd && (s32) ( tcp_time_stamp - ca->last_time) <= HZ / 32 )
               return ;

        ca->last_cwnd = cwnd ;
        ca->last_time = tcp_time_stamp ;

        /* epoch_start == 0 marks a new epoch (per the article: after a loss) */
        if ( ca->epoch_start == 0 ) {
                /* record the beginning of an epoch; this must be an
                 * assignment, otherwise every call would restart the epoch
                 */
                ca->epoch_start = tcp_time_stamp ; 
                ca->ack_cnt = 1 ; /* start counting */
                ca->tcp_cwnd = cwnd ; /* syn with cubic */

                /* use max(last_max_cwnd , cwnd) as the current Wmax */
                if ( ca->last_max_cwnd <= cwnd ) {
                        ca->bic_K = 0 ;
                        ca->bic_origin_point = cwnd ;
                } else {
                        /* Compute new K
                         * cube_factor = 2^40 / (41*10) = 2^30 / ( C*10 ) = 2^30 / 0.4
                         * so bic_K comes out in units of 1 / 1024 second
                         * rather than seconds.
                         */
                        ca->bic_K = cubic_root ( cube_factor * 
                              ( ca->last_max_cwnd - cwnd )) ;
                        ca->bic_origin_point = ca->last_max_cwnd ;
                }
        }

        /* cubic function - calc
         * calculate c * time^3 / rtt ,
         * while considering overflow in calculation of time^3
         * (so time^3 is done by using 64 bit )
         * and without the support of division of 64 bit numbers
         * (so all divisions are done by using 32 bit )
         * also NOTE the unit of those variables
         * time = ( t - K ) / 2^BICTCP_HZ
         * c = bic_scale >> 10 = 0.04
         * Constant = c / srtt = 0.4
         * The following code does not have overflow problems,
         * if the cwnd < 1 million packets !!!
         * The prediction is made for (ca->delay_min >> 3) ticks from now.
         */
        /* change the unit from HZ to BICTCP_HZ */
        t = (( tcp_time_stamp + (ca->delay_min >> 3) - ca->epoch_start ) 
                    << BICTCP_HZ ) / HZ ;

        /* offs = | t - bic_K | */
        if ( t < ca->bic_K ) /* Wmax not reached yet */
                offs = ca->bic_K - t ;
        else
                offs = t - ca->bic_K ; /* already past Wmax */

        /* delta = | W(t) - W(bic_K) |
         * cube_rtt_scale = (bic_scale * 10) = c / srtt * 2^10 , c / srtt = 0.4
         */
        delta = (cube_rtt_scale * offs * offs * offs ) >> (10 + 3*BICTCP_HZ) ;

        /* bic_target is the predicted cwnd */
        if ( t < ca->bic_K )
                bic_target = ca->bic_origin_point - delta ;
        else
                bic_target = ca->bic_origin_point + delta ;

        /* cubic function - calc bictcp_cnt */
        if ( bic_target > cwnd ) {
                /* the larger the gap, the smaller cnt and the faster the growth */
                ca->cnt = cwnd / ( bic_target - cwnd ) ; 
        } else {
                /* very small increment: cwnd is already at or past the
                 * target, so slow down (scaled from cwnd, not from cnt)
                 */
                ca->cnt = 100 * cwnd ; 
        }

        /* TCP Friendly - if cubic is slower than Reno, speed up cwnd growth
         * by lowering cnt.  Counting from the start of the epoch, a window
         * growing by 3B / (2 - B) per RTT gives the Reno-equivalent cwnd:
         * cwnd (RENO) = cwnd + 3B / (2 - B) * ack_cnt / cwnd
         * where B is the multiplicative decrease factor (0.3 per the article).
         */
        if ( tcp_friendliness ) {
                u32 scale = beta_scale ;
                delta = ( cwnd * scale ) >> 3 ; /* ACKs needed for one tcp_cwnd increment */
                while ( ca->ack_cnt > delta ) { /* update tcp_cwnd */
                        ca->ack_cnt -= delta ;
                        ca->tcp_cwnd++ ;
                }

                if ( ca->tcp_cwnd > cwnd ) { /* if bic is slower than tcp */
                        delta = ca->tcp_cwnd - cwnd ;
                        max_cnt = cwnd / delta ;
                        if ( ca->cnt > max_cnt ) 
                                ca->cnt = max_cnt ;
                }
        }

        /* compensate for delayed ACKs: delayed_ack is Packets / ACKs << ACK_RATIO_SHIFT */
        ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack ;

        if ( ca->cnt == 0 )  /* cannot be zero */
                ca->cnt = 1 ; /* cwnd is far below bic_target: fastest growth */
}

 

 

关于时间

原来时间:本来应该是jiffies/HZ,单位为秒。但是秒是一个非常大的单位,影响计算精确度。

所以计算时,先把时间放大2^10 = 1024倍,即放大所谓的BICTCP_HZ倍。

换算后时间:t * 2^BICTCP_HZ / HZ,BICTCP_HZ = 10,单位为1 / 1024 秒。

 

函数关键点

1.  我们最终要得到的是ca->cnt,用来控制snd_cwnd的增长。

2.  ca->cnt的值,是根据cwnd和w( t + after ) 的大小来判断的。w( t + after )即bic_target,它表示我们预期的

在经过after时间后的snd_cwnd。如果此时cwnd < w( t + after ),那么我们就快速增加窗口,达到预期目标。

如果cwnd > w( t + after ),那说明我们已经增加过快了,需要降速了,这样才能达到预期目标。

              cwnd / (bic_target - cwnd )   /* bic_target > cwnd */

cnt =      

             100 * cwnd   /* bic_target <= cwnd */

3.  cwnd是传入的参数,已知。现在我们只需要计算bic_target。

而根据Cubic的窗口增长函数:W(t) = C(t - K)^3 + Wmax,

我们要计算时间( 当前 + after ),以及时间K。时间K即bic_K,表示函数值为Wmax所对应的时间。

通过代码可以发现,after为min RTT,即连接的传播时延。

4.  然后就是bic_K和t的计算了,详细可看上面代码。

 

友好性

通过tcp_friendliness可启用。我们知道RENO采用AIMD,其中a=1,B=0.5。在某些条件下,在同一时间点,

cwnd( Cubic ) < cwnd( Reno ),这说明Cubic此时比Reno慢。友好性函数在这种情况下被调用,用来加快

连接的cwnd的增长速度,即减小cnt。

那么怎么算采用Reno时的cwnd呢?以上次丢包以后的时间t算起,每次RTT增长3B / ( 2 - B),那么就和采用

Reno有着相同的平均cwnd,即从效果上等同于采用Reno。

=========================================================================================================

 Cubic和BIC

Cubic is an enhanced version of BIC-TCP. It simplifies the BIC-TCP window control and improves

its TCP-friendliness and RTT-fairness.

Cubic uses a cubic increase function in terms of the elapsed time since the last loss event. In order

to provide fairness to Standard TCP, Cubic also behaves like Standard TCP when the cubic window

growth function is slower than Standard TCP. Furthermore, the real-time nature of the protocol keeps

the window growth rate independent of RTT, which keeps the protocol TCP friendly under both short

and long RTT paths.

总的来说,Cubic相比于BIC,在稳定性、友好性、RTT公平性方面得到了改善。

 =========================================================================================================

 Hybrid Start ( hystart )

Hybrid Start finds a "safe" exit point of slow start at which slow start can finish and safely move to

congestion avoidance without causing any heavy packet losses.

HyStart uses ACK trains and RTT delay samples to detect whether

(1) the forward path is congested

(2) the current size of congestion window has reached the available capacity of the

forward path.

 

Hybrid Slow Start effectively prevents the overshooting of slow start while maintaining the full

utilization of the network.

While keeping the exponential growth of slow start , HyStart finds the "exit" point where it can

safely finish the exponential growth before overshooting and move to congestion avoidance.

Especially with different operating system receivers, HyStart improves the start-up throughput

of TCP more than 2 to 3 times.

 

Slow Start存在的问题

The exponential growth of cwnd results in burst packet losses.

This overshoot causes strong perturbation in networks, resulting in low utilization of networks.

Furthermore, long bursts of packet losses caused by the overshoot also add a lot of burden

on the end systems for loss recovery and this burden often translates into consecutive timeouts

and a long blackout period of no transmission.

 

SACK存在问题

The selective acknowledgement option relieves some of these burdens.

However, for a large BDP network, the processing overhead of SACK information at the end

points can be quite overwhelming because each SACK block invokes a search into the large

packet buffers of the sender for the acknowledged packets in the block, and every recovery of

a lost packet causes the same search at the receiver.

Multiple packet losses cause almost 100% CPU utilization or a reduced number of SACKs

extremely slows down loss recovery resulting in a blackout period of no transmission over

100 seconds.

 

Processing Overload during Slow Start

System overload is frequently observed during slow start, and it is followed by multiple

timeouts or long blackouts of no transmission.

Both flows have almost zero throughput for 30 seconds after a timeout. These blackouts

follow after the saturation of the CPU utilization at the senders and receivers. When the

system reaches the overload, it cannot react fast enough to perform loss recovery, which

results in timeouts. Repeated losses of retransmitted packets during timeouts also cause

back-to-back timeouts with exponential backoff of RTO timers where the senders do not

transmit any packets.

 

=================================================================================================

 HyStart实现

 由两种方法组成。Both run independently at the same time and slow start exits when any of them

detects an exit point.

1. ACK train length

2. Delay increase

 

/* Two methods of hybrid slow start */

#define HYSTART_ACK_TRAIN 0x1

#define HYSTART_DELAY 0x2

 

/* Number of delay samples for detecting the increase of delay */

#define HYSTART_MIN_SAMPLES 8

#define HYSTART_DELAY_MIN (2U<<3) /* lower bound of the delay-increase threshold: 2, scaled by << 3 (= 16) */

#define HYSTART_DELAY_MAX (16U<<3) /* upper bound of the delay-increase threshold: 16, scaled by << 3 (= 128) */

/* Clamp the delay-increase threshold.  The bounds are in the same << 3

 * scaled unit that bictcp_acked() uses for delay samples.  With delay_min

 * below written WITHOUT the << 3 scaling (ms if one jiffy is 1ms -- TODO

 * confirm HZ), a caller passing the scaled delay_min >> 4 gets:

 * delay_min <= 32        -> threshold = 2

 * 32 < delay_min < 256   -> threshold = delay_min / 16

 * delay_min >= 256       -> threshold = 16

 */

#define HYSTART_DELAY_THRESH(x) clamp(x , HYSTART_DELAY_MIN , HYSTART_DELAY_MAX)

 

static int fast_convergence __read_mostly = 1 ; /* fast convergence switch */

static int initial_ssthresh __read_mostly ; /* initial slow start threshold */

 

static int hystart __read_mostly = 1 ; /* HyStart on/off switch */

/* Which HyStart detection methods are enabled

 * 1: packet-train

 * 2: delay

 * 3: both packet-train and delay

 * Both methods are used by default, i.e. the value is 3

 */

static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY ;

/* HyStart is only applied once snd_cwnd is at least this value (checked in bictcp_acked) */

 static int hystart_low_window __read_mostly = 16 ;

 

 算法初始化

 /*
  * Start a new HyStart round for @sk: stamp the round start and the last
  * ACK time with the current jiffies, mark the end of the round at snd_nxt,
  * and clear the per-round RTT minimum and sample counter.
  * ca->found is deliberately left untouched here.
  */
 static inline void bictcp_hystart_reset( struct sock *sk )
{
        struct tcp_sock *tp = tcp_sk(sk) ;
        struct bictcp *ca = inet_csk_ca(sk) ;

        ca->round_start = ca->last_jiffies = jiffies ;
        ca->end_seq = tp->snd_nxt ;
        ca->curr_rtt = 0 ;
        ca->sample_cnt = 0 ;
}

 

bictcp_hystart_reset在3种情况被调用:

1. 连接初始化(bictcp_init ,参数为struct sock *sk ,针对每个连接调用,而不是模块初始化)

2. 拥塞避免(bictcp_cong_avoid )

        if ( hystart && after( ack , ca->end_seq)) /* 一个RTT结束了,开始新的一个RTT */

                bictcp_hystart_reset(sk) ;

3. 丢包处理(bictcp_state )

        if ( new_state == TCP_CA_Loss ) {

                bictcp_reset( inet_csk_ca( sk ) ) ;

                bictcp_hystart_reset(sk) ;

        }

 

注意:bictcp_hystart_reset中并没有对ca->found置0。也就是说,只有在一开始或者丢包时,

HyStart才会派上用场,其它时间并不使用。

 

/*
 * Initialise the CUBIC state attached to @sk.
 *
 * The private state is always reset.  With HyStart enabled a fresh HyStart
 * round is started; otherwise a non-zero initial_ssthresh is installed as
 * the socket's slow start threshold.
 */
static void bictcp_init( struct sock *sk )
{
        struct bictcp *ca = inet_csk_ca(sk) ;

        bictcp_reset(ca) ;

        if ( hystart ) {
                bictcp_hystart_reset(sk) ;
                return ;
        }

        /* HyStart disabled: fall back to the configured initial threshold */
        if ( initial_ssthresh )
                tcp_sk(sk)->snd_ssthresh = initial_ssthresh ;
}


/* calculate the cubic root of x using a table lookup followed by one

 * Newton - Raphson iteration.

 * Avg err ~= 0.195%

 */

static u32 cubic_root( u64 a ) ; /* computes the cube root; implementation omitted in this article */

 

/*
 * ACK hook: update the packets-per-ACK estimate, track the minimum delay
 * and, while snd_cwnd is between hystart_low_window and snd_ssthresh, feed
 * the delay sample to HyStart.
 *
 * @sk:     socket the ACK arrived on
 * @cnt:    presumably the number of packets covered by this ACK -- TODO confirm
 * @rtt_us: RTT sample in microseconds; negative when no sample is available
 */
static void bictcp_acked( struct sock *sk , u32 cnt , s32 rtt_us )
{
        const struct inet_connection_sock *icsk = inet_csk(sk) ;
        const struct tcp_sock *tp = tcp_sk(sk) ;
        struct bictcp *ca = inet_csk_ca(sk) ;
        u32 delay ; /* RTT of this sample, in jiffies << 3 */

        /* running estimate of Packets / ACKs, kept << ACK_RATIO_SHIFT */
        if ( icsk->icsk_ca_state == TCP_CA_Open ) {
                cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT ;
                ca->delayed_ack += cnt ;
        }

        /* Some calls are for duplicates without timestamps */
        if ( rtt_us < 0 )
                return;

        /* Discard delay samples right after fast recovery:
         * nothing is sampled during the first HZ ticks (1s) of an epoch.
         * The cast applies to the time difference, not to the comparison.
         */
        if ( (s32) (tcp_time_stamp - ca->epoch_start) < HZ )
                return;

        /* the sample is kept scaled up by 8; users shift it back down */
        delay = usecs_to_jiffies(rtt_us) << 3 ;
        if ( delay == 0 )
                delay = 1 ;

        /* first time call or link delay decreases */
        if ( ca->delay_min == 0 || ca->delay_min > delay )
                ca->delay_min = delay ;

        /* hystart triggers when cwnd is larger than some threshold */
        if ( hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
                tp->snd_cwnd >= hystart_low_window )
                hystart_update( sk , delay ) ;
}


 

HyStart核心部分

/*
 * HyStart core: look for a slow-start exit point using the ACK-train length
 * and the delay increase, and leave slow start as soon as an enabled
 * detector fires.
 *
 * @sk:    socket being updated
 * @delay: current RTT sample, in the same << 3 scaled unit as ca->delay_min
 */
static void hystart_update( struct sock *sk , u32 delay )
{
        struct tcp_sock *tp = tcp_sk(sk) ;
        struct bictcp *ca = inet_csk_ca(sk) ;

        /* only while no enabled detector has found the exit point yet */
        if ( ! ( ca->found & hystart_detect ) ) {
                u32 curr_jiffies = jiffies ;

                /* ACK train detection:
                 * ACKs at most 2ms apart belong to the same train
                 */
                if ( curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) {
                        ca->last_jiffies = curr_jiffies ;
                        /* delay_min is << 3 scaled, so >> 4 is half the minimum delay */
                        if ( curr_jiffies - ca->round_start >= ca->delay_min >> 4 )
                                ca->found |= HYSTART_ACK_TRAIN ;
                }

                /* HYSTART delay
                 * obtain the minimum delay of more than sampling packets 
                 */
                if ( ca->sample_cnt < HYSTART_MIN_SAMPLES ) {
                        if ( ca->curr_rtt == 0 || ca->curr_rtt > delay )
                                ca->curr_rtt = delay ;
                        ca->sample_cnt ++ ;
                } else {
                        /* round minimum exceeds delay_min by the clamped threshold */
                        if ( ca->curr_rtt > ca->delay_min + 
                                HYSTART_DELAY_THRESH(ca->delay_min>>4))
                                ca->found |= HYSTART_DELAY ;
                }

                /* Either one of two conditions are met ,
                 * we exit from slow start immediately.
                 */
                if ( ca->found & hystart_detect )
                        tp->snd_ssthresh = tp->snd_cwnd ;
        }
}


可以看到,其实算法的代码很少,但是要清晰的理解该算法,还是应该看看算法的论文。

OK,关于Cubic的分析就此告一段落了↖(^ω^)↗

posted on 2011-12-22 16:36  张大大123  阅读(513)  评论(0编辑  收藏  举报

导航