TCP拥塞控制算法内核实现剖析（二）

内核版本：2.6.37

主要源文件：linux-2.6.37/net/ipv4/tcp_bic.c

本文主要分析BIC算法实现

======================================================================================================

1. 相关结构体和参数

/* BIC TCP Parameters */

/* Per-connection BIC state; stored in the icsk_ca_priv area returned by inet_csk_ca(). */
struct bictcp {
        u32 cnt ; /* increase cwnd by 1 after this many ACKs; recomputed by bictcp_update() */
        u32 last_max_cwnd ; /* last maximum snd_cwnd, recorded by bictcp_recalc_ssthresh() */
        u32 loss_cwnd ; /* congestion window at last loss; 0 until a loss has been recorded */
        u32 last_cwnd ; /* the last snd_cwnd seen by bictcp_update() */
        u32 last_time ; /* time (tcp_time_stamp) when last_cwnd was updated */
        u32 epoch_start ; /* beginning of an epoch; 0 means no epoch has been started */
#define ACK_RATIO_SHIFT 4
        u32 delayed_ack ; /* estimate of the ratio Packets/ACKs, stored << 4 (i.e. scaled by 16) */
} ;

/* Scale factor beta calculation 
 * max_cwnd = snd_cwnd * beta
 *
 * beta is kept as an integer numerator over BICTCP_BETA_SCALE, so
 * beta / BICTCP_BETA_SCALE is the effective fraction.
 */

#define BICTCP_BETA_SCALE 1024 

 
/* In binary search ,
 * go to point (max+min) / N
 */

#define BICTCP_B 4   /* not a true bisection: bictcp_update() steps by 1/4 of the remaining distance */

2. 全局变量

static int fast_convergence = 1 ; /* switch: when non-zero, a loss below last_max_cwnd lowers last_max_cwnd further (see bictcp_recalc_ssthresh) */

static int max_increment = 16 ; /* upper bound on dist in bictcp_update(), so the window never grows too aggressively */

static int low_window = 14 ; /* lower bound on congestion window , for TCP friendliness */

static int beta = 819 ; /* = 819 / 1024(BICTCP_BETA_SCALE) ; factor applied to snd_cwnd on loss, i.e. a multiplicative decrease of about 0.8 */

static int initial_ssthresh ; /* initial slow-start threshold; only applied by bictcp_init() when non-zero */

static int smooth_part = 20 ; /* log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax ; used as cnt = cwnd * smooth_part / BICTCP_B near last_max_cwnd */

/* initial_ssthresh没有显式初始化，默认值为0（表示不设置）；此时snd_ssthresh保持内核的默认值TCP_INFINITE_SSTHRESH = 2^31-1 = 2147483647 */

bictcp结构体保存在：

/* Excerpt only: the other members are elided ("...") and just the
 * congestion-control private area is shown. */
struct inet_connection_sock {

        ...

        u32 icsk_ca_priv[16] ; /* scratch area handed out by inet_csk_ca(); BIC keeps its struct bictcp here */
#define ICSK_CA_PRIV_SIZE (16*sizeof(u32))
}

/* Return the per-socket private area (icsk_ca_priv) reserved for the
 * congestion-control algorithm, as an untyped pointer. */
static inline void *inet_csk_ca(const struct sock *sk)
{
        void *ca_priv = (void *)inet_csk(sk)->icsk_ca_priv;

        return ca_priv;
}

============================================================================================================

tcp_is_cwnd_limited的实现没弄明白

/* Slow start with delack produces 3 packets of burst , so that it is
 * safe "de facto". This will be default - same as the default reordering
 * threshold - but if reordering increases , we must be able to allow 
 * cwnd to burst at least this much in order to not pull it back when 
 * holes are filled.
 */

 static __inline__ __u32 tcp_max_burst ( const struct tcp_sock *sk )
{
        return tp->reordering ;
}
/* u8 reordering ; Packets reordering metric */

/* RFC2861 Check whether we are limited by application or congestion 
 * window . This is the inverse of cwnd check in tcp_tso_should_defer
 *
 * sk:        the socket to inspect
 * in_flight: number of packets currently in flight
 *
 * Returns non-zero when the congestion window is the limiting factor (so it
 * makes sense to grow it), 0 when the sender is not using the whole window.
 */

int tcp_is_cwnd_limited ( const struct sock *sk , u32 in_flight )
{
        const struct tcp_sock *tp = tcp_sk(sk) ;
        u32 left ;

        /* The whole window is already in use: cwnd is what limits us. */
        if( in_flight >= tp->snd_cwnd )
                return 1 ;

        /* Unused part of the window, in packets. */
        left = tp->snd_cwnd - in_flight ;

        /* With GSO, a small leftover (less than 1/tso_win_divisor of cwnd and
         * less than one maximum GSO burst in bytes) still counts as limited. */
        if( sk_can_gso(sk) && 
                left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
                left * tp->mss_cache < sk->sk_gso_max_size )
                return 1 ;

        /* Otherwise we are limited only if the leftover fits in one burst. */
        return left <= tcp_max_burst( tp ) ;
}

=============================================================================================================

3. bictcp拥塞避免

/* Congestion-avoidance hook: grows snd_cwnd on an incoming ACK.
 *
 * Does nothing unless tcp_is_cwnd_limited() reports that the window is the
 * limiting factor. Below snd_ssthresh the generic slow start is used;
 * otherwise ca->cnt is refreshed by bictcp_update() and handed to
 * tcp_cong_avoid_ai().
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);

        /* The sender is not filling the window: leave it alone. */
        if (!tcp_is_cwnd_limited(sk, in_flight))
                return;

        if (tp->snd_cwnd < tp->snd_ssthresh) {
                tcp_slow_start(tp);
                return;
        }

        bictcp_update(ca, tp->snd_cwnd);
        tcp_cong_avoid_ai(tp, ca->cnt);
}

从以上函数可以看出，BIC的慢启动和reno相同。在拥塞避免阶段，当snd_cwnd <= low_window ，两者也采用相同方法。

只有当snd_cwnd > low_window时，BIC才开始显示出它的特性。

在include/net/tcp.h中，

/* TCP timestamps are only 32-bits: the current jiffies value truncated to
 * __u32. This is the clock read by bictcp_update(). */

#define tcp_time_stamp ((__u32)(jiffies))

4. bictcp结构体的更新（BIC算法关键）

/*
 * Compute congestion window to use.
 *
 * Recomputes ca->cnt (the number of ACKs after which cwnd is increased by 1)
 * from the current window cwnd and the window recorded at the last loss,
 * ca->last_max_cwnd. A larger cnt means slower growth.
 */
static inline void bictcp_update( struct bictcp *ca , u32 cwnd )
{
        /* Rate limit: skip the work when cwnd is unchanged and no more than
         * HZ/32 jiffies (1/32 s = 31.25 ms) have passed since the last update. */
        if ( ca->last_cwnd == cwnd &&
                (s32) ( tcp_time_stamp - ca->last_time) <= HZ / 32 )
        return ;

        ca->last_cwnd = cwnd ;
        ca->last_time = tcp_time_stamp ;

        if ( ca->epoch_start == 0 ) /* recording the beginning of an epoch */
                ca->epoch_start = tcp_time_stamp ;

        /* start off normal */
        if( cwnd <= low_window ) {  /* small window: stay TCP friendly */
                ca->cnt = cwnd ;  /* one increment per cwnd ACKs */
                return ;
        }

        /* binary increase */
        if ( cwnd < ca->last_max_cwnd ) {  /* still below the window recorded at the last loss */
                __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B ; /* a quarter of the remaining distance */
                if ( dist > max_increment ) /* linear increase */
                        /* far from last_max_cwnd: growth is capped, one increment per cwnd/max_increment ACKs */
                        ca->cnt = cwnd / max_increment ;
                else if ( dist <= 1U ) /* binary search increase */
                /* very close to last_max_cwnd: cnt = 5*cwnd with the default smooth_part (20/4), so cwnd barely moves */
                        ca->cnt = (cwnd * smooth_part ) / BICTCP_B ; 
                else /* binary search increase */
                /* 1 < dist <= max_increment: one increment per cwnd/dist ACKs */
                        ca->cnt = cwnd / dist ; 
        } else { /* max probing: cwnd has reached or passed last_max_cwnd */
                /* cwnd < ca->last_max_cwnd + 4 */
                if ( cwnd < ca->last_max_cwnd + BICTCP_B ) 
                        /* just above last_max_cwnd: cnt = 5*cwnd with the default smooth_part, very slow growth */
                        ca->cnt = (cwnd * smooth_part ) / BICTCP_B ; 
                else if ( cwnd < ca->last_max_cwnd + max_increment * ( BICTCP_B - 1))
                        /* growth per ACK rises from 4/(3*cwnd) towards 47/(3*cwnd) as cwnd moves away from last_max_cwnd */
                        ca->cnt = (cwnd * (BICTCP_B - 1)) / 
                                  (cwnd - ca->last_max_cwnd) ;
                else 
                        ca->cnt = cwnd / max_increment ;/* far above last_max_cwnd: one increment per cwnd/max_increment ACKs */
        }

        /* if in slow start or link utilization is very low */
        if ( ca->loss_cwnd == 0 ) {  /* no loss recorded since the last reset, so do not grow too slowly */
                if ( ca->cnt > 20 )/* increase cwnd 5% per RTT */
                      ca->cnt = 20 ;
        }

        /* delayed_ack holds the packets-per-ACK estimate scaled by 16, so this
         * divides cnt by that ratio: the fewer ACKs arrive per packet, the
         * fewer ACKs are required for each increment. */
        ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack ;

        /* ca->cnt cannot be zero */
        if ( ca->cnt == 0)
                ca->cnt = 1 ; }

5. 小结：

从以上函数可以看出，和reno相比，BIC在拥塞避免阶段snd_cwnd增长极快。

当ca->last_max_cwnd - snd_cwnd >= 8 时，snd_cwnd增长较快，最快时cnt = cwnd / 16，即每个RTT大约增加max_increment = 16个段。

而当ca->last_max_cwnd - snd_cwnd < 8（即dist <= 1）时，增长率非常低，可以使当前的snd_cwnd维持很长一段时间，

即以最合适的snd_cwnd发送数据。

这两点使BIC在高带宽、长时延的环境下能达到较高的吞吐量。

1. 搜索阶段

(1) cwnd <= last_max_cwnd - 68（即dist > 16），则cnt = cwnd / 16

(2) last_max_cwnd - 68 < cwnd <= last_max_cwnd - 8（即1 < dist <= 16），则cnt = cwnd / dist，其中dist = (last_max_cwnd - cwnd) / 4

(3) last_max_cwnd - 8 < cwnd < last_max_cwnd（即dist <= 1），则cnt = 5*cwnd

总体来说，snd_cwnd增长先快后慢，趋于稳定。

2. max probing阶段

(1) last_max_cwnd <= cwnd < last_max_cwnd + 4，则cnt = 5*cwnd

(2) last_max_cwnd + 4 <= cwnd < last_max_cwnd + 48 ，则cnt = 3*cwnd / (cwnd - last_max_cwnd)

(3) cwnd >= last_max_cwnd + 48 ，则cnt = cwnd / 16

总体来说，snd_cwnd的增长先慢后快，越来越快。

=======================================================================================================

来看一下初始化和重置

/* Put the BIC state back to its initial values: all counters, recorded
 * windows and timestamps are cleared, and the packets-per-ACK estimate
 * starts at 2 (stored scaled by 1 << ACK_RATIO_SHIFT). */
static inline void bictcp_reset(struct bictcp *ca)
{
        ca->cnt = ca->last_max_cwnd = ca->loss_cwnd = 0;
        ca->last_cwnd = ca->last_time = ca->epoch_start = 0;
        ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}

bictcp_reset在两种情况下被调用：初始化时（bictcp_init ）、进入拥塞处理时（bictcp_state 状态为TCP_CA_Loss）。

/* Initialise the BIC state of a socket.
 *
 * Resets the private struct bictcp and, when initial_ssthresh is non-zero,
 * uses it as the socket's slow-start threshold. When it is zero,
 * snd_ssthresh is left untouched.
 */
static void bictcp_init( struct sock *sk )
{
        bictcp_reset( inet_csk_ca( sk) ) ;

        /* Only override the threshold when a value was configured. */
        if ( initial_ssthresh ) 
                tcp_sk(sk)->snd_ssthresh = initial_ssthresh ;
}

=============================================================================================================

慢启动阈值调整

我们知道，对一个拥塞控制算法而言，有两个函数必不可少，除了上面分析过的bictcp_cong_avoid（拥塞避免），还有

bictcp_recalc_ssthresh（慢启动阈值重新计算）。RENO只是简单的把发生拥塞时的窗口除以2，而BIC则增加了一些东西。

/*
 * behave like Reno until low_window is reached , 
 * then increase congestion window slowly
 *
 * Called to compute the new slow-start threshold. Also ends the current
 * epoch and records the window at which the loss happened in
 * ca->last_max_cwnd and ca->loss_cwnd.
 *
 * Returns the new threshold, never less than 2.
 */
static u32 bictcp_recalc_ssthresh( struct sock *sk )
{
        const struct tcp_sock *tp = tcp_sk(sk) ;
        struct bictcp *ca = inet_csk_ca(sk) ;

        ca->epoch_start = 0 ; /* end of epoch */

        /* Wmax and fast convergence:
         * a loss below the previous maximum lowers the remembered maximum
         * further, to snd_cwnd * (1024 + beta) / 2048 (about 0.9 * snd_cwnd
         * with the default beta); otherwise the current window becomes the
         * new maximum.
         */
        if ( tp->snd_cwnd < ca->last_max_cwnd && fast_convergence )
                ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta ))
                                                       / ( 2 * BICTCP_BETA_SCALE ) ; 
        else
                ca->last_max_cwnd = tp->snd_cwnd ;

        ca->loss_cwnd = tp->snd_cwnd ;

        /* Small windows are halved, as the header comment says Reno does. */
        if ( tp->snd_cwnd <= low_window )
                return max( tp->snd_cwnd >> 1U , 2U ) ;
        else
                /* Larger windows only shrink to beta/1024 (about 0.8 by default) of snd_cwnd. */
                return max( (tp->snd_cwnd * beta) / BICTCP_BETA_SCALE , 2U ) ;
}

bictcp_recalc_ssthresh做了两件事：重赋值last_max_cwnd、返回新的慢启动阈值。

特别值得注意的是，snd_ssthresh = 0.8 * snd_cwnd 。这个可比RENO的snd_ssthresh = 0.5 * snd_cwnd 大了很多。

所以说BIC能够更有效的利用大带宽。

=======================================================================================================
计算delayed packets ratio

/* Track delayed acknowledgement ratio using sliding window
 * ratio = (15*ratio + sample) / 16
 *
 * delayed_ack stores 16 * ratio, so the update is
 * delayed_ack = delayed_ack - delayed_ack / 16 + cnt,
 * where cnt is the sample passed in by the caller. The estimate is only
 * updated while the connection is in TCP_CA_Open; rtt is not used.
 */

static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct bictcp *ca;

        if (icsk->icsk_ca_state != TCP_CA_Open)
                return;

        ca = inet_csk_ca(sk);
        ca->delayed_ack += cnt - (ca->delayed_ack >> ACK_RATIO_SHIFT);
}

在struct inet_connection_sock中，有__u8 icsk_ca_state，表示拥塞控制的状态。

在tcp.h中，

/* Congestion-control states, as stored in icsk_ca_state. Each state has a
 * matching TCPF_* bit mask equal to 1 << state. */
enum tcp_ca_state {
	TCP_CA_Open = 0,	/* the only state in which bictcp_acked() updates delayed_ack */
#define TCPF_CA_Open	(1<<TCP_CA_Open)
	TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
	TCP_CA_CWR = 2,
#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
	TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
	TCP_CA_Loss = 4		/* entering this state makes bictcp_state() reset the BIC data */
#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
};

============================================================================================================

/* Window to use when a reduction is undone: the larger of the current
 * snd_cwnd and the maximum remembered in last_max_cwnd. */
static u32 bictcp_undo_cwnd(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct bictcp *ca = inet_csk_ca(sk);
        u32 restored = max(tp->snd_cwnd, ca->last_max_cwnd);

        return restored;
}

此函数在撤销一次拥塞窗口的减小时调用（内核发现之前的丢包判断是误判，经tcp_undo_cwr调用），而下面的bictcp_state则是在拥塞状态切换时调用，BIC只在进入TCP_CA_Loss时重置。

/* State-change hook: the BIC data is reset when the connection enters
 * TCP_CA_Loss; every other new state is ignored. */
static void bictcp_state(struct sock *sk, u8 new_state)
{
        if (new_state != TCP_CA_Loss)
                return;

        bictcp_reset(inet_csk_ca(sk));
}

============================================================================================================

bictcp算法结构体

/* Operations table that plugs the BIC callbacks into the TCP stack; it is
 * passed to tcp_register_congestion_control() under the name "bic". */
static struct tcp_congestion_ops bictcp = {
        .init       = bictcp_init,
        .ssthresh   = bictcp_recalc_ssthresh,
        .cong_avoid = bictcp_cong_avoid,
        .set_state  = bictcp_state,
        .undo_cwnd  = bictcp_undo_cwnd,
        .pkts_acked = bictcp_acked,
        .owner      = THIS_MODULE,
        .name       = "bic",
};

bictcp注册函数

/* Module init: register BIC with the TCP stack.
 * Returns whatever tcp_register_congestion_control() returns. */
static int __init bictcp_register(void)
{
        /* Compile-time check: struct bictcp must fit in the private area
         * of ICSK_CA_PRIV_SIZE bytes (16 u32 words). */
        BUILD_BUG_ON( sizeof( struct bictcp ) > ICSK_CA_PRIV_SIZE ) ;
        return tcp_register_congestion_control( &bictcp ) ;
}

OK，关于BIC的代码分析告一段落，接下来看看相关函数是在什么样的情况下，以什么顺序来调用的。

======================================================================================================

BIC函数的调用时机

1. 连接每收到一个ack，则调用tcp_ack

2. tcp_ack会调用bictcp_acked，用来更新delayed_ack（用来消除delay包的影响）；其参数cnt是本次被确认的包数，并不是bictcp结构体中的cnt

3. tcp_ack会调用bictcp_cong_avoid，这时分两种情况：

（1）snd_cwnd小于慢启动阈值，处于慢启动阶段，则调用tcp_slow_start

（2）snd_cwnd大于慢启动阈值，处于拥塞避免阶段，则调用bictcp_update来更新bictcp，再调用tcp_cong_avoid_ai

4. tcp_ack中如果检测到丢包，进入拥塞处理阶段，则调用bictcp_recalc_ssthresh来更新慢启动阈值。

5. tcp_ack中如果发现之前的拥塞判断是误判（例如通过DSACK或时间戳检测到虚假重传），则撤销窗口的减小，调用bictcp_undo_cwnd来恢复snd_cwnd。

快速重传：tcp_ack中的丢包检测，即检测到连续3个重复ACK。

窗口撤销（undo）：bictcp_undo_cwnd，直接把snd_cwnd恢复为max(snd_cwnd，last_max_cwnd)，和掉包前相差不大；而正常的快速恢复结束时，snd_cwnd被限制为不超过snd_ssthresh。

更具体的可以看看tcp_ack（net/ipv4/tcp_input.c）。