TCP拥塞状态机的实现(中)
内容:本文主要分析TCP拥塞状态机的实现中,虚假SACK的处理、标志丢失数据包的详细过程。
内核版本:2.6.37
作者:zhangskd @ csdn
虚假SACK
state B
如果接收的ACK指向已记录的SACK,这说明记录的SACK并没有反映接收方的真实状态,
也就是说接收方现在已经处于严重拥塞的状态或者在处理上有bug,所以接下来就按照超时
重传的方式去处理。因为按照正常的逻辑流程,接收的ACK不应该指向已记录的SACK,
而应该指向SACK后面未接收的地方。通常情况下,此时接收方已经删除了保存到失序队列中的段。
/* If ACK arrived pointing to a remembered SACK, it means that our remembered * SACKs do not reflect real state of receiver i.e. receiver host is heavily congested * or buggy. * * Do processing similar to RTO timeout. */ static int tcp_check_sack_reneging (struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); /* 记录mib信息,供SNMP使用*/ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); /* 进入loss状态,1表示清除SACKED标志*/ tcp_enter_loss(sk, 1); /* 此函数在前面blog中分析过:)*/ icsk->icsk_retransmits++; /* 未恢复的RTO加一*/ /* 重传发送队列中的第一个数据包*/ tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); /* 更新超时重传定时器*/ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; } return 0; } /** 用于返回发送队列中的第一个数据包,或者NULL * skb_peek - peek at the head of an &sk_buff_head * @list_ : list to peek at * * Peek an &sk_buff. Unlike most other operations you must * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek (const struct sk_buff_head *list_) { struct sk_buff *list = ((const struct sk_buff *) list_)->next; if (list == (struct sk_buff *) list_) list = NULL; return list; } static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); }
tcp_retransmit_skb()用来重传一个数据包。它最终调用tcp_transmit_skb()来发送一个数据包。
这个函数在接下来的blog中会分析。
/*
 * This retransmits one SKB. Policy decisions and retransmit queue
 * state updates are done by the caller. Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb (struct sock *sk, struct sk_buff *skb)
{
	/* The function body is not reproduced in this excerpt. */
}
重设重传定时器
state B
/** inet_connection_sock - INET connection oriented sock * * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmission timeout * @icsk_ca_ops: Pluggable congestion control hook * @icsk_ca_state: Congestion control state * @icsk_ca_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: scheduled timer event * @icsk_ack: Delayed ACK control data */ struct inet_connection_sock { ... unsigned long icsk_timeout; /* 数据包超时时间*/ struct timer_list icsk_retransmit_timer; /* 重传定时器*/ struct timer_list icsk_delack_timer; /* delay ack定时器*/ __u32 icsk_rto; /*超时时间*/ const struct tcp_congestion ops *icsk_ca_ops; /*拥塞控制算法*/ __u8 icsk_ca_state; /*所处拥塞状态*/ __u8 icsk_retransmits; /*还没恢复的timeout个数*/ __u8 icsk_pending; /* 等待的定时器事件*/ ... struct { ... __u8 pending; /* ACK is pending */ unsigned long timeout; /* Currently scheduled timeout */ ... } icsk_ack; /* Delayed ACK的控制模块*/ ... u32 icsk_ca_priv[16]; /*放置拥塞控制算法的参数*/ ... #define ICSK_CA_PRIV_SIZE (16*sizeof(u32)) } #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ /* * Reset the retransmissiion timer */ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) { #ifdef INET_CSK_DEBUG pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); #endif when = max_when; } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; /*数据包超时时刻*/ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { icsk->icsk_ack.pending |= ICSK_ACK_TIMER; icsk->icsk_ack.timeout = jiffies + when; /*Delay ACK定时器超时时刻*/ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); } #ifdef INET_CSK_DEBUG else 
{ pr_debug("%s", inet_csk_timer_bug_msg); } #endif }
添加LOST标志
state C
Q: 我们发现有数据包丢失了,怎么知道要重传哪些数据包呢?
A: tcp_mark_head_lost()通过给丢失的数据包标志TCPCB_LOST,就可以表明哪些数据包需要重传。
如果通过SACK发现有段丢失,则需要从重传队首或上次标志丢失段的位置开始,为记分牌为0的段
添加LOST标志,直到所有被标志LOST的段数达到packets或者被标志序号超过high_seq为止。
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is against sakced cnt, * otherwise it's against fakced cnt. * packets = fackets_out - reordering,表示sacked_out和lost_out的总和。 * 所以,被标志为LOST的段数不能超过packets。 * high_seq : 可以标志为LOST的段序号的最大值。 * mark_head: 为1表示只需要标志发送队列的第一个段。 */ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt, oldcnt; int err; unsigned int mss; /* 被标志为丢失的段不能超过发送出去的数据段数*/ WARN_ON(packets > tp->packets_out); /* 如果已经有标识为丢失的段了*/ if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; /* 下一个要标志的段 */ cnt = tp->lost_cnt_hint; /* 已经标志了多少段 */ /* Head already handled? 如果发送队列第一个数据包已经标志了,则返回 */ if (mark_head && skb != tcp_write_queue_head(sk)) return; } else { skb = tcp_write_queue_head(sk); cnt = 0; } tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; /* 如果遍历到snd_nxt,则停止*/ /* 更新丢失队列信息*/ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt ; /* 标志为LOST的段序号不能超过high_seq */ if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) break; oldcnt = cnt; if (tcp_is_fack(tp) || tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); /* 此段已经被sacked */ /* 主要用于判断退出时机 */ if (cnt > packets) { if ((tcp_is_sack(tp) && !tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= pakcets)) break; mss = skb_shinfo(skb)->gso_size; err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); if (err < 0) break; cnt = packets; } /* 标志动作:标志一个段为LOST*/ tcp_skb_mark_lost(tp, skb); if (mark_head) break; } tcp_verify_left_out(tp); }
涉及变量
/*
 * Excerpt of struct tcp_sock: only the hint members used while marking
 * and retransmitting lost segments are shown.
 */
struct tcp_sock {
	/*
	 * Cached positions in the retransmit queue, kept to speed up
	 * marking operations on that queue.
	 */
	struct sk_buff	*lost_skb_hint;		/* next segment to be marked */
	int		lost_cnt_hint;		/* how many segments have been marked so far */

	struct sk_buff	*retransmit_skb_hint;	/* first packet that is to be retransmitted */
	u32		retransmit_high;	/* highest sequence number in the retransmit queue */
	struct sk_buff	*scoreboard_skb_hint;	/* timed-out packet with the highest sequence number */
};
TCP分片函数tcp_fragment
/*
 * Function to create two new TCP segments. Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list. This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment (struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
	/* The function body is not reproduced in this excerpt. */
}
给一个段添加一个LOST标志
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (! (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); /* 更新重传队列*/ tp->lost_out += tcp_skb_pcount(skb); /*增加LOST的段数*/ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; /* 添加LOST标志*/ } } /* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((tp->retransmit_skb_hint == NULL) || before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; if (! tp->lost_out || after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; }