【Linux 内核网络协议栈源码剖析】数据包接收(含TCP协议状态变换)
http://blog.csdn.net/wenqian1991/article/details/46731357
接收数据包函数,release_sock 函数是在 sock.c中,该函数是在 inet socket 层,其内部的数据结构为 sock 结构
值得说明的是:虽然该函数放在connect篇幅里介绍,但是这个函数是一个通用的数据包接收函数,即数据通信阶段都要用的函数,不是单一属于connect,connect函数因为涉及到数据包的传输,所以连带在这里介绍了
该release_sock 函数被 tcp_connect 函数最后调用,用于接收数据包
- // While a socket is busy (sk->inuse set), tcp_rcv() parks incoming packets on sk->back_log instead of processing them.
- // release_sock() drains sk->back_log by re-invoking sk->prot->rcv (redo argument = 1) for every queued packet,
- // then clears sk->blog and sk->inuse; for a dead socket in TCP_CLOSE it additionally arms the TIME_DONE timer.
- void release_sock(struct sock *sk)
- {
- unsigned long flags;
- #ifdef CONFIG_INET
- struct sk_buff *skb;
- #endif
- if (!sk->prot)
- return;
- /*
- * Make the backlog atomic. If we don't do this there is a tiny
- * window where a packet may arrive between the sk->blog being
- * tested and then set with sk->inuse still 0 causing an extra
- * unwanted re-entry into release_sock().
- */
- save_flags(flags);// save the current flags before cli()
- cli();
- if (sk->blog)
- {
- restore_flags(flags);
- return;
- }
- sk->blog=1;
- sk->inuse = 1;// mark the socket busy; tcp_rcv() queues to back_log while this is set
- restore_flags(flags);// restore the saved flags
- #ifdef CONFIG_INET
- /* See if we have any packets built up. */
- // Replay every packet queued on back_log through the protocol's receive handler
- while((skb = skb_dequeue(&sk->back_log)) != NULL)
- {
- sk->blog = 1;// re-assert the backlog-in-progress flag on every iteration
- if (sk->prot->rcv)
- // protocol receive handler (tcp_rcv for TCP sockets); the 7th argument (1) is its redo flag
- sk->prot->rcv(skb, skb->dev, sk->opt,
- skb->saddr, skb->len, skb->daddr, 1,
- /* Only used for/by raw sockets. */
- (struct inet_protocol *)sk->pair);
- }
- #endif
- sk->blog = 0;
- sk->inuse = 0;
- #ifdef CONFIG_INET
- if (sk->dead && sk->state == TCP_CLOSE) // dead socket already in TCP_CLOSE: schedule TIME_DONE (presumably final teardown - confirm in the timer code)
- {
- /* Should be about 2 rtt's */
- // deferred via a timer rather than done here; delay is min(2*rtt, TCP_DONE_TIME)
- reset_timer(sk, TIME_DONE, min(sk->rtt * 2, TCP_DONE_TIME));
- }
- #endif
- }
该函数内部的数据包重新接收是通过传输层的函数 tcp_rcv 实现的(tcp协议)。
传输层——tcp_rcv函数
tcp_rcv 函数是一个很重要的函数,是tcp协议数据包处理的总中心,其内部涉及到tcp状态转换,建议结合tcp状态转换图理解该函数。
- /*
- * A TCP packet has arrived.
- */
- // NOTE: this is the general TCP receive routine, not something specific to connect():
- // segments for every connection state handled below pass through here.
- // release_sock() re-invokes it (through sk->prot->rcv, with redo=1) for packets parked on back_log;
- // presumably the network layer calls it with redo=0 for freshly received segments - confirm against the caller.
- /*
- Parameters:
- skb: the received packet; dev: device it arrived on (must be non-NULL); opt: IP options, forwarded to tcp_reset/tcp_conn_request/tcp_sequence
- daddr: stored into skb->saddr below; presumably the IP destination (i.e. local) address - confirm against the caller
- len: passed to tcp_check() and stored in skb->len; presumably TCP header plus payload length - confirm
- saddr: stored into skb->daddr below; presumably the IP source (remote) address; redo: 0 for a new packet, non-zero when replayed from back_log
- protocol: not referenced in the body of this function
- Returns 0 on every path except the reset paths, which return the result of tcp_std_reset().
- */
- int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
- unsigned long daddr, unsigned short len,
- unsigned long saddr, int redo, struct inet_protocol * protocol)
- {
- struct tcphdr *th;
- struct sock *sk;
- int syn_ok=0;
- // Sanity-check the arguments
- if (!skb)
- {
- printk("IMPOSSIBLE 1\n");
- return(0);
- }
- // A packet must have arrived on some device
- if (!dev)
- {
- printk("IMPOSSIBLE 2\n");
- return(0);
- }
- tcp_statistics.TcpInSegs++;
- // Only packets marked PACKET_HOST are processed; anything else is freed here
- if(skb->pkt_type!=PACKET_HOST)
- {
- kfree_skb(skb,FREE_READ);
- return(0);
- }
- th = skb->h.th;// TCP header of this packet
- /*
- * Find the socket.
- */
- // Look up the socket for this segment from the two ports and the two addresses
- // (get_sock is defined elsewhere; presumably it returns the best-matching local socket - confirm)
- sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
- /*
- * If this socket has got a reset it's to all intents and purposes
- * really dead. Count closed sockets as dead.
- *
- * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
- * simply drops data. This seems incorrect as a 'closed' TCP doesn't
- * exist so should cause resets as if the port was unreachable.
- */
- // A zapped socket, or one in TCP_CLOSE, is treated as if no socket had been found
- if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
- sk=NULL;// picked up by the sk==NULL checks below
- if (!redo) // redo==0: new packet, so validate it first
- {
- // Verify the TCP checksum; a non-zero result drops the packet
- if (tcp_check(th, len, saddr, daddr ))
- {
- skb->sk = NULL;
- kfree_skb(skb,FREE_READ);
- /*
- * We don't release the socket because it was
- * never marked in use.
- */
- return(0);
- }
- th->seq = ntohl(th->seq);// convert the sequence number to host byte order, in place
- /* See if we know about the socket. */
- // No usable socket: not found, zapped, or closed
- if (sk == NULL)// any of the cases above
- {
- /*
- * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
- */
- // Nothing here accepts this segment: answer through tcp_reset()
- // and discard the frame
- tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
- skb->sk = NULL;
- /*
- * Discard frame
- */
- kfree_skb(skb, FREE_READ);
- return(0);
- }
- // A usable socket exists
- // Initialise the skb bookkeeping fields
- skb->len = len;// length as passed in by the caller
- skb->acked = 0;
- skb->used = 0;
- skb->free = 0;
- skb->saddr = daddr;// note the swap: release_sock() passes skb->saddr as daddr and skb->daddr as saddr on replay
- skb->daddr = saddr;
- /* We may need to add it to the backlog here. */
- cli();
- if (sk->inuse) // socket is busy
- {
- skb_queue_tail(&sk->back_log, skb);// park the packet on back_log; release_sock() replays it later
- sti();
- return(0);
- }
- sk->inuse = 1;// otherwise claim the socket
- sti();
- }
- else// redo!=0: packet replayed from the back_log queue
- {
- if (sk==NULL) // same no-socket check as in the redo==0 branch
- {// answer through tcp_reset() and drop
- tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
- skb->sk = NULL;
- kfree_skb(skb, FREE_READ);
- return(0);
- }
- }
- // sk->prot is the socket's protocol operations table (release_sock() calls sk->prot->rcv)
- // NOTE(review): this path returns without freeing skb and without clearing sk->inuse
- if (!sk->prot)
- {
- printk("IMPOSSIBLE 3\n");
- return(0);
- }
- /*
- * Charge the memory to the socket.
- */
- // Check that the receive buffer has room for this packet:
- // processing continues only if sk->rmem_alloc + skb->mem_len < sk->rcvbuf
- // (rcvbuf is the limit, rmem_alloc the amount already charged)
- if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
- {
- // No room: drop the packet (presumably the peer retransmits it later,
- // by which time there may be space)
- kfree_skb(skb, FREE_READ);
- release_sock(sk);// drain back_log and unlock
- return(0);
- }
- // Room available: charge the packet to the socket
- skb->sk=sk;
- sk->rmem_alloc += skb->mem_len;// bytes now charged against the receive buffer
- /*
- * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
- * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
- * compatibility. We also set up variables more thoroughly [Karn notes in the
- * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
- */
- // Not ESTABLISHED: handle the connection set-up / tear-down states first
- // (compare with the TCP state transition diagram)
- if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
- {
- /*
- * Now deal with unusual cases.
- */
- // LISTEN: only connection requests (pure SYN segments) are of interest
- if(sk->state==TCP_LISTEN)
- {
- // A listening socket only reacts to a SYN without ACK/RST;
- // every other segment is discarded by the test below.
- // A segment with ACK set additionally gets a reset sent back
- if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
- tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
- /*
- * We don't care for RST, and non SYN are absorbed (old segments)
- * Broadcast/multicast SYN isn't allowed. Note - bug if you change the
- * netmask on a running connection it can go broadcast. Even Sun's have
- * this problem so I'm ignoring it
- */
- // Discard RST, non-SYN and ACK-bearing segments, and SYNs whose daddr ip_chk_addr() does not classify as IS_MYADDR
- if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * Guess we need to make a new socket up
- */
- // Here the segment is a SYN without ACK/RST addressed to us: a connection request.
- // Hand it to tcp_conn_request() (defined elsewhere);
- // presumably that creates and initialises the socket for the new connection - confirm.
- // The last argument is the initial sequence number obtained from tcp_init_seq()
- tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
- /*
- * Now we have several options: In theory there is nothing else
- * in the frame. KA9Q has an option to send data with the syn,
- * BSD accepts data with the syn up to the [to be] advertised window
- * and Solaris 2.1 gives you a protocol error. For now we just ignore
- * it, that fits the spec precisely and avoids incompatibilities. It
- * would be nice in future to drop through and process the data.
- */
- // Unlock the listening socket and replay anything queued on its back_log meanwhile
- release_sock(sk);
- return 0;
- }
- /* retransmitted SYN? */
- // A retransmitted SYN is simply dropped
- /* How is a retransmitted SYN recognised? By the last condition, th->seq+1 == sk->acked_seq.
- sk->acked_seq is the next sequence number expected from the peer; the SYN_SENT branch below sets it to
- th->seq+1 when it accepts a SYN (presumably the SYN_RECV socket is initialised the same way elsewhere - confirm).
- A SYN whose seq+1 equals acked_seq therefore carries the sequence number that was already accepted, i.e. it is a duplicate.
- */
- if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * SYN sent means we have to look for a suitable ack and either reset
- * for bad matches or go to connected
- */
- // SYN_SENT: within this function the socket moves on to ESTABLISHED (SYN+ACK received)
- // or to SYN_RECV (crossed SYN); see the TCP state transition diagram
- if(sk->state==TCP_SYN_SENT)
- {
- /* Crossed SYN or previous junk segment */
- // ACK set: the expected reply is SYN+ACK, which leads to ESTABLISHED
- if(th->ack)// second segment of the handshake should carry both SYN and ACK
- {
- /* We got an ack, but it's not a good ack */
- // Let tcp_ack() validate and process the acknowledgement.
- // tcp_ack() handles the ACK field for all phases (handshake, data transfer, close)
- // and returns 0 when the ack is not acceptable
- if(!tcp_ack(sk,th,saddr,len))// 0 => bad ack, handled in this block
- {
- /* Reset the ack - its an ack from a
- different connection [ th->rst is checked in tcp_reset()] */
- tcp_statistics.TcpAttemptFails++;
- tcp_reset(daddr, saddr, th,
- sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);// send a reset back
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return(0);
- }
- if(th->rst)// RST flag set
- return tcp_std_reset(sk,skb);// standard reset handling (defined elsewhere)
- if(!th->syn)// acceptable ack but no SYN: not the reply we are waiting for
- {
- /* A valid ack from a different connection
- start. Shouldn't happen but cover it */
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- syn_ok=1;// tells the th->syn test further down not to reset this connection /* Don't reset this connection for the syn */
- // We (the SYN_SENT side) received SYN+ACK and now answer with an ACK.
- // Receiving it is the second step of the three-way handshake; the ACK sent below is the third.
- // sk is the socket and th the TCP header of the segment; the sequence numbers recorded
- // from th below are what later segments are checked against
- sk->acked_seq=th->seq+1;// next sequence number expected from the peer: one past the SYN's sequence number
- // i.e. the sequence number that follows the SYN
- sk->fin_seq=th->seq;// fin_seq initialised to the sequence number of the peer's SYN
- // Send the ACK that lets the peer complete the connection (tcp_send_ack is defined elsewhere),
- // with sequence sk->sent_seq and acknowledging sk->acked_seq
- tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
- tcp_set_state(sk, TCP_ESTABLISHED);// the socket is now ESTABLISHED
- tcp_options(sk,th);// process the TCP options of the segment (presumably the MSS - confirm)
- sk->dummy_th.dest=th->source;// record the peer's port
- sk->copied_seq = sk->acked_seq;// copied_seq starts at the first sequence number to be received
- if(!sk->dead)
- {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- if(sk->max_window==0)// no window learnt yet: fall back to a floor of 32
- {
- sk->max_window = 32;
- sk->mss = min(sk->max_window, sk->mtu);
- }
- }
- else// ACK flag not set
- // SYN without ACK: possible simultaneous open, leading to SYN_RECV
- {
- // Both ends sent a SYN at the same time (crossed SYNs)
- /* See if SYN's cross. Drop if boring */
- // Must be a SYN and must not be an RST
- if(th->syn && !th->rst)
- {
- /* Crossed SYN's are fine - but talking to
- yourself is right out... */
- // Segment whose addresses and ports equal the socket's own saddr/daddr and source/dest: reset
- if(sk->saddr==saddr && sk->daddr==daddr &&
- sk->dummy_th.source==th->source &&
- sk->dummy_th.dest==th->dest)
- {
- tcp_statistics.TcpAttemptFails++;
- return tcp_std_reset(sk,skb);
- }
- // Legitimate crossed SYN: move from SYN_SENT to SYN_RECV
- tcp_set_state(sk,TCP_SYN_RECV);// simultaneous open
- /*
- * FIXME:
- * Must send SYN|ACK here
- (a SYN|ACK segment should be sent at this point)
- */
- }
- /* Discard junk segment */
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * SYN_RECV with data maybe.. drop through
- */
- goto rfc_step6;
- }
- /*
- * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
- * a more complex suggestion for fixing these reuse issues in RFC1644
- * but not yet ready for general use. Also see RFC1379.
- */
- #define BSD_TIME_WAIT
- #ifdef BSD_TIME_WAIT
- // A dead TIME_WAIT socket receives a new SYN (no RST) whose sequence number is after acked_seq:
- // close the old socket and hand the request over to a listening socket, if one is found
- if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
- after(th->seq, sk->acked_seq) && !th->rst)
- {
- long seq=sk->write_seq;// remember the old socket's write_seq; the new initial sequence number is derived from it
- if(sk->debug)
- printk("Doing a BSD time wait\n");
- tcp_statistics.TcpEstabResets++;
- sk->rmem_alloc -= skb->mem_len;// undo the charge made above; the skb is detached from this socket
- skb->sk = NULL;
- sk->err=ECONNRESET;// record ECONNRESET as the socket error
- tcp_set_state(sk, TCP_CLOSE);// old socket goes to CLOSE
- sk->shutdown = SHUTDOWN_MASK;// presumably both directions shut down - confirm the mask definition
- release_sock(sk);// unlock; since sk is dead and in TCP_CLOSE, release_sock() arms the TIME_DONE timer
- // Look the socket up again. get_sock() is defined elsewhere; presumably it now yields the
- // listening socket for this port, if any - confirm.
- // NOTE(review): nothing here excludes the just-closed socket from the lookup; only the LISTEN test below guards it
- sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
- if (sk && sk->state==TCP_LISTEN)// found a listening socket
- {
- sk->inuse=1;
- skb->sk = sk;
- sk->rmem_alloc += skb->mem_len;
- // Treat the SYN as a new connection request; initial sequence number is the old write_seq + 128000
- tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
- release_sock(sk);
- return 0;
- }
- kfree_skb(skb, FREE_READ);
- return 0;
- }
- #endif
- }
- /*
- * We are now in normal data flow (see the step list in the RFC)
- * Note most of these are inline now. I'll inline the lot when
- * I have time to test it hard and look at what gcc outputs
- */
- // Validate the segment's sequence number; a zero result from tcp_sequence() drops the segment
- if(!tcp_sequence(sk,th,len,opt,saddr,dev))
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- // RST segment
- if(th->rst)
- return tcp_std_reset(sk,skb);
- /*
- * !syn_ok is effectively the state test in RFC793.
- */
- // Unexpected SYN: reset the connection
- if(th->syn && !syn_ok)
- {
- tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
- return tcp_std_reset(sk,skb);
- }
- /*
- * Process the ACK
- */
- // Segment carries an ACK that tcp_ack() rejects
- if(th->ack && !tcp_ack(sk,th,saddr,len))
- {
- /*
- * Our three way handshake failed.
- */
- // While in SYN_RECV a rejected ack additionally triggers a reset
- if(sk->state==TCP_SYN_RECV)
- {
- tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
- }
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- rfc_step6: // from here on: processing of data carried by the segment /* I'll clean this up later */
- /*
- * Process urgent data
- */
- // Urgent data; a non-zero result drops the segment
- if(tcp_urg(sk, th, saddr, len))
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);
- return 0;
- }
- /*
- * Process the encapsulated data
- */
- // Ordinary data; a non-zero result drops the segment
- if(tcp_data(skb,sk, saddr, len))
- {
- kfree_skb(skb, FREE_READ);
- release_sock(sk);// unlock; release_sock() replays anything queued on back_log through sk->prot->rcv
- return 0;
- }
- /*
- * And done
- */
- release_sock(sk);
- return 0;
- }
上面函数调用了 tcp_conn_request 用来处理客户端的连接请求。该函数的调用需要满足两个前提条件,一是套接字处于监听状态,二是该数据包是一个SYN数据包,这就相当于是客户端向服务器端发出连接请求,等待服务器端处理。具体实现参见下篇博文 accept 剖析的最后部分。
此外,tcp_rcv 内部还调用了 tcp_ack 函数,这些函数比较大,其实 tcp_ack 函数内部实现基本上就是 tcp状态转换,对于进一步理解tcp协议转换很有帮助:
tcp_ack 函数:
tcp_ack 函数用于处理本地(本地既可以是服务器端,也可以是客户端,内核栈没有这两者的概念,所有数据传输函数都是通用,只有本地和远端的概念)接收到的ack数据包。对于使用tcp协议的套接字,在连接建立完成后,此后发送的每个数据包中ack标志位都被设置为1,所以一个ack数据包本身也可能包含数据,但数据部分则由专门的函数处理(tcp_data,tcp_urg),tcp_ack 函数将只对ack 标志位及其相关联字段进行处理。
首先既然是一个ack数据包,则表示本地发送的数据包已经成功被远端接收,这是远端发过来的确认包,则可以对重发队列中已得到ack的数据包进行释放。tcp协议不会将发送的数据包立即释放(除非人为设置free标志位,发送后立即释放),而是将其缓存在重发队列中,以防止中途丢失的问题,如果本地在一定时间内没有收到远端关于该数据包的ack数据包,则认为数据包中途丢失了,则重新发送该数据包,如果收到了ack包,则释放重发队列中对应的数据包,这就是tcp可靠性之一的超时重传策略。
从远端接受的任何数据包(包括ack数据包)都携带有发送端(远端)的信息,如远端的窗口大小,远端的地址(建立连接时将通过这个数据包设置本地套接字的远端地址信息)等,本地将对此窗口进行检查(对面的接收能力如何),从而决定是否将写队列中的相关数据包发送出去,或者将重发队列中部分数据包重新缓存到写队列中。
如果数据包的交换着重于状态的更新,即该数据包不是单纯的数据传输,而是建立连接或者关闭连接的数据包,那么本地套接字则根据它的当前状态进行相应的更新,收到合法(序列号正确)的ack数据包意味着本地套接字可以进入对应的下一个状态
如果数据包非法,则按非法情况处理。具体实现细节参见下面源码剖析(有部分代码本人理解尚不透彻,没有贴出注释,以免误导)。
- /*
- * This routine deals with incoming acks, but not outgoing ones.
- */
- // Process the acknowledgement information carried by a received segment
- /*
- sk: local socket; th: TCP header of the received segment; saddr: not referenced in this function body; len: segment length, compared with th->doff*4 to detect payload
- Returns 0 only when the ack is after sk->sent_seq; returns 1 on every other path.
- */
- extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
- {
- unsigned long ack;
- int flag = 0;
- /*
- * 1 - there was data in packet as well as ack or new data is sent or
- * in shutdown state
- * 2 - data from retransmit queue was acked and removed
- * 4 - window shrunk or data from retransmit queue was acked and removed
- */
- // sk->zapped set: the socket is dead (tcp_rcv() treats zapped sockets as reset), so there is nothing to process
- if(sk->zapped)
- return(1); /* Dead, cant ack any more so why bother */
- /*
- * Have we discovered a larger window
- */
- // The header carries the peer's acknowledgement number and advertised window
- ack = ntohl(th->ack_seq);// acknowledgement number in host byte order
- // Track the largest window the peer has advertised so far and derive mss from it
- if (ntohs(th->window) > sk->max_window)
- {
- sk->max_window = ntohs(th->window);
- #ifdef CONFIG_INET_PCTCP
- /* Hack because we don't send partial packets to non SWS
- handling hosts */
- sk->mss = min(sk->max_window>>1, sk->mtu);
- #else
- sk->mss = min(sk->max_window, sk->mtu);
- #endif
- }
- /*
- * We have dropped back to keepalive timeouts. Thus we have
- * no retransmits pending.
- */
- // The keepalive timeout is the active one (ip_xmit_timeout == TIME_KEEPOPEN),
- // so, per the comment above, no retransmits are pending
- if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
- sk->retransmits = 0;// clear the retransmit counter
- /*
- * If the ack is newer than sent or older than previous acks
- * then we can probably ignore it.
- */
- // after() and before() are sequence-number comparisons (defined elsewhere)
- // sent_seq: upper bound for an acceptable ack ("newer than sent" is rejected)
- // rcv_ack_seq: highest ack received so far (updated further down)
- // ack: acknowledgement number taken from this segment
- if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
- {// ack lies outside [rcv_ack_seq, sent_seq]: ignore it
- if(sk->debug)
- printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);
- /*
- * Keepalive processing.
- */
- if (after(ack, sk->sent_seq))
- {
- return(0);
- }
- /*
- * Restart the keepalive timer.
- */
- if (sk->keepopen)
- {
- if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
- reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- }
- return(1);
- }
- /*
- * If there is data set flag 1
- */
- // len differs from the header length (doff*4): the segment carries more than a bare header
- if (len != th->doff*4)
- flag |= 1;
- /*
- * See if our window has been shrunk.
- */
- // The peer's new right window edge (ack + window) lies before our recorded window_seq: the window shrank
- if (after(sk->window_seq, ack+ntohs(th->window)))
- {
- /*
- * We may need to move packets from the send queue
- * to the write queue, if the window has been shrunk on us.
- * The RFC says you are not allowed to shrink your window
- * like this, but if the other end does, you must be able
- * to deal with it.
- */
- struct sk_buff *skb;
- struct sk_buff *skb2;
- struct sk_buff *wskb = NULL;
- // Detach the whole retransmit list (send_head..send_tail, chained through link3); it is rebuilt below
- skb2 = sk->send_head;
- sk->send_head = NULL;
- sk->send_tail = NULL;
- /*
- * This is an artifact of a flawed concept. We want one
- * queue and a smarter send routine when we send all.
- */
- flag |= 4; /* Window changed */
- // Record the new right window edge
- sk->window_seq = ack + ntohs(th->window);
- cli();
- // Walk the detached list: packets whose sequence number is after the new window edge
- // go back to the write queue; all others are re-linked onto the retransmit list
- while (skb2 != NULL)
- {
- skb = skb2;
- skb2 = skb->link3;
- skb->link3 = NULL;
- // Is this packet beyond the new window edge?
- if (after(skb->h.seq, sk->window_seq))
- {// yes: move it from the retransmit list to the write queue
- if (sk->packets_out > 0)
- sk->packets_out--;
- /* We may need to remove this from the dev send list. */
- if (skb->next != NULL)
- {
- skb_unlink(skb);
- }
- /* Now add it to the write_queue. */
- if (wskb == NULL)
- skb_queue_head(&sk->write_queue,skb);// first moved packet goes to the head of the write queue
- else
- skb_append(wskb,skb);
- wskb = skb;
- }
- else
- {
- if (sk->send_head == NULL)
- {
- sk->send_head = skb;
- sk->send_tail = skb;
- }
- else
- {
- sk->send_tail->link3 = skb;
- sk->send_tail = skb;
- }
- skb->link3 = NULL;
- }
- }
- sti();
- }
- /*
- * Pipe has emptied
- */
- // Retransmit list is empty: normalise head/tail and the in-flight counter
- if (sk->send_tail == NULL || sk->send_head == NULL)
- {
- sk->send_head = NULL;
- sk->send_tail = NULL;
- sk->packets_out= 0;
- }
- /*
- * Update the right hand window edge of the host
- */
- // Unconditional update of the right window edge (the earlier update only runs when the window shrank)
- sk->window_seq = ack + ntohs(th->window);
- /*
- * We don't want too many packets out there.
- */
- // Congestion window growth: only while the TIME_WRITE timeout is active, cong_window < 2048 and the ack advances rcv_ack_seq
- if (sk->ip_xmit_timeout == TIME_WRITE &&
- sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
- {
- /*
- * This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328. Because we keep cong_window in integral
- * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
- * counter and increment it once every cwnd times. It's possible
- * that this should be done only if sk->retransmits == 0. I'm
- * interpreting "new data is acked" as including data that has
- * been retransmitted but is just now being acked.
- */
- // cong_window is later compared with packets_out to limit how many packets are in flight
- if (sk->cong_window < sk->ssthresh)
- /*
- * In "safe" area, increase
- */
- sk->cong_window++;
- else
- {
- /*
- * In dangerous area, increase slowly. In theory this is
- * sk->cong_window += 1 / sk->cong_window
- */
- if (sk->cong_count >= sk->cong_window)
- {
- sk->cong_window++;
- sk->cong_count = 0;
- }
- else
- sk->cong_count++;
- }
- }
- /*
- * Remember the highest ack received.
- */
- sk->rcv_ack_seq = ack;// highest acknowledgement number seen so far
- /*
- * If this ack opens up a zero window, clear backoff. It was
- * being used to time the probes, and is probably far higher than
- * it needs to be for normal retransmission.
- */
- // The zero-window probe timeout (TIME_PROBE0) was the active one
- if (sk->ip_xmit_timeout == TIME_PROBE0)
- {
- sk->retransmits = 0; /* Our probe was answered */
- /*
- * Was it a usable window open ?
- */
- if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
- ! before (sk->window_seq, sk->write_queue.next->h.seq))
- {
- sk->backoff = 0;
- /*
- * Recompute rto from rtt. this eliminates any backoff.
- */
- sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
- .2 of a second because of BSD delayed acks - on a 100Mb/sec link
- .2 of a second is going to need huge windows (SIGH) */
- sk->rto = 20;
- }
- }
- /*
- * See if we can take anything off of the retransmit queue.
- */
- // Free acknowledged packets from the head of the retransmit list
- while(sk->send_head != NULL)
- {
- /* Check for a bug. */
- // The retransmit list is expected to be ordered by sequence number
- if (sk->send_head->link3 &&
- after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
- printk("INET: tcp.c: *** bug send_list out of order.\n");
- /*
- * If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived the other end.
- */
- if (before(sk->send_head->h.seq, ack+1))
- {
- struct sk_buff *oskb;
- if (sk->retransmits)
- {
- /*
- * We were retransmitting. don't count this in RTT est
- */
- flag |= 2;
- /*
- * even though we've gotten an ack, we're still
- * retransmitting as long as we're sending from
- * the retransmit queue. Keeping retransmits non-zero
- * prevents us from getting new data interspersed with
- * retransmissions.
- */
- if (sk->send_head->link3) /* Any more queued retransmits? */
- sk->retransmits = 1;
- else
- sk->retransmits = 0;
- }
- /*
- * Note that we only reset backoff and rto in the
- * rtt recomputation code. And that doesn't happen
- * if there were retransmissions in effect. So the
- * first new packet after the retransmissions is
- * sent with the backoff still in effect. Not until
- * we get an ack from a non-retransmitted packet do
- * we reset the backoff and rto. This allows us to deal
- * with a situation where the network delay has increased
- * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
- */
- /*
- * We have one less packet out there.
- */
- if (sk->packets_out > 0)
- sk->packets_out --;
- /*
- * Wake up the process, it can probably write more.
- */
- if (!sk->dead)
- sk->write_space(sk);
- oskb = sk->send_head;
- if (!(flag&2)) /* Not retransmitting */
- {
- long m;
- /*
- * The following amusing code comes from Jacobson's
- * article in SIGCOMM '88. Note that rtt and mdev
- * are scaled versions of rtt and mean deviation.
- * This is designed to be as fast as possible
- * m stands for "measurement".
- */
- m = jiffies - oskb->when; /* RTT */
- if(m<=0)
- m=1; /* IS THIS RIGHT FOR <0 ??? */
- m -= (sk->rtt >> 3); /* m is now error in rtt est */
- sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
- if (m < 0)
- m = -m; /* m is now abs(error) */
- m -= (sk->mdev >> 2); /* similar update on mdev */
- sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
- /*
- * Now update timeout. Note that this removes any backoff.
- */
- sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
- sk->rto = 20;
- sk->backoff = 0;
- }
- flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
- In this case as we just set it up */
- cli();
- oskb = sk->send_head;
- IS_SKB(oskb);
- sk->send_head = oskb->link3;
- if (sk->send_head == NULL)
- {
- sk->send_tail = NULL;
- }
- /*
- * We may need to remove this from the dev send list.
- */
- if (oskb->next)
- skb_unlink(oskb);
- sti();
- kfree_skb(oskb, FREE_WRITE); /* write. */
- if (!sk->dead)
- sk->write_space(sk);
- }
- else
- {
- break;
- }
- }
- /*
- * XXX someone ought to look at this too.. at the moment, if skb_peek()
- * returns non-NULL, we complete ignore the timer stuff in the else
- * clause. We ought to organize the code so that else clause can
- * (should) be executed regardless, possibly moving the PROBE timer
- * reset over. The skb_peek() thing should only move stuff to the
- * write queue, NOT also manage the timer functions.
- */
- /*
- * Maybe we can take some stuff off of the write queue,
- * and put it onto the xmit queue.
- */
- // Anything waiting on the write queue?
- if (skb_peek(&sk->write_queue) != NULL)
- {
- // First queued packet lies within the window, and the retransmit and congestion-window conditions allow sending?
- if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
- (sk->retransmits == 0 ||
- sk->ip_xmit_timeout != TIME_WRITE ||
- before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
- && sk->packets_out < sk->cong_window)
- {
- /*
- * Add more data to the send queue.
- */
- flag |= 1;
- tcp_write_xmit(sk);
- }
- else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
- sk->send_head == NULL &&
- sk->ack_backlog == 0 &&
- sk->state != TCP_TIME_WAIT)
- {
- /*
- * Data to queue but no room.
- */
- reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
- }
- }
- else
- {
- /*
- * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
- * from TCP_CLOSE we don't do anything
- *
- * from anything else, if there is write data (or fin) pending,
- * we use a TIME_WRITE timeout, else if keepalive we reset to
- * a KEEPALIVE timeout, else we delete the timer.
- *
- * We do not set flag for nominal write data, otherwise we may
- * force a state where we start to write itsy bitsy tidbits
- * of data.
- */
- // Write queue empty: choose the timer according to the connection state
- switch(sk->state) {
- case TCP_TIME_WAIT:
- /*
- * keep us in TIME_WAIT until we stop getting packets,
- * reset the timeout.
- */
- // restart the TIME_WAIT timer (TIME_CLOSE, TCP_TIMEWAIT_LEN)
- reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
- case TCP_CLOSE:
- /*
- * don't touch the timer.
- */
- break;
- default:
- /*
- * Must check send_head, write_queue, and ack_backlog
- * to determine which timeout to use.
- */
- if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- } else if (sk->keepopen) {
- reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- } else {
- del_timer(&sk->retransmit_timer);
- sk->ip_xmit_timeout = 0;
- }
- break;
- }
- }
- /*
- * We have nothing queued but space to send. Send any partial
- * packets immediately (end of Nagle rule application).
- */
- if (sk->packets_out == 0 && sk->partial != NULL &&
- skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
- {
- flag |= 1;
- tcp_send_partial(sk);
- }
- // State transitions driven by this ack follow; compare with the TCP state transition diagram
- /*
- * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
- * we are now waiting for an acknowledge to our FIN. The other end is
- * already in TIME_WAIT.
- *
- * Move to TCP_CLOSE on success.
- */
- // LAST_ACK: per the comment above, the peer closed first and we have sent our FIN;
- // once everything written has been acknowledged the socket moves to CLOSE
- if (sk->state == TCP_LAST_ACK)
- {
- if (!sk->dead)
- sk->state_change(sk);
- if(sk->debug)
- printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
- sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
- // Sequence-number check:
- // rcv_ack_seq was set to ack above;
- // equality with write_seq means everything written (presumably including the FIN - confirm) is acknowledged
- if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
- {
- flag |= 1;
- tcp_set_state(sk,TCP_CLOSE);// move to CLOSE
- sk->shutdown = SHUTDOWN_MASK;// mark the socket shut down with the full mask
- }
- }
- /*
- * Incoming ACK to a FIN we sent in the case of our initiating the close.
- *
- * Move to FIN_WAIT2 to await a FIN from the other end. Set
- * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
- */
- // Active close, ack of our FIN
- // FIN_WAIT1: we closed first and are waiting for the ack of our FIN, which moves us to FIN_WAIT2
- if (sk->state == TCP_FIN_WAIT1)
- {
- if (!sk->dead)
- sk->state_change(sk);
- // everything written has been acknowledged
- if (sk->rcv_ack_seq == sk->write_seq)
- {
- flag |= 1;
- sk->shutdown |= SEND_SHUTDOWN;// only the send side is shut down
- tcp_set_state(sk, TCP_FIN_WAIT2);// state transition
- // In FIN_WAIT2 data can still arrive (RCV_SHUTDOWN is not set);
- // per the comment above we now await the peer's FIN
- }
- }
- /*
- * Incoming ACK to a FIN we sent in the case of a simultaneous close.
- *
- * Move to TIME_WAIT
- */
- // CLOSING: simultaneous close, both ends sent a FIN
- if (sk->state == TCP_CLOSING)
- {
- if (!sk->dead)
- sk->state_change(sk);
- if (sk->rcv_ack_seq == sk->write_seq)
- {
- flag |= 1;
- tcp_time_wait(sk);// enter TIME_WAIT (tcp_time_wait is defined elsewhere)
- }
- }
- /*
- * Final ack of a three way shake
- */
- // Completion of the three-way handshake
- // SYN_RECV: the socket is waiting for the ack that completes the handshake;
- // this point is only reached by acks that passed the range check near the top
- if(sk->state==TCP_SYN_RECV)
- {
- // No further sequence check here:
- // the ack has already been accepted by the range check above
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_options(sk,th);// process TCP options (presumably the MSS - confirm)
- sk->dummy_th.dest=th->source;// record the peer's port
- sk->copied_seq = sk->acked_seq;// copied_seq starts at the next expected receive sequence number
- if(!sk->dead)
- sk->state_change(sk);
- if(sk->max_window==0)
- {
- sk->max_window=32; /* Sanity check */
- sk->mss=min(sk->max_window,sk->mtu);
- }
- }
- /*
- * I make no guarantees about the first clause in the following
- * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
- * what conditions "!flag" would be true. However I think the rest
- * of the conditions would prevent that from causing any
- * unnecessary retransmission.
- * Clearly if the first packet has expired it should be
- * retransmitted. The other alternative, "flag&2 && retransmits", is
- * harder to explain: You have to look carefully at how and when the
- * timer is set and with what timeout. The most recent transmission always
- * sets the timer. So in general if the most recent thing has timed
- * out, everything before it has as well. So we want to go ahead and
- * retransmit some more. If we didn't explicitly test for this
- * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
- * would not be true. If you look at the pattern of timing, you can
- * show that rto is increased fast enough that the next packet would
- * almost never be retransmitted immediately. Then you'd end up
- * waiting for a timeout to send each packet on the retransmission
- * queue. With my implementation of the Karn sampling algorithm,
- * the timeout would double each time. The net result is that it would
- * take a hideous amount of time to recover from a single dropped packet.
- * It's possible that there should also be a test for TIME_WRITE, but
- * I think as long as "send_head != NULL" and "retransmit" is on, we've
- * got to be in real retransmission mode.
- * Note that tcp_do_retransmit is called with all==1. Setting cong_window
- * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
- * As long as no further losses occur, this seems reasonable.
- */
- // Retransmit if warranted (conditions are explained in the long comment above)
- if (((!flag) || (flag&4)) && sk->send_head != NULL &&
- (((flag&2) && sk->retransmits) ||
- (sk->send_head->when + sk->rto < jiffies)))
- {
- if(sk->send_head->when + sk->rto < jiffies)
- tcp_retransmit(sk,0);
- else
- {
- tcp_do_retransmit(sk, 1);
- reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- }
- }
- return(1);
- }
为什么有专门的接收到ack数据包处理,而没有专门的接收到syn数据包处理呢?这是因为在数据传输阶段,tcp为确保数据的可靠性传输,通信双方在接收到对方的数据包后,还需要回送一个确认应答数据包,表示已经接收到对端的数据包,对端可以释放重发队列中的数据包了,如果对端在一段固定时间后没有收到这边回送的确认数据包,那么对端将开始数据超时重传操作,重发数据包。
数据超时重传和数据确认应答以及对每个传输的字节分配序列号是TCP协议提供可靠性数据传输的核心本质。