TCP/IP源码学习(52)——TCP的连接过程的实现(1)
http://blog.chinaunix.net/uid-23629988-id-3178006.html
作者:gfree.wind@gmail.com
博客:blog.focus-linux.net linuxfocus.blog.chinaunix.net
博客:blog.focus-linux.net linuxfocus.blog.chinaunix.net
本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
======================================================================================================
在以前的文章中,学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接受。与UDP数据包类似,当IP数据包到达ip_local_deliver_finish函数时,根据四层协议从inet_protos数组中得到TCP协议对应的tcp_protocol。
static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
那么TCP数据包的接收函数入口即为tcp_v4_rcv
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
int ret;
struct net *net = dev_net(skb->dev);
/* 检测该包是否为发给本机的 */
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
/* 检查包长至少比TCP的首部长 */
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = tcp_hdr(skb);
/* 检查TCP首部 */
if (th->doff < sizeof(struct tcphdr) / 4)
goto bad_packet;
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
goto bad_packet;
/* 将sequence,ack等保存到socket的TCP控制块中 */
th = tcp_hdr(skb);
iph = ip_hdr(skb);
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->when = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
/*
通过源IP,目的IP,源端口,目的端口,和接收到的interface来查找socket。
这里一共涉及两个hash表,一个是保存已连接TCP session,一个是处于listening的TCP session
关于这两个hash,以后再分析。
*/
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
process:
/* TIME_WAIT的处理,以后再学习 */
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
/* IPsec的检查 */
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
/* socket filter没有用过。。。 */
if (sk_filter(sk, skb))
goto discard_and_relse;
skb->dev = NULL;
bh_lock_sock_nested(sk);
ret = 0;
/*
检查该socket是否由当前执行上下文拥有,如果是,可以继续处理该skb,
如果不是,那么就将skb加到当前socket的sk_backlog上。
这样的处理与UDP不同,因为TCP是有内部状态的,当处理一个TCP报文的时候,在中间又处理另外一个TCP报文的 时候,可能会改变TCP的状态,导致被打断的TCP报文处理失败。
这里保证TCP的一个报文处理不会被打断
*/
if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
#endif
{
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
} else if (unlikely(sk_add_backlog(sk, skb))) {
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
...... ......
进入tcp_v4_do_rcv
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
/* 该TCP处于已连接状态,留作以后学习 */
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
return 0;
}
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
/*
处理TCP request包,即请求连接本机TCP端口的TCP报文,并返回应处理该skb的socket。
对于第一个sync包,返回的nsk就是sk。
*/
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
/* 如前面所说,对于第一个sync包,nsk就是sk,于是继续往下执行 */
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
return 0;
...... ......
}
进入tcp_rcv_state_process
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
int res;
tp->rx_opt.saw_tstamp = 0;
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
case TCP_LISTEN:
/* 本文的重点,第一个sync包会到这里 */
/* 非法的TCP包,LISTEN状态只处理sync包 */
if (th->ack)
return 1;
if (th->rst)
goto discard;
if (th->syn) {
/* 第一个syn包 */
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
goto discard;
...... ......
...... ......
}
对于IPv4的TCP数据包,conn_request为tcp_v4_conn_request
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
const u8 *hash_location;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = NULL;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
int want_cookie = 0;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
//检查syn queue是否已满,即request queue是否已满
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
/* 是否使用sync cookie */
want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
if (!want_cookie)
goto drop;
}
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
//检查accept queue是否已满
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
//申请一个新的request_sock
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif
//解析TCP的option
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
if (tmp_opt.cookie_plus > 0 &&
tmp_opt.saw_tstamp &&
!tp->rx_opt.cookie_out_never &&
(sysctl_tcp_cookie_size > 0 ||
(tp->cookie_values != NULL &&
tp->cookie_values->cookie_desired > 0))) {
/*
不太确定这部分代码的用途,看上去跟sync cookie相关
貌似是为了检查sync-cookie。
*/
u8 *c;
u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
goto drop_and_release;
/* Secret recipe starts with IP addresses */
*mess++ ^= (__force u32)daddr;
*mess++ ^= (__force u32)saddr;
/* plus variable length Initiator Cookie */
c = (u8 *)mess;
while (l-- > 0)
*c++ ^= *hash_location++;
want_cookie = 0; /* not our kind of cookie */
tmp_ext.cookie_out_never = 0; /* false */
tmp_ext.cookie_plus = tmp_opt.cookie_plus;
} else if (!tp->rx_opt.cookie_in_always) {
/* redundant indications, but ensure initialization. */
tmp_ext.cookie_out_never = 1; /* true */
tmp_ext.cookie_plus = 0;
} else {
goto drop_and_release;
}
tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
TCP_ECN_create_request(req, tcp_hdr(skb));
if (want_cookie) {
/* 生成sync cookie使用的Initial sequence numnber */
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
struct inet_peer *peer = NULL;
struct flowi4 fl4;
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
* accepting new connection request.
*
* If "isn" is not zero, this request hit alive
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
/* 还是不懂这块的检查是为了什么。。。*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
fl4.daddr == saddr &&
(peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
&saddr, ntohs(tcp_hdr(skb)->source));
goto drop_and_release;
}
/* 生成Initial Sequence Number */
isn = tcp_v4_init_sequence(skb);
}
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->snt_synack = tcp_time_stamp;
/* 回复syn+ack包 */
if (tcp_v4_send_synack(sk, dst, req,
(struct request_values *)&tmp_ext) ||
want_cookie)
goto drop_and_free;
/* 将该request_sock添加到父socket的icsk_accept_queue中的listen_opt上 */
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
return 0;
drop_and_release:
dst_release(dst);
drop_and_free:
reqsk_free(req);
drop:
return 0;
}
今天仅仅学习了一下TCP处理第一个sync包的过程,就发现了很多不明白的地方,还需要继续努力啊。争取早日把TCP的这些细节搞懂。