
I had never quite worked out how the three queues involved in TCP receive processing relate to each other.

Over the past few days, with the help of my colleague Qiu, I finally sorted it out, so I'm writing it down here as a note.

1. Queuing packets in softirq context

tcp_v4_rcv() is the entry point for packet reception at the TCP layer.

1615 int tcp_v4_rcv(struct sk_buff *skb)
1616 {
1617         const struct iphdr *iph;
1618         struct tcphdr *th;
1619         struct sock *sk;
1620         int ret;
1621         struct net *net = dev_net(skb->dev);
1622 
1623         if (skb->pkt_type != PACKET_HOST)
1624                 goto discard_it;
1625 
1626         /* Count it even if it's bad */
1627         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1628 
1629         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1630                 goto discard_it;
1631 
1632         th = tcp_hdr(skb);
1633 
1634         if (th->doff < sizeof(struct tcphdr) / 4)
1635                 goto bad_packet;
1636         if (!pskb_may_pull(skb, th->doff * 4))
1637                 goto discard_it;
1638 
1639         /* An explanation is required here, I think.
1640          * Packet length and doff are validated by header prediction,
1641          * provided case of th->doff==0 is eliminated.
1642          * So, we defer the checks. */
1643         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1644                 goto bad_packet;
1645 
1646         th = tcp_hdr(skb);
1647         iph = ip_hdr(skb);
1648         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1649         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1650                                     skb->len - th->doff * 4);
1651         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1652         TCP_SKB_CB(skb)->when    = 0;
1653         TCP_SKB_CB(skb)->flags   = iph->tos;
1654         TCP_SKB_CB(skb)->sacked  = 0;
1655 
1656         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1657         if (!sk)
1658                 goto no_tcp_socket;
1659 
1660 process:
1661         if (sk->sk_state == TCP_TIME_WAIT)
1662                 goto do_time_wait;
1663 
1664         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1666                 goto discard_and_relse;
1667         }
1668 
1669         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670                 goto discard_and_relse;
1671         nf_reset(skb);
1672 
1673         if (sk_filter(sk, skb))
1674                 goto discard_and_relse;
1675 
1676         skb->dev = NULL;
1677 
1678         bh_lock_sock_nested(sk);
1679         ret = 0;
1680         if (!sock_owned_by_user(sk)) {
1681 #ifdef CONFIG_NET_DMA
1682                 struct tcp_sock *tp = tcp_sk(sk);
1683                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1684                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1685                 if (tp->ucopy.dma_chan)
1686                         ret = tcp_v4_do_rcv(sk, skb);
1687                 else
1688 #endif
1689                 {
1690                         if (!tcp_prequeue(sk, skb))   // try to queue on the prequeue first
1691                                 ret = tcp_v4_do_rcv(sk, skb);  // otherwise process now; data lands on sk_receive_queue
1692                 }
1693         } else if (unlikely(sk_add_backlog(sk, skb))) {  // socket owned by a user task: add to the backlog
1694                 bh_unlock_sock(sk);
1695                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1696                 goto discard_and_relse;
1697         }
1698         bh_unlock_sock(sk);
1699 
1700         sock_put(sk);
1701 
1702         return ret;
1703 
1704 no_tcp_socket:
1705         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1706                 goto discard_it;
1707 
1708         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1709 bad_packet:
1710                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1711         } else {
1712                 tcp_v4_send_reset(NULL, skb);
1713         }
1714 
1715 discard_it:
1716         /* Discard frame. */
1717         kfree_skb(skb);
1718         return 0;
1719 
1720 discard_and_relse:
1721         sock_put(sk);
1722         goto discard_it;
1723 
1724 do_time_wait:
1725         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1726                 inet_twsk_put(inet_twsk(sk));
1727                 goto discard_it;
1728         }
1729 
1730         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1731                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1732                 inet_twsk_put(inet_twsk(sk));
1733                 goto discard_it;
1734         }
1735         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1736         case TCP_TW_SYN: {
1737                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1738                                                         &tcp_hashinfo,
1739                                                         iph->daddr, th->dest,
1740                                                         inet_iif(skb));
1741                 if (sk2) {
1742                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743                         inet_twsk_put(inet_twsk(sk));
1744                         sk = sk2;
1745                         goto process;
1746                 }
1747                 /* Fall through to ACK */
1748         }
1749         case TCP_TW_ACK:
1750                 tcp_v4_timewait_ack(sk, skb);
1751                 break;
1752         case TCP_TW_RST:
1753                 goto no_tcp_socket;
1754         case TCP_TW_SUCCESS:;
1755         }
1756         goto discard_it;
1757 }
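Before moving on, it may help to condense the queue-selection logic at the bottom of tcp_v4_rcv() into one place. The following is a minimal userspace sketch, not kernel code: it ignores the NET_DMA branch and the prequeue-overflow case, and the parameter names only echo the kernel fields they stand for.

#include <stdbool.h>
#include <stdio.h>

/* The three TCP receive queues, modelled as an enum (stand-in, not a kernel type). */
enum rx_queue { RX_PREQUEUE, RX_RECEIVE_QUEUE, RX_BACKLOG };

/*
 * Mirrors the branch at the bottom of tcp_v4_rcv():
 *  - socket not locked by a user task -> try tcp_prequeue(); if the prequeue
 *    refuses (no blocked reader, or tcp_low_latency set), tcp_v4_do_rcv()
 *    runs immediately and the segment ends up on sk_receive_queue;
 *  - socket locked by a user task     -> sk_add_backlog().
 */
static enum rx_queue classify_segment(bool owned_by_user,
                                      bool reader_waiting,   /* tp->ucopy.task set     */
                                      bool low_latency)      /* sysctl_tcp_low_latency */
{
    if (owned_by_user)
        return RX_BACKLOG;
    if (!low_latency && reader_waiting)
        return RX_PREQUEUE;
    return RX_RECEIVE_QUEUE;
}

int main(void)
{
    printf("reader blocked, socket unlocked -> %d (prequeue)\n",
           classify_segment(false, true, false));
    printf("no reader,      socket unlocked -> %d (sk_receive_queue)\n",
           classify_segment(false, false, false));
    printf("socket locked by user           -> %d (backlog)\n",
           classify_segment(true, true, false));
    return 0;
}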

Tracing into tcp_prequeue():

920 static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
921 {
922         struct tcp_sock *tp = tcp_sk(sk);
923 
924         if (sysctl_tcp_low_latency || !tp->ucopy.task)
925                 return 0;
926 
927         __skb_queue_tail(&tp->ucopy.prequeue, skb);
928         tp->ucopy.memory += skb->truesize;
929         if (tp->ucopy.memory > sk->sk_rcvbuf) {  // the prequeue has used more memory than the receive buffer allows
930                 struct sk_buff *skb1;
931 
932                 BUG_ON(sock_owned_by_user(sk));
933 
934                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { // walk every packet on the prequeue
935                         sk_backlog_rcv(sk, skb1);  // process it now, which moves it onto sk_receive_queue
936                         NET_INC_STATS_BH(sock_net(sk),
937                                          LINUX_MIB_TCPPREQUEUEDROPPED);
938                 }
939 
940                 tp->ucopy.memory = 0;
941         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
942                 wake_up_interruptible_sync_poll(sk->sk_sleep,
943                                            POLLIN | POLLRDNORM | POLLRDBAND);
944                 if (!inet_csk_ack_scheduled(sk))
945                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
946                                                   (3 * tcp_rto_min(sk)) / 4,
947                                                   TCP_RTO_MAX);
948         }
949         return 1;
950 }

 

 

618 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
619 {
620         return sk->sk_backlog_rcv(sk, skb);  // for IPv4 TCP this callback is tcp_v4_do_rcv()
621 }

 

1546 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1547 {
1548         struct sock *rsk;
1549 #ifdef CONFIG_TCP_MD5SIG
1550         /*
1551          * We really want to reject the packet as early as possible
1552          * if:
1553          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1554          *  o There is an MD5 option and we're not expecting one
1555          */
1556         if (tcp_v4_inbound_md5_hash(sk, skb))
1557                 goto discard;
1558 #endif
1559 
1560         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path: the connection is already established */
1561                 TCP_CHECK_TIMER(sk);
1562                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {  // the main receive routine
1563                         rsk = sk;
1564                         goto reset;
1565                 }
1566                 TCP_CHECK_TIMER(sk);
1567                 return 0;
1568         }
1569 
1570         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1571                 goto csum_err;
1572 
1573         if (sk->sk_state == TCP_LISTEN) {
1574                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1575                 if (!nsk)
1576                         goto discard;
1577 
1578                 if (nsk != sk) {
1579                         if (tcp_child_process(sk, nsk, skb)) {
1580                                 rsk = nsk;
1581                                 goto reset;
1582                         }
1583                         return 0;
1584                 }
1585         }
1586 
1587         TCP_CHECK_TIMER(sk);
1588         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589                 rsk = sk;
1590                 goto reset;
1591         }
1592         TCP_CHECK_TIMER(sk);
1593         return 0;
1594 
1595 reset:
1596         tcp_v4_send_reset(rsk, skb);
1597 discard:
1598         kfree_skb(skb);
1599         /* Be careful here. If this function gets more complicated and
1600          * gcc suffers from register pressure on the x86, sk (in %ebx)
1601          * might be destroyed here. This current version compiles correctly,
1602          * but you have been warned.
1603          */
1604         return 0;
1605 
1606 csum_err:
1607         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1608         goto discard;
1609 }

 

5224 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5225                         struct tcphdr *th, unsigned len)
5226 {
5227         struct tcp_sock *tp = tcp_sk(sk);
5228         int res;
5229 
5230         /*
5231          *      Header prediction.
5232          *      The code loosely follows the one in the famous
5233          *      "30 instruction TCP receive" Van Jacobson mail.
5234          *
5235          *      Van's trick is to deposit buffers into socket queue
5236          *      on a device interrupt, to call tcp_recv function
5237          *      on the receive process context and checksum and copy
5238          *      the buffer to user space. smart...
5239          *
5240          *      Our current scheme is not silly either but we take the
5241          *      extra cost of the net_bh soft interrupt processing...
5242          *      We do checksum and copy also but from device to kernel.
5243          */
5244 
5245         tp->rx_opt.saw_tstamp = 0;
5246 
5247         /*      pred_flags is 0xS?10 << 16 + snd_wnd
5248          *      if header_prediction is to be made
5249          *      'S' will always be tp->tcp_header_len >> 2
5250          *      '?' will be 0 for the fast path, otherwise pred_flags is 0 to
5251          *  turn it off (when there are holes in the receive
5252          *       space for instance)
5253          *      PSH flag is ignored.
5254          */
5255 
5256         if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5257             TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5258             !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5259                 int tcp_header_len = tp->tcp_header_len;
5260 
5261                 /* Timestamp header prediction: tcp_header_len
5262                  * is automatically equal to th->doff*4 due to pred_flags
5263                  * match.
5264                  */
5265 
5266                 /* Check timestamp */
5267                 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
5268                         /* No? Slow path! */
5269                         if (!tcp_parse_aligned_timestamp(tp, th))
5270                                 goto slow_path;
5271 
5272                         /* If PAWS failed, check it more carefully in slow path */
5273                         if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
5274                                 goto slow_path;
5275 
5276                         /* DO NOT update ts_recent here, if checksum fails
5277                          * and timestamp was corrupted part, it will result
5278                          * in a hung connection since we will drop all
5279                          * future packets due to the PAWS test.
5280                          */
5281                 }
5282 
5283                 if (len <= tcp_header_len) {
5284                         /* Bulk data transfer: sender */
5285                         if (len == tcp_header_len) {
5286                                 /* Predicted packet is in window by definition.
5287                                  * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5288                                  * Hence, check seq<=rcv_wup reduces to:
5289                                  */
5290                                 if (tcp_header_len ==
5291                                     (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5292                                     tp->rcv_nxt == tp->rcv_wup)
5293                                         tcp_store_ts_recent(tp);
5294 
5295                                 /* We know that such packets are checksummed
5296                                  * on entry.
5297                                  */
5298                                 tcp_ack(sk, skb, 0);
5299                                 __kfree_skb(skb);
5300                                 tcp_data_snd_check(sk);
5301                                 return 0;
5302                         } else { /* Header too small */
5303                                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5304                                 goto discard;
5305                         }
5306                 } else {
5307                         int eaten = 0;
5308                         int copied_early = 0;
5309 
5310                         if (tp->copied_seq == tp->rcv_nxt &&
5311                             len - tcp_header_len <= tp->ucopy.len) {
5312 #ifdef CONFIG_NET_DMA
5313                                 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5314                                         copied_early = 1;
5315                                         eaten = 1;
5316                                 }
5317 #endif
5318                                 if (tp->ucopy.task == current &&
5319                                     sock_owned_by_user(sk) && !copied_early) {
5320                                         __set_current_state(TASK_RUNNING);
5321 
5322                                         if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
5323                                                 eaten = 1;
5324                                 }
5325                                 if (eaten) {
5326                                         /* Predicted packet is in window by definition.
5327                                          * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5328                                          * Hence, check seq<=rcv_wup reduces to:
5329                                          */
5330                                         if (tcp_header_len ==
5331                                             (sizeof(struct tcphdr) +
5332                                              TCPOLEN_TSTAMP_ALIGNED) &&
5333                                             tp->rcv_nxt == tp->rcv_wup)
5334                                                 tcp_store_ts_recent(tp);
5335 
5336                                         tcp_rcv_rtt_measure_ts(sk, skb);
5337 
5338                                         __skb_pull(skb, tcp_header_len);
5339                                         tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5340                                         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
5341                                 }
5342                                 if (copied_early)
5343                                         tcp_cleanup_rbuf(sk, skb->len);
5344                         }
5345                         if (!eaten) {
5346                                 if (tcp_checksum_complete_user(sk, skb))
5347                                         goto csum_error;
5348 
5349                                 /* Predicted packet is in window by definition.
5350                                  * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5351                                  * Hence, check seq<=rcv_wup reduces to:
5352                                  */
5353                                 if (tcp_header_len ==
5354                                     (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5355                                     tp->rcv_nxt == tp->rcv_wup)
5356                                         tcp_store_ts_recent(tp);
5357 
5358                                 tcp_rcv_rtt_measure_ts(sk, skb);
5359 
5360                                 if ((int)skb->truesize > sk->sk_forward_alloc)
5361                                         goto step5;
5362 
5363                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
5364 
5365                                 /* Bulk data transfer: receiver */
5366                                 __skb_pull(skb, tcp_header_len);
5367                 __skb_queue_tail(&sk->sk_receive_queue, skb);   // queue the segment on sk_receive_queue (this is how prequeue/backlog packets end up there)
5368                                 skb_set_owner_r(skb, sk);
5369                                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5370                         }
5371 
5372                         tcp_event_data_recv(sk, skb);
5373 
5374                         if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5375                                 /* Well, only one small jumplet in fast path... */
5376                                 tcp_ack(sk, skb, FLAG_DATA);
5377                                 tcp_data_snd_check(sk);
5378                                 if (!inet_csk_ack_scheduled(sk))
5379                                         goto no_ack;
5380                         }
5381 
5382                         if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
5383                                 __tcp_ack_snd_check(sk, 0);
5384 no_ack:
5385 #ifdef CONFIG_NET_DMA
5386                         if (copied_early)
5387                                 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
5388                         else
5389 #endif
5390                         if (eaten)
5391                                 __kfree_skb(skb);
5392                         else
5393                                 sk->sk_data_ready(sk, 0);
5394                         return 0;
5395                 }
5396         }
5397 
5398 slow_path:
5399         if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5400                 goto csum_error;
5401 
5402         /*
5403          *      Standard slow path.
5404          */
5405 
5406         res = tcp_validate_incoming(sk, skb, th, 1);
5407         if (res <= 0)
5408                 return -res;
5409 
5410 step5:
5411         if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
5412                 goto discard;
5413 
5414         tcp_rcv_rtt_measure_ts(sk, skb);
5415 
5416         /* Process urgent data. */
5417         tcp_urg(sk, skb, th);
5418 
5419         /* step 7: process the segment text */
5420         tcp_data_queue(sk, skb);
5421 
5422         tcp_data_snd_check(sk);
5423         tcp_ack_snd_check(sk);
5424         return 0;
5425 
5426 csum_error:
5427         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5428 
5429 discard:
5430         __kfree_skb(skb);
5431         return 0;
5432 }
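A detail worth highlighting in the fast path above: when the task blocked in tcp_recvmsg() currently owns the socket (the prequeue/backlog processing case) and the payload fits into the remaining user buffer, the data is copied straight into the user iovec (the skb is "eaten") and never touches sk_receive_queue; otherwise the segment is queued. A toy summary of that branch (hypothetical helper, not a kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Rough summary of the "eaten" decision in tcp_rcv_established()'s fast path. */
static const char *fast_path_target(bool reader_owns_sock,
                                    size_t payload_len,
                                    size_t user_bytes_left)
{
    if (reader_owns_sock && payload_len <= user_bytes_left)
        return "copied straight to the user iovec (skb freed)";
    return "queued on sk->sk_receive_queue";
}

int main(void)
{
    printf("reader owns socket, buffer large enough: %s\n",
           fast_path_target(true, 1460, 4096));
    printf("softirq path, no owning reader:          %s\n",
           fast_path_target(false, 1460, 4096));
    return 0;
}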


2. In process context

The user-space receive call recvmsg() eventually reaches tcp_recvmsg() in the kernel.
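For context, here is what the user-space side of this path can look like: a minimal blocking read loop on a connected TCP socket. The address and port are placeholders and error handling is trimmed; every recv() call here descends through the socket layer into tcp_recvmsg().

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in addr = { .sin_family = AF_INET,
                                .sin_port   = htons(8080) };  /* placeholder port */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);          /* placeholder address */

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }

    char buf[4096];
    ssize_t n;
    /* Each recv() ends up in the kernel's tcp_recvmsg(). */
    while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
        fwrite(buf, 1, (size_t)n, stdout);

    close(fd);
    return 0;
}

The kernel side of that call is tcp_recvmsg():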

1385  *      This routine copies from a sock struct into the user buffer.
1386  *
1387  *      Technical note: in 2.3 we work on _locked_ socket, so that
1388  *      tricks with *seq access order and skb->users are not required.
1389  *      Probably, code can be easily improved even more.
1390  */
1391 
1392 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1393                 size_t len, int nonblock, int flags, int *addr_len)
1394 {
1395         struct tcp_sock *tp = tcp_sk(sk);
1396         int copied = 0;
1397         u32 peek_seq;
1398         u32 *seq;
1399         unsigned long used;
1400         int err;
1401         int target;             /* Read at least this many bytes */
1402         long timeo;
1403         struct task_struct *user_recv = NULL;
1404         int copied_early = 0;
1405         struct sk_buff *skb;
1406         u32 urg_hole = 0;
1407 
1408         lock_sock(sk);
1409 
1410         TCP_CHECK_TIMER(sk);
1411 
1412         err = -ENOTCONN;
1413         if (sk->sk_state == TCP_LISTEN)
1414                 goto out;
1415 
1416         timeo = sock_rcvtimeo(sk, nonblock);
1417 
1418         /* Urgent data needs to be handled specially. */
1419         if (flags & MSG_OOB)
1420                 goto recv_urg;
1421 
1422         seq = &tp->copied_seq;
1423         if (flags & MSG_PEEK) {
1424                 peek_seq = tp->copied_seq;
1425                 seq = &peek_seq;
1426         }
1427 
1428         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1429 
1430 #ifdef CONFIG_NET_DMA
1431         tp->ucopy.dma_chan = NULL;
1432         preempt_disable();
1433         skb = skb_peek_tail(&sk->sk_receive_queue);
1434         {
1435                 int available = 0;
1436 
1437                 if (skb)
1438                         available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1439                 if ((available < target) &&
1440                     (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1441                     !sysctl_tcp_low_latency &&
1442                     dma_find_channel(DMA_MEMCPY)) {
1443                         preempt_enable_no_resched();
1444                         tp->ucopy.pinned_list =
1445                                         dma_pin_iovec_pages(msg->msg_iov, len);
1446                 } else {
1447                         preempt_enable_no_resched();
1448                 }
1449         }
1450 #endif
1451 
1452         do {
1453                 u32 offset;
1454 
1455                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1456                 if (tp->urg_data && tp->urg_seq == *seq) {
1457                         if (copied)
1458                                 break;
1459                         if (signal_pending(current)) {
1460                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1461                                 break;
1462                         }
1463                 }
1464 
1465                 /* Next get a buffer. */
1466 
1467                 skb_queue_walk(&sk->sk_receive_queue, skb) {  // walk sk_receive_queue for the next segment to read
1468                         /* Now that we have two receive queues this
1469                          * shouldn't happen.
1470                          */
1471                         if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1472                              KERN_INFO "recvmsg bug: copied %X "
1473                                        "seq %X rcvnxt %X fl %X\n", *seq,
1474                                        TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1475                                        flags))
1476                                 break;
1477 
1478                         offset = *seq - TCP_SKB_CB(skb)->seq;
1479                         if (tcp_hdr(skb)->syn)
1480                                 offset--;
1481                         if (offset < skb->len)
1482                                 goto found_ok_skb;
1483                         if (tcp_hdr(skb)->fin)
1484                                 goto found_fin_ok;
1485                         WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
1486                                         "copied %X seq %X rcvnxt %X fl %X\n",
1487                                         *seq, TCP_SKB_CB(skb)->seq,
1488                                         tp->rcv_nxt, flags);
1489                 }
1490 
1491                 /* Well, if we have backlog, try to process it now yet. */
1492 
1493                 if (copied >= target && !sk->sk_backlog.tail)   // enough data copied and the backlog is empty:
1494                         break;                  // we are done; if the backlog is non-empty we keep going and drain it via release_sock() below
1495 
1496                 if (copied) {
1497                         if (sk->sk_err ||
1498                             sk->sk_state == TCP_CLOSE ||
1499                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1500                             !timeo ||
1501                             signal_pending(current))
1502                                 break;
1503                 } else {
1504                         if (sock_flag(sk, SOCK_DONE))
1505                                 break;
1506 
1507                         if (sk->sk_err) {
1508                                 copied = sock_error(sk);
1509                                 break;
1510                         }
1511 
1512                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1513                                 break;
1514 
1515                         if (sk->sk_state == TCP_CLOSE) {
1516                                 if (!sock_flag(sk, SOCK_DONE)) {
1517                                         /* This occurs when user tries to read
1518                                          * from never connected socket.
1519                                          */
1520                                         copied = -ENOTCONN;
1521                                         break;
1522                                 }
1523                                 break;
1524                         }
1525 
1526                         if (!timeo) {
1527                                 copied = -EAGAIN;
1528                                 break;
1529                         }
1530 
1531                         if (signal_pending(current)) {
1532                                 copied = sock_intr_errno(timeo);
1533                                 break;
1534                         }
1535                 }
1536 
1537                 tcp_cleanup_rbuf(sk, copied);
1538 
1539                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1540                         /* Install new reader */
1541                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1542                                 user_recv = current;
1543                                 tp->ucopy.task = user_recv;
1544                                 tp->ucopy.iov = msg->msg_iov;
1545                         }
1546 
1547                         tp->ucopy.len = len;
1548 
1549                         WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1550                                 !(flags & (MSG_PEEK | MSG_TRUNC)));
1551 
1552                         /* Ugly... If prequeue is not empty, we have to
1553                          * process it before releasing socket, otherwise
1554                          * order will be broken at second iteration.
1555                          * More elegant solution is required!!!
1556                          *
1557                          * Look: we have the following (pseudo)queues:
1558                          *
1559                          * 1. packets in flight
1560                          * 2. backlog
1561                          * 3. prequeue
1562                          * 4. receive_queue
1563                          *
1564                          * Each queue can be processed only if the next ones
1565                          * are empty. At this point we have empty receive_queue.
1566                          * But prequeue _can_ be not empty after 2nd iteration,
1567                          * when we jumped to start of loop because backlog
1568                          * processing added something to receive_queue.
1569                          * We cannot release_sock(), because backlog contains
1570                          * packets arrived _after_ prequeued ones.
1571                          *
1572                          * Shortly, algorithm is clear --- to process all
1573                          * the queues in order. We could make it more directly,
1574                          * requeueing packets from backlog to prequeue, if
1575                          * is not empty. It is more elegant, but eats cycles,
1576                          * unfortunately.
1577                          */
1578                         if (!skb_queue_empty(&tp->ucopy.prequeue))       
1579                                 goto do_prequeue;  // go process the prequeue
1580 
1581                         /* __ Set realtime policy in scheduler __ */
1582                 }
1583 
1584 #ifdef CONFIG_NET_DMA
1585                 if (tp->ucopy.dma_chan)
1586                         dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1587 #endif
1588                 if (copied >= target) {
1589                         /* Do not sleep, just process backlog. */
1590                         release_sock(sk);
1591                         lock_sock(sk);
1592                 } else
1593                         sk_wait_data(sk, &timeo);
1594 
1595 #ifdef CONFIG_NET_DMA
1596                 tcp_service_net_dma(sk, false);  /* Don't block */
1597                 tp->ucopy.wakeup = 0;
1598 #endif
1599 
1600                 if (user_recv) {
1601                         int chunk;
1602 
1603                         /* __ Restore normal policy in scheduler __ */
1604 
1605                         if ((chunk = len - tp->ucopy.len) != 0) {
1606                                 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1607                                 len -= chunk;
1608                                 copied += chunk;
1609                         }
1610 
1611                         if (tp->rcv_nxt == tp->copied_seq &&
1612                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1613 do_prequeue:
1614                                 tcp_prequeue_process(sk);   // drain the prequeue
1615 
1616                                 if ((chunk = len - tp->ucopy.len) != 0) {
1617                                         NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1618                                         len -= chunk;
1619                                         copied += chunk;
1620                                 }
1621                         }
1622                 }
1623                 if ((flags & MSG_PEEK) &&
1624                     (peek_seq - copied - urg_hole != tp->copied_seq)) {
1625                         if (net_ratelimit())
1626                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1627                                        current->comm, task_pid_nr(current));
1628                         peek_seq = tp->copied_seq;
1629                 }
1630                 continue;
1631 
1632         found_ok_skb:
1633                 /* Ok so how much can we use? */
1634                 used = skb->len - offset;
1635                 if (len < used)
1636                         used = len;
1637 
1638                 /* Do we have urgent data here? */
1639                 if (tp->urg_data) {
1640                         u32 urg_offset = tp->urg_seq - *seq;
1641                         if (urg_offset < used) {
1642                                 if (!urg_offset) {
1643                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1644                                                 ++*seq;
1645                                                 urg_hole++;
1646                                                 offset++;
1647                                                 used--;
1648                                                 if (!used)
1649                                                         goto skip_copy;
1650                                         }
1651                                 } else
1652                                         used = urg_offset;
1653                         }
1654                 }
1655 
1656                 if (!(flags & MSG_TRUNC)) {
1657 #ifdef CONFIG_NET_DMA
1658                         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1659                                 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1660 
1661                         if (tp->ucopy.dma_chan) {
1662                                 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1663                                         tp->ucopy.dma_chan, skb, offset,
1664                                         msg->msg_iov, used,
1665                                         tp->ucopy.pinned_list);
1666 
1667                                 if (tp->ucopy.dma_cookie < 0) {
1668 
1669                                         printk(KERN_ALERT "dma_cookie < 0\n");
1670 
1671                                         /* Exception. Bailout! */
1672                                         if (!copied)
1673                                                 copied = -EFAULT;
1674                                         break;
1675                                 }
1676 
1677                                 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1678 
1679                                 if ((offset + used) == skb->len)
1680                                         copied_early = 1;
1681 
1682                         } else
1683 #endif
1684                         {
1685                                 err = skb_copy_datagram_iovec(skb, offset,
1686                                                 msg->msg_iov, used);
1687                                 if (err) {
1688                                         /* Exception. Bailout! */
1689                                         if (!copied)
1690                                                 copied = -EFAULT;
1691                                         break;
1692                                 }
1693                         }
1694                 }
1695 
1696                 *seq += used;
1697                 copied += used;
1698                 len -= used;
1699 
1700                 tcp_rcv_space_adjust(sk);
1701 
1702 skip_copy:
1703                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1704                         tp->urg_data = 0;
1705                         tcp_fast_path_check(sk);
1706                 }
1707                 if (used + offset < skb->len)
1708                         continue;
1709 
1710                 if (tcp_hdr(skb)->fin)
1711                         goto found_fin_ok;
1712                 if (!(flags & MSG_PEEK)) {
1713                         sk_eat_skb(sk, skb, copied_early);
1714                         copied_early = 0;
1715                 }
1716                 continue;
1717 
1718         found_fin_ok:
1719                 /* Process the FIN. */
1720                 ++*seq;
1721                 if (!(flags & MSG_PEEK)) {
1722                         sk_eat_skb(sk, skb, copied_early);
1723                         copied_early = 0;
1724                 }
1725                 break;
1726         } while (len > 0);
1727 
1728         if (user_recv) {
1729                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1730                         int chunk;
1731 
1732                         tp->ucopy.len = copied > 0 ? len : 0;
1733 
1734                         tcp_prequeue_process(sk);
1735 
1736                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1737                                 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1738                                 len -= chunk;
1739                                 copied += chunk;
1740                         }
1741                 }
1742 
1743                 tp->ucopy.task = NULL;
1744                 tp->ucopy.len = 0;
1745         }
1746 
1747 #ifdef CONFIG_NET_DMA
1748         tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
1749         tp->ucopy.dma_chan = NULL;
1750 
1751         if (tp->ucopy.pinned_list) {
1752                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1753                 tp->ucopy.pinned_list = NULL;
1754         }
1755 #endif
1756 
1757         /* According to UNIX98, msg_name/msg_namelen are ignored
1758          * on connected socket. I was just happy when found this 8) --ANK
1759          */
1760 
1761         /* Clean up data we have read: This will do ACK frames. */
1762         tcp_cleanup_rbuf(sk, copied);
1763 
1764         TCP_CHECK_TIMER(sk);
1765         release_sock(sk);
1766         return copied;
1767 
1768 out:
1769         TCP_CHECK_TIMER(sk);
1770         release_sock(sk);           // release_sock() is also where the backlog gets drained (via __release_sock())
1771         return err;
1772 
1773 recv_urg:
1774         err = tcp_recv_urg(sk, msg, len, flags);
1775         goto out;
1776 }
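Putting the pieces together, the ordering inside tcp_recvmsg() is what ties the three queues into one stream: sk_receive_queue is consumed first, the prequeue is drained next through tcp_prequeue_process(), and the backlog is only drained when the socket lock is dropped in release_sock(). The loose model below is purely illustrative (plain counters instead of skb lists); it only shows that ordering and the fact that draining the later queues refills sk_receive_queue for the next pass.

#include <stdio.h>

/* Plain counters standing in for the three queues (illustrative only). */
struct rx_model {
    int receive_queue;   /* sk->sk_receive_queue */
    int prequeue;        /* tp->ucopy.prequeue   */
    int backlog;         /* sk->sk_backlog       */
};

/* One pass of tcp_recvmsg(), reduced to queue movements. */
static int recvmsg_pass(struct rx_model *q)
{
    int copied = q->receive_queue;      /* 1. copy what is already queued        */
    q->receive_queue = 0;

    q->receive_queue += q->prequeue;    /* 2. tcp_prequeue_process(): prequeued  */
    q->prequeue = 0;                    /*    segments go through the normal     */
                                        /*    receive path                       */

    q->receive_queue += q->backlog;     /* 3. release_sock() -> __release_sock():*/
    q->backlog = 0;                     /*    the backlog runs through the same  */
                                        /*    path                               */
    return copied;
}

int main(void)
{
    struct rx_model q = { .receive_queue = 2, .prequeue = 3, .backlog = 1 };

    int copied = recvmsg_pass(&q);
    printf("copied %d segments; %d more now sit on sk_receive_queue "
           "for the next pass\n", copied, q.receive_queue);
    return 0;
}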

 


Now trace tcp_prequeue_process():

1240 static void tcp_prequeue_process(struct sock *sk)
1241 {
1242         struct sk_buff *skb;
1243         struct tcp_sock *tp = tcp_sk(sk);
1244 
1245         NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1246 
1247         /* RX process wants to run with disabled BHs, though it is not
1248          * necessary */
1249         local_bh_disable();
1250         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)  // walk every packet on the prequeue
1251                 sk_backlog_rcv(sk, skb);  // process it, merging it into sk_receive_queue (or copying it straight to the user)
1252         local_bh_enable();
1253 
1254         /* Clear memory counter. */
1255         tp->ucopy.memory = 0;
1256 }


Finally, look at release_sock():

1952 void release_sock(struct sock *sk)
1953 {
1954         /*
1955          * The sk_lock has mutex_unlock() semantics:
1956          */
1957         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1958 
1959         spin_lock_bh(&sk->sk_lock.slock);
1960         if (sk->sk_backlog.tail)  // the backlog is non-empty
1961                 __release_sock(sk);  // this does the real work
1962         sk->sk_lock.owned = 0;
1963         if (waitqueue_active(&sk->sk_lock.wq))
1964                 wake_up(&sk->sk_lock.wq);
1965         spin_unlock_bh(&sk->sk_lock.slock);
1966 }
1523 static void __release_sock(struct sock *sk)
1524 {
1525         struct sk_buff *skb = sk->sk_backlog.head;  // grab the current backlog head
1526 
1527         do {
1528                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;  // detach the list: clear head and tail
1529                 bh_unlock_sock(sk);   // drop the socket spinlock (matters on SMP)
1530 
1531                 do {
1532                         struct sk_buff *next = skb->next;
1533 
1534                         skb->next = NULL;
1535                         sk_backlog_rcv(sk, skb);  // calls tcp_v4_do_rcv() -> tcp_rcv_established(),
                                                      // which merges the backlogged packet into sk_receive_queue
1536 
1537                         /*
1538                          * We are in process context here with softirqs
1539                          * disabled, use cond_resched_softirq() to preempt.
1540                          * This is safe to do because we've taken the backlog
1541                          * queue private:
1542                          */
1543                         cond_resched_softirq();
1544 
1545                         skb = next;  // move on to the next packet
1546                 } while (skb != NULL);  // walk every packet in the detached backlog
1547 
1548                 bh_lock_sock(sk);
1549         } while ((skb = sk->sk_backlog.head) != NULL);  // this is the part that puzzled me for a while

          /* How to read this outer loop:
           * __release_sock() runs in process context, while sk_add_backlog() runs in
           * softirq context, and softirqs are serviced with priority. So while
           * __release_sock() is busy processing packets, new packets may keep being
           * added to the backlog, leaving sk->sk_backlog.head non-NULL; only when no
           * new packets have been backlogged does the outer loop stop.
           */

1550 
1551         /*
1552          * Doing the zeroing here guarantee we can not loop forever
1553          * while a wild producer attempts to flood us.
1554          */
1555         sk->sk_backlog.len = 0;
1556 }
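The pattern inside __release_sock() (take the backlog private, drain it with the spinlock dropped, then re-check the head because softirqs may have backlogged more packets in the meantime) can be reproduced in a self-contained toy. This only models the control flow under simplified assumptions; the single "refill" stands in for a softirq running concurrently.

#include <stdio.h>
#include <stdlib.h>

/* Toy singly linked list standing in for sk_backlog (not kernel code). */
struct pkt { struct pkt *next; int id; };
struct toy_sock { struct pkt *head, *tail; };

/* Softirq side: the sk_add_backlog() equivalent. */
static void backlog_add(struct toy_sock *sk, int id)
{
    struct pkt *p = malloc(sizeof(*p));
    p->id = id;
    p->next = NULL;
    if (sk->tail)
        sk->tail->next = p;
    else
        sk->head = p;
    sk->tail = p;
}

/* Mirrors the structure of __release_sock(): detach the whole list, process it,
 * then re-check the head, because new packets may have been backlogged while we
 * were processing (simulated here by a single refill). */
static void toy_release_sock(struct toy_sock *sk)
{
    int refill_once = 1;
    struct pkt *skb = sk->head;

    do {
        sk->head = sk->tail = NULL;                  /* take the backlog private */

        do {
            struct pkt *next = skb->next;
            printf("processing packet %d\n", skb->id);   /* sk_backlog_rcv() */
            free(skb);
            skb = next;
        } while (skb != NULL);

        if (refill_once--)                           /* pretend a softirq ran meanwhile */
            backlog_add(sk, 99);
    } while ((skb = sk->head) != NULL);              /* anything new? loop again */
}

int main(void)
{
    struct toy_sock sk = { NULL, NULL };
    backlog_add(&sk, 1);
    backlog_add(&sk, 2);
    toy_release_sock(&sk);   /* prints packets 1, 2, then the late-arriving 99 */
    return 0;
}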


posted on 2015-04-01 18:40 by mylinuxer