socket connect tcp_v4_connect

tcp_v4_connect

/*
 * tcp_v4_connect - initiate an outgoing IPv4 TCP connection.
 *
 * Validates the destination address, resolves a route to it, fills in the
 * socket's source/destination identity, moves the socket to TCP_SYN_SENT,
 * binds a local port and hashes the socket via inet_hash_connect(), picks
 * an initial write sequence number, and finally calls tcp_connect() (or
 * tcp_repair_connect() in repair mode).
 *
 * NOTE(review): per the original author's notes, tcp_connect() is what
 * builds and transmits the SYN and sets up window size, MSS and timers;
 * that function is not shown here.
 *
 * @sk:       socket being connected
 * @uaddr:    destination address; must be a struct sockaddr_in with AF_INET
 * @addr_len: size of the buffer behind @uaddr
 *
 * Returns 0 on success. Returns -EINVAL for a short address or a zero
 * destination combined with a source-route option, -EAFNOSUPPORT for a
 * non-AF_INET family, -ENETUNREACH for a multicast/broadcast route, or the
 * error reported by ip_route_connect(), inet_hash_connect(),
 * ip_route_newports(), tcp_connect() or tcp_repair_connect().
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __be16 orig_sport, orig_dport;
    __be32 daddr, nexthop;
    struct flowi4 *fl4;
    struct rtable *rt;
    int err;
    struct ip_options_rcu *inet_opt;

    if (addr_len < sizeof(struct sockaddr_in))
        return -EINVAL;

    if (usin->sin_family != AF_INET)
        return -EAFNOSUPPORT;

    /*
     * Check whether a source-route (SRR) IP option is set on the socket.
     * If so, the route lookup targets the option's first hop (faddr)
     * rather than the final destination.
     */
    nexthop = daddr = usin->sin_addr.s_addr;
    inet_opt = rcu_dereference_protected(inet->inet_opt,
                         sock_owned_by_user(sk));
    if (inet_opt && inet_opt->opt.srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet_opt->opt.faddr;
    }

    /*
     * Look up a route with ip_route_connect() using the next hop, the
     * current source address, the bound device and the ports. The flow
     * key lives in the socket's cork area (fl4). Multicast and broadcast
     * routes are rejected below.
     * NOTE(review): the original author states the lookup is ultimately
     * done by ip_route_output_flow(); not visible here.
     */
    orig_sport = inet->inet_sport;
    orig_dport = usin->sin_port;
    fl4 = &inet->cork.fl.u.ip4;
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                  RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                  IPPROTO_TCP,
                  orig_sport, orig_dport, sk, true);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        if (err == -ENETUNREACH)
            IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return err;
    }

    /* TCP cannot connect to a multicast or broadcast destination. */
    if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
        ip_rt_put(rt);
        return -ENETUNREACH;
    }

    /* Without source routing, use the destination the route lookup settled on. */
    if (!inet_opt || !inet_opt->opt.srr)
        daddr = fl4->daddr;

    /* If no source address was set, adopt the one chosen by the route lookup. */
    if (!inet->inet_saddr)
        inet->inet_saddr = fl4->saddr;
    inet->inet_rcv_saddr = inet->inet_saddr;

    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent       = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        if (likely(!tp->repair))
            tp->write_seq       = 0;
    }

    /*
     * With tw_recycle enabled and no timestamp state on the socket, call
     * tcp_fetch_timewait_stamp() (original note: "fetch the time the
     * socket was most recently used"; helper not shown — presumably it
     * seeds ts_recent from cached per-destination state, confirm).
     */
    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
        tcp_fetch_timewait_stamp(sk, &rt->dst);

    inet->inet_dport = usin->sin_port;
    inet->inet_daddr = daddr;

    /* Account for IP options, if any, in the extension header length. */
    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

    /* Socket identity is still unknown (sport may be zero).
     * However we set state to SYN-SENT and not releasing socket
     * lock select source port, enter ourselves into the hash tables and
     * complete initialization after this.
     *
     * tcp_set_state() switches the socket to TCP_SYN_SENT;
     * inet_hash_connect() then assigns an ephemeral local port if none
     * is bound and inserts the socket into the connection hash tables.
     */
    tcp_set_state(sk, TCP_SYN_SENT);
    /* Bind a local port (if needed) and hash the socket. */
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    /* The source port may have just been chosen; refresh the route for the new ports. */
    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                   inet->inet_sport, inet->inet_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        /* rt holds an error pointer here; clear it before the failure path calls ip_rt_put(). */
        rt = NULL;
        goto failure;
    }
    /* OK, now commit destination to socket.  */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->dst);

    /* Pick an initial sequence number unless one is already set or we are in repair mode. */
    if (!tp->write_seq && likely(!tp->repair))
        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                               inet->inet_daddr,
                               inet->inet_sport,
                               usin->sin_port);

    inet->inet_id = tp->write_seq ^ jiffies;

    /*
     * With the first sequence number initialized, hand over to
     * tcp_connect() to finish establishing the connection.
     * NOTE(review): per the original author, tcp_connect() queues the
     * SYN segment on the socket's send queue and passes it to the IP
     * layer through tcp_transmit_skb(); not visible here.
     */
    if (likely(!tp->repair))
        err = tcp_connect(sk);
    else
        err = tcp_repair_connect(sk);

    /*
     * Clear rt so the failure path does not call ip_rt_put() on the
     * route; presumably the socket took over the reference in
     * sk_setup_caps() — confirm.
     */
    rt = NULL;
    if (err)
        goto failure;

    return 0;

failure:
    /*
     * This unhashes the socket and releases the local port,
     * if necessary.
     */
    tcp_set_state(sk, TCP_CLOSE);
    ip_rt_put(rt);
    sk->sk_route_caps = 0;
    inet->inet_dport = 0;
    return err;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
              struct sock *sk)
{
    return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
            __inet_check_established, __inet_hash_nolisten);
}


/*
 * __inet_hash_connect - bind a local port for connect() and hash the socket.
 *
 * @death_row:         timewait death row; also supplies the hash tables
 *                     (death_row->hashinfo)
 * @sk:                socket being connected
 * @port_offset:       per-socket offset mixed into the ephemeral port search
 * @check_established: callback that verifies the connection identity is
 *                     unique for a given local port; returns 0 when it is
 * @hash:              callback that inserts the socket into the connection
 *                     hash table
 *
 * If the socket has no local port yet (inet_num == 0), the local port range
 * is walked, starting from a position derived from a static hint plus
 * @port_offset, until a usable port is found. If the socket already has a
 * port, only the uniqueness of the resulting connection is verified.
 *
 * Returns 0 on success, -EADDRNOTAVAIL when no port in the range could be
 * used (including failure to allocate a bind bucket), or the result of
 * @check_established for an already-bound socket.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,
        int (*check_established)(struct inet_timewait_death_row *,
            struct sock *, __u16, struct inet_timewait_sock **),
        int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    const unsigned short snum = inet_sk(sk)->inet_num;
    struct inet_bind_hashbucket *head;
    struct inet_bind_bucket *tb;
    int ret;
    struct net *net = sock_net(sk);
    /*
     * Number of inet_twsk_put() calls made on a recycled timewait socket.
     * Starts at 1, so one put always happens when tw is set; presumably
     * that drops a reference obtained via check_established — confirm.
     */
    int twrefcnt = 1;

    if (!snum) { /* no local port bound yet: pick one */
        int i, remaining, low, high, port;
        /* Shared across calls so successive searches start at different ports. */
        static u32 hint;
        u32 offset = hint + port_offset;
        struct hlist_node *node;
        struct inet_timewait_sock *tw = NULL;

        inet_get_local_port_range(&low, &high);
        remaining = (high - low) + 1;

        /* Bottom halves stay disabled until the local_bh_enable() at "out" or after the loop. */
        local_bh_disable();
        for (i = 1; i <= remaining; i++) {
            port = low + (i + offset) % remaining;
            if (inet_is_reserved_local_port(port))
                continue;
            head = &hinfo->bhash[inet_bhashfn(net, port,
                    hinfo->bhash_size)];
            spin_lock(&head->lock);

            /* Does not bother with rcv_saddr checks,
             * because the established check is already
             * unique enough.
             *
             * A socket may be bound to a port either through the
             * bind() system call or because this function selected
             * the port during connect(). Buckets created below are
             * marked fastreuse = -1; buckets with fastreuse >= 0 are
             * skipped (presumably those belong to bind() users —
             * confirm).
             */
            inet_bind_bucket_for_each(tb, node, &head->chain) {
                if (net_eq(ib_net(tb), net) &&
                    tb->port == port) {
                    if (tb->fastreuse >= 0)
                        goto next_port;
                    WARN_ON(hlist_empty(&tb->owners));
                    /* Port already in use: OK only if the connection identity is unique. */
                    if (!check_established(death_row, sk,
                                port, &tw))
                        goto ok;
                    goto next_port;
                }
            }
            /* No bind bucket exists for this port: it is currently unused. */
            tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                    net, head, port);
            if (!tb) {
                /* Allocation failed: give up and report -EADDRNOTAVAIL below. */
                spin_unlock(&head->lock);
                break;
            }
            tb->fastreuse = -1;
            goto ok;

        next_port:
            spin_unlock(&head->lock);
        }
        local_bh_enable();

        return -EADDRNOTAVAIL;

ok:
        /* Advance the hint so the next search starts past the port just taken. */
        hint += i;

        /* Head lock still held and bh's disabled
         *
         * inet_bind_hash() associates the socket with this port's bind
         * bucket (original note: the socket is added to tb's socket
         * list; helper not shown).
         */
        inet_bind_hash(sk, tb, port);
        if (sk_unhashed(sk)) { /* socket is not yet in the connection hash table */
            inet_sk(sk)->inet_sport = htons(port);
            /* Insert via the hash callback; its return value adds to the puts owed on tw. */
            twrefcnt += hash(sk, tw);
        }
        if (tw)
            twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
        spin_unlock(&head->lock);

        if (tw) {
            /* A timewait socket was recycled: deschedule it and drop the accumulated references. */
            inet_twsk_deschedule(tw, death_row);
            while (twrefcnt) {
                twrefcnt--;
                inet_twsk_put(tw);
            }
        }

        ret = 0;
        /* "out" lives inside the else-branch below; it re-enables bottom halves. */
        goto out;
    }

    head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
    /* Bind bucket recorded on the socket (this only reads it; presumably set when the port was bound). */
    tb  = inet_csk(sk)->icsk_bind_hash;
    spin_lock_bh(&head->lock);
    /*
     * If this socket is the first owner of the bucket and has no
     * successor on the owners list, it is the only socket bound to the
     * port, so no conflict check is needed. Otherwise the else-branch
     * checks the established table: a successful bind() does not by
     * itself guarantee that the port is usable for this connection.
     */
    if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
        hash(sk, NULL); /* insert the socket into the connection hash table */
        spin_unlock_bh(&head->lock);
        return 0;
    } else {
        /* Plain spin_unlock(): bottom halves stay disabled for check_established(). */
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        ret = check_established(death_row, sk, snum, NULL);
out:
        local_bh_enable();
        return ret;
    }
}

 

创建一个套接字,设置SO_REUSEADDR选项,建立连接后立即关闭,关闭后立即又重复同样的过程,会发现第二次调用connect()时返回EADDRNOTAVAIL错误。
从下文的__inet_check_established可以看到,返回EADDRNOTAVAIL错误有两种情况:
   1、在TIME_WAIT传输控制块中找到匹配的连接(地址和端口均相同),并且twsk_unique()返回false(该TIME_WAIT连接不能被复用)时;
   2、在除TIME_WAIT和LISTEN状态外的传输控制块中存在匹配的连接。
  第二种情况很容易理解:只要处于FIN_WAIT_1、ESTABLISHED等状态的传输控制块使用的地址和端口与要建立的连接相同,就会返回EADDRNOTAVAIL错误。
第一种情况还要取决于twsk_unique()的返回值:返回true时该TIME_WAIT连接会被回收复用,connect()可以成功;返回false时才会返回EADDRNOTAVAIL。


__inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:
1、如果没有选取端口则选定一个;

2、将socket与端口绑定;

3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。

  另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突
而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”的冲突。
__inet_hash_connect检查冲突的函数是__inet_check_established:

/*
 * __inet_check_established - check that a connection identity is unique and,
 * if it is, insert the socket into the established hash chain.
 *
 * called with local bh disabled
 *
 * @death_row: timewait death row; supplies the hash tables (hashinfo)
 * @sk:        socket being connected
 * @lport:     candidate local port (host byte order; it is passed through
 *             htons() when stored in inet_sport)
 * @twp:       if non-NULL, receives the matching timewait socket (or NULL)
 *             so the caller can dispose of it; if NULL, a matching timewait
 *             socket is descheduled and released here
 *
 * Returns 0 when the identity is unique (the socket is then hashed and its
 * inet_num/inet_sport are set), or -EADDRNOTAVAIL when a matching socket
 * exists in the established chain, or a matching timewait socket exists for
 * which twsk_unique() returns false.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
                    struct sock *sk, __u16 lport,
                    struct inet_timewait_sock **twp)
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    struct inet_sock *inet = inet_sk(sk);
    /*
     * Names are swapped relative to the socket fields: daddr holds the
     * local (receive) address and saddr holds the remote address
     * (presumably so they read as the addresses of an incoming packet —
     * confirm).
     */
    __be32 daddr = inet->inet_rcv_saddr;
    __be32 saddr = inet->inet_daddr;
    int dif = sk->sk_bound_dev_if;
    INET_ADDR_COOKIE(acookie, saddr, daddr)
    const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
    struct net *net = sock_net(sk);
    unsigned int hash = inet_ehashfn(net, daddr, lport,
                     saddr, inet->inet_dport);
    /* Locate the connection hash bucket for this identity. */
    struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
    spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
    struct sock *sk2;
    const struct hlist_nulls_node *node;
    struct inet_timewait_sock *tw;
    int twrefcnt = 0;

    spin_lock(lock);

    /* Check TIME-WAIT sockets first.
     * The timewait chain is scanned before the established chain; a
     * match in the established chain always fails, while a timewait
     * match fails unless twsk_unique() allows it to be reused.
     */
    sk_nulls_for_each(sk2, node, &head->twchain) {
        tw = inet_twsk(sk2);

        if (INET_TW_MATCH(sk2, net, hash, acookie,
                    saddr, daddr, ports, dif)) {
            if (twsk_unique(sk, sk2, twp))
                goto unique;
            else
                goto not_unique;
        }
    }
    /* No timewait match: make sure the "unique" path does not see a stale tw. */
    tw = NULL;

    /* And established part... */
    sk_nulls_for_each(sk2, node, &head->chain) {
        if (INET_MATCH(sk2, net, hash, acookie,
                    saddr, daddr, ports, dif))
            goto not_unique;
    }

unique:
    /* Must record num and sport now. Otherwise we will see
     * in hash table socket with a funny identity. */
    inet->inet_num = lport;
    inet->inet_sport = htons(lport);
    sk->sk_hash = hash;
    WARN_ON(!sk_unhashed(sk));
    __sk_nulls_add_node_rcu(sk, &head->chain);
    if (tw) {
        /* Reusing a timewait identity: unhash the old timewait socket. */
        twrefcnt = inet_twsk_unhash(tw);
        NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
    }
    spin_unlock(lock);
    if (twrefcnt)
        inet_twsk_put(tw);
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

    if (twp) {
        /* Hand the timewait socket (possibly NULL) back to the caller. */
        *twp = tw;
    } else if (tw) {
        /* Silly. Should hash-dance instead... */
        inet_twsk_deschedule(tw, death_row);

        inet_twsk_put(tw);
    }
    return 0;

not_unique:
    spin_unlock(lock);
    return -EADDRNOTAVAIL;
}
 在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:

/*
 * __inet_hash - insert a socket into the appropriate lookup table.
 *
 * A socket in TCP_LISTEN state is linked into its listening hash bucket
 * under that bucket's lock; a socket in any other state is delegated to
 * __inet_hash_nolisten() with no timewait socket.
 */
static void __inet_hash(struct sock *sk)
{
    struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
    struct inet_listen_hashbucket *bucket;

    if (sk->sk_state == TCP_LISTEN) {
        /* The socket must not already be hashed. */
        WARN_ON(!sk_unhashed(sk));
        bucket = &hinfo->listening_hash[inet_sk_listen_hashfn(sk)];

        spin_lock(&bucket->lock);
        __sk_nulls_add_node_rcu(sk, &bucket->head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        spin_unlock(&bucket->lock);
        return;
    }

    /* Not listening: goes into the connection hash table instead. */
    __inet_hash_nolisten(sk, NULL);
}


/*
 * __inet_hash_nolisten - insert a non-listening socket into the connection
 * hash table, optionally unhashing a timewait socket it replaces.
 *
 * @sk: socket to insert; must not already be hashed
 * @tw: timewait socket to unhash under the same bucket lock, or NULL
 *
 * Returns the value of inet_twsk_unhash(tw), or 0 when @tw is NULL.
 */
int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
    struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
    struct inet_ehash_bucket *bucket;
    spinlock_t *bucket_lock;
    int released = 0;

    WARN_ON(!sk_unhashed(sk));

    /* Compute the hash once and derive both the bucket and its lock from it. */
    sk->sk_hash = inet_sk_ehashfn(sk);
    bucket = inet_ehash_bucket(hinfo, sk->sk_hash);
    bucket_lock = inet_ehash_lockp(hinfo, sk->sk_hash);

    spin_lock(bucket_lock);
    __sk_nulls_add_node_rcu(sk, &bucket->chain);
    if (tw) {
        /* The replaced timewait socket is expected to hash to the same value. */
        WARN_ON(sk->sk_hash != tw->tw_hash);
        released = inet_twsk_unhash(tw);
    }
    spin_unlock(bucket_lock);

    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
    return released;
}


/*
 * inet_ehash_bucket - map a hash value to its connection hash bucket.
 *
 * The hash is reduced with hashinfo->ehash_mask and used as an index into
 * hashinfo->ehash; a pointer to that bucket is returned.
 */
static inline struct inet_ehash_bucket *inet_ehash_bucket(
    struct inet_hashinfo *hashinfo,
    unsigned int hash)
{
    const unsigned int slot = hash & hashinfo->ehash_mask;

    return &hashinfo->ehash[slot];
}
/*
 * Summary (original author's note): after listen(), a server-side socket
 * is added to sk->sk_prot->h.hashinfo->listening_hash; after connect(), a
 * client-side socket is added to sk->sk_prot->h.hashinfo->ehash. For both
 * TCPv4 and TCPv6, sk->sk_prot->h.hashinfo is said to point to
 * tcp_hashinfo (not visible in this excerpt).
 */

 在测试设备并发时 出现了如下log

tcp_connect2] idx 3 (192.168.43.4:0)-->(192.168.43.11, 80):errno:99 Cannot assign requested address

说明此时可用的本地端口(源 ip:port 组合)已经不够用了,connect() 因此返回 EADDRNOTAVAIL(errno 99)。

 

根据上述代码可知:

tcp_connect时,源ip以及出接口都可以不用指定:内核会根据目的ip查找路由,得到出接口,并把路由结果中的源地址(即该出接口上对应的ip,fl4->saddr)作为本端ip。

 

posted @ 2019-11-20 11:24  codestacklinuxer  阅读(578)  评论(0编辑  收藏  举报