Netfilter 之 iptable_nat

初始化

iptable_nat_table_init函数通过调用ipt_register_table完成NAT表注册和钩子函数注册的功能;该注册流程与iptable_filter所调用的函数一致,此处不再重复分析,详情请移步<iptable_filter分析>;

 1 static int __net_init iptable_nat_table_init(struct net *net)
 2 {
 3     struct ipt_replace *repl;
 4     int ret;
 5 
 6     /* NAT table already set up for this netns: nothing to do */
 7     if (net->ipv4.nat_table)
 8         return 0;
 9 
10     /* Allocate the initial table used for the registration below */
11     repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
12     if (repl == NULL)
13         return -ENOMEM;
14     /* Register the table together with the nf_nat_ipv4_ops hooks; the result is stored in net->ipv4.nat_table */
15     ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
16                  nf_nat_ipv4_ops, &net->ipv4.nat_table);
17     kfree(repl);
18     return ret;
19 }

 

钩子函数分析
钩子函数以及钩子点

nf_nat_ipv4_ops是NAT相关钩子函数的数组,其调用顺序和钩子点见下面注释;其中filter工作在DNAT和SNAT之间;

这几个钩子函数都会调用nf_nat_ipv4_fn来完成NAT转换,本部分最后统一分析该函数;

 1 /* Array of NAT hook ops */
 2 /* Order: DNAT -> filter -> SNAT */
 3 /* Local delivery: PRE_ROUTING(DNAT) -> LOCAL_IN(SNAT) */
 4 /* Forwarding: PRE_ROUTING(DNAT) -> POST_ROUTING(SNAT) */
 5 /* Local output: LOCAL_OUT(DNAT) -> POST_ROUTING(SNAT) */
 6 static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 7     /* Before packet filtering, change destination */
 8     {
 9         .hook        = iptable_nat_ipv4_in,
10         .pf        = NFPROTO_IPV4,
11         .hooknum    = NF_INET_PRE_ROUTING,
12         .priority    = NF_IP_PRI_NAT_DST, /* DNAT */
13     },
14     /* After packet filtering, change source */
15     {
16         .hook        = iptable_nat_ipv4_out,
17         .pf        = NFPROTO_IPV4,
18         .hooknum    = NF_INET_POST_ROUTING,
19         .priority    = NF_IP_PRI_NAT_SRC, /* SNAT */
20     },
21     /* Before packet filtering, change destination */
22     {
23         .hook        = iptable_nat_ipv4_local_fn,
24         .pf        = NFPROTO_IPV4,
25         .hooknum    = NF_INET_LOCAL_OUT,
26         .priority    = NF_IP_PRI_NAT_DST, /* DNAT */
27     },
28     /* After packet filtering, change source */
29     {
30         .hook        = iptable_nat_ipv4_fn,
31         .pf        = NFPROTO_IPV4,
32         .hooknum    = NF_INET_LOCAL_IN,
33         .priority    = NF_IP_PRI_NAT_SRC, /* SNAT */
34     },
35 };

 

iptable_nat_ipv4_in

函数工作在PRE_ROUTING钩子点,进行DNAT转换;

1 /* PRE_ROUTING hook (DNAT): thin wrapper passing iptable_nat_do_chain to nf_nat_ipv4_in as the do_chain callback */
2 static unsigned int iptable_nat_ipv4_in(void *priv,
3                     struct sk_buff *skb,
4                     const struct nf_hook_state *state)
5 {
6     return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
7 }

 

nf_nat_ipv4_in函数在进行DNAT转换之前记录了目的地址,在进行转换之后,如果目的地址发生了改变,则需要释放skb中的路由缓存;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

 1 /* PRE_ROUTING, DNAT: run the common NAT path, then drop the skb's cached dst if the destination address changed */
 2 unsigned int
 3 nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
 4            const struct nf_hook_state *state,
 5            unsigned int (*do_chain)(void *priv,
 6                      struct sk_buff *skb,
 7                      const struct nf_hook_state *state,
 8                      struct nf_conn *ct))
 9 {
10     unsigned int ret;
11     /* Remember the destination address as it was before NAT */
12     __be32 daddr = ip_hdr(skb)->daddr;
13 
14     /* Perform the NAT (DNAT at this hook) */
15     ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
16 
17     /* Packet still ours (not dropped/stolen) and daddr was rewritten: release the cached route */
18     if (ret != NF_DROP && ret != NF_STOLEN &&
19         daddr != ip_hdr(skb)->daddr)
20         skb_dst_drop(skb);
21 
22     return ret;
23 }

 

iptable_nat_ipv4_fn

函数工作在LOCAL_IN钩子点,进行SNAT转换;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

1 /* LOCAL_IN hook (SNAT): thin wrapper passing iptable_nat_do_chain to nf_nat_ipv4_fn as the do_chain callback */
2 static unsigned int iptable_nat_ipv4_fn(void *priv,
3                     struct sk_buff *skb,
4                     const struct nf_hook_state *state)
5 {
6     return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
7 }

 

iptable_nat_ipv4_local_fn

函数工作在LOCAL_OUT钩子点,进行DNAT转换;

1 /* LOCAL_OUT hook (DNAT): thin wrapper passing iptable_nat_do_chain to nf_nat_ipv4_local_fn as the do_chain callback */
2 static unsigned int iptable_nat_ipv4_local_fn(void *priv,
3                           struct sk_buff *skb,
4                           const struct nf_hook_state *state)
5 {
6     return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
7 }

 

nf_nat_ipv4_local_fn函数在进行DNAT转换之后,如果地址发生变化,则需要重新进行路由查找;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;

 1 unsigned int /* LOCAL_OUT: run the common NAT path, then re-route if the address mapping changed */
 2 nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
 3              const struct nf_hook_state *state,
 4              unsigned int (*do_chain)(void *priv,
 5                            struct sk_buff *skb,
 6                            const struct nf_hook_state *state,
 7                            struct nf_conn *ct))
 8 {
 9     const struct nf_conn *ct;
10     enum ip_conntrack_info ctinfo;
11     unsigned int ret;
12     int err;
13 
14     /* root is playing with raw sockets. */
15     if (skb->len < sizeof(struct iphdr) ||
16         ip_hdrlen(skb) < sizeof(struct iphdr))
17         return NF_ACCEPT;
18 
19     /* Perform the NAT (DNAT at this hook) */
20     ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
21 
22     /* Packet not dropped/stolen and a conntrack entry is attached */
23     if (ret != NF_DROP && ret != NF_STOLEN &&
24         (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
25         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
26 
27         /* Destination IP in this direction differs from the source IP of the opposite direction, i.e. the address was translated */
28         if (ct->tuplehash[dir].tuple.dst.u3.ip !=
29             ct->tuplehash[!dir].tuple.src.u3.ip) {
30             /* Look up the route again; a failure turns into a drop carrying the error */
31             err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
32             if (err < 0)
33                 ret = NF_DROP_ERR(err);
34         }
35 #ifdef CONFIG_XFRM
36         else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
37              ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
38              ct->tuplehash[dir].tuple.dst.u.all !=
39              ct->tuplehash[!dir].tuple.src.u.all) {
40             err = nf_xfrm_me_harder(state->net, skb, AF_INET); /* same IP, but the non-ICMP L4 part differs and the skb is not yet xfrm-transformed */
41             if (err < 0)
42                 ret = NF_DROP_ERR(err);
43         }
44 #endif
45     }
46     return ret;
47 }

 

iptable_nat_ipv4_out

函数工作在POST_ROUTING钩子点,进行SNAT转换;

1 /* POST_ROUTING hook (SNAT): thin wrapper passing iptable_nat_do_chain to nf_nat_ipv4_out as the do_chain callback */
2 static unsigned int iptable_nat_ipv4_out(void *priv,
3                      struct sk_buff *skb,
4                      const struct nf_hook_state *state)
5 {
6     return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
7 }

 

 1 unsigned int /* POST_ROUTING: run the common NAT path; with CONFIG_XFRM, call nf_xfrm_me_harder when the source mapping changed */
 2 nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 3         const struct nf_hook_state *state,
 4         unsigned int (*do_chain)(void *priv,
 5                       struct sk_buff *skb,
 6                       const struct nf_hook_state *state,
 7                       struct nf_conn *ct))
 8 {
 9 #ifdef CONFIG_XFRM
10     const struct nf_conn *ct;
11     enum ip_conntrack_info ctinfo;
12     int err;
13 #endif
14     unsigned int ret;
15 
16     /* root is playing with raw sockets. */
17     if (skb->len < sizeof(struct iphdr) ||
18         ip_hdrlen(skb) < sizeof(struct iphdr))
19         return NF_ACCEPT;
20 
21     /* Perform the NAT (SNAT at this hook) */
22     ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
23 #ifdef CONFIG_XFRM
24     if (ret != NF_DROP && ret != NF_STOLEN &&
25         !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
26         (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
27         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
28 
29         if ((ct->tuplehash[dir].tuple.src.u3.ip !=
30              ct->tuplehash[!dir].tuple.dst.u3.ip) ||
31             (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
32              ct->tuplehash[dir].tuple.src.u.all !=
33              ct->tuplehash[!dir].tuple.dst.u.all)) {
34             err = nf_xfrm_me_harder(state->net, skb, AF_INET); /* source IP, or the non-ICMP L4 part, was translated */
35             if (err < 0)
36                 ret = NF_DROP_ERR(err);
37         }
38     }
39 #endif
40     return ret;
41 }

 

公共函数nf_nat_ipv4_fn

nf_nat_ipv4_fn完成具体的SNAT或者DNAT的转换流程,上面的四个钩子函数都会调用该函数;

 1 unsigned int /* Common NAT path for the NAT hooks: set up the mapping if needed, then rewrite the packet */
 2 nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 3            const struct nf_hook_state *state,
 4            unsigned int (*do_chain)(void *priv,
 5                     struct sk_buff *skb,
 6                     const struct nf_hook_state *state,
 7                     struct nf_conn *ct))
 8 {
 9     struct nf_conn *ct;
10     enum ip_conntrack_info ctinfo;
11     struct nf_conn_nat *nat;
12     /* maniptype == SRC for postrouting. */
13     /* DNAT or SNAT is derived from the hook: per the hook table, PRE_ROUTING and LOCAL_OUT do DNAT, LOCAL_IN and POST_ROUTING do SNAT */
14     enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
15 
16     /* Fetch the conntrack entry (nf_conn) attached to the skb */
17     ct = nf_ct_get(skb, &ctinfo);
18     /* Can't track?  It's not due to stress, or conntrack would
19      * have dropped it.  Hence it's the user's responsibility to
20      * packet filter it out, or implement conntrack/NAT for that
21      * protocol. 8) --RR
22      */
23     /* No conntrack entry: accept the packet without NAT */
24     if (!ct)
25         return NF_ACCEPT;
26 
27     /* Fetch the NAT extension of the conntrack */
28     nat = nfct_nat(ct);
29 
30     /* Dispatch on the conntrack state */
31     switch (ctinfo) {
32     /* Related connection (or ICMP error), or a reply on a related connection */
33     case IP_CT_RELATED:
34     case IP_CT_RELATED_REPLY:
35         /* ICMP packets get their own NAT translation */
36         if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
37             if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
38                                state->hook))
39                 return NF_DROP;
40             else
41                 return NF_ACCEPT;
42         }
43         /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
44     case IP_CT_NEW:
45         /* Seen it before?  This can happen for loopback, retrans,
46          * or local packets.
47          */
48         /* NAT has not been initialized yet for this manip type */
49         if (!nf_nat_initialized(ct, maniptype)) {
50             unsigned int ret;
51 
52             /* Run the rule chain via the do_chain callback */
53             ret = do_chain(priv, skb, state, ct);
54             if (ret != NF_ACCEPT)
55                 return ret;
56 
57             /* The chain already initialized NAT for this manip type: nothing more to set up */
58             if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
59                 break;
60 
61             /* Otherwise allocate a null binding for this hook */
62             ret = nf_nat_alloc_null_binding(ct, state->hook);
63             if (ret != NF_ACCEPT)
64                 return ret;
65         } 
66         /* NAT was already initialized for this manip type */
67         else {
68             pr_debug("Already setup manip %s for ct %p\n",
69                  maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
70                  ct);
71             /* Output interface changed: kill the conntrack and drop */
72             if (nf_nat_oif_changed(state->hook, ctinfo, nat,
73                            state->out))
74                 goto oif_changed;
75         }
76         break;
77 
78     default:
79         /* ESTABLISHED */
80         NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
81                  ctinfo == IP_CT_ESTABLISHED_REPLY);
82         /* Output interface changed: kill the conntrack and drop */
83         if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
84             goto oif_changed;
85     }
86 
87     /* Apply the NAT manipulation to the packet itself */
88     return nf_nat_packet(ct, ctinfo, state->hook, skb);
89 
90 oif_changed:
91     nf_ct_kill_acct(ct, ctinfo, skb);
92     return NF_DROP;
93 }

 

1 unsigned int
2 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
3 {
4     return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); /* hook number is mapped to a manip type, then delegated */
5 }

 

 1 static unsigned int /* Set up a binding whose IP range is the single address already in the reply tuple */
 2 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
 3 {
 4     /* Force range to this IP; let proto decide mapping for
 5      * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
 6      * Use reply in case it's already been mangled (eg local packet).
 7      */
 8     /* Take the address from the reply-direction tuple: its dst for a SRC manip, its src for a DST manip */
 9     union nf_inet_addr ip =
10         (manip == NF_NAT_MANIP_SRC ?
11         ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
12         ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
13 
14     /* Build a range containing only that address (min == max) */
15     struct nf_nat_range range = {
16         .flags        = NF_NAT_RANGE_MAP_IPS,
17         .min_addr    = ip,
18         .max_addr    = ip,
19     };
20 
21     /* Set up the NAT info for the conntrack with this range */
22     return nf_nat_setup_info(ct, &range, manip);
23 }

 

 1 unsigned int /* Choose the NAT mapping for ct within range and record it in the conntrack; returns NF_ACCEPT or NF_DROP */
 2 nf_nat_setup_info(struct nf_conn *ct,
 3           const struct nf_nat_range *range,
 4           enum nf_nat_manip_type maniptype)
 5 {
 6     struct nf_conntrack_tuple curr_tuple, new_tuple;
 7 
 8     /* Can't setup nat info for confirmed ct. */
 9     /* Already confirmed: return NF_ACCEPT without touching it */
10     if (nf_ct_is_confirmed(ct))
11         return NF_ACCEPT;
12 
13     NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
14              maniptype == NF_NAT_MANIP_DST);
15     BUG_ON(nf_nat_initialized(ct, maniptype));
16 
17     /* What we've got will look like inverse of reply. Normally
18      * this is what is in the conntrack, except for prior
19      * manipulations (future optimization: if num_manips == 0,
20      * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
21      */
22     /* Derive the current tuple by inverting the reply tuple */
23     nf_ct_invert_tuplepr(&curr_tuple,
24                  &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
25 
26     /* Compute the post-NAT tuple from the current tuple and the range */
27     get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
28 
29     /* The tuple after NAT differs from the one before */
30     if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
31         struct nf_conntrack_tuple reply;
32 
33         /* Alter conntrack table so will recognize replies. */
34         /* Invert the new tuple to obtain the expected reply tuple */
35         nf_ct_invert_tuplepr(&reply, &new_tuple);
36         /* Install it as the conntrack's reply tuple */
37         nf_conntrack_alter_reply(ct, &reply);
38 
39         /* At this point the tuples look like the following */
40         /*
41             // Internal host 10.1 reaches 200.1 through 100.1; tuples after SNAT:
42             tuple SNAT(10.1->200.1, 200.1->100.1)
43
44             // External host 300.1 reaches 20.1 through 100.1; tuples after DNAT:
45             tuple DNAT(300.1->100.1, 20.1->300.1)
46         */
47 
48         /* Non-atomic: we own this at the moment. */
49         /* Mark the conntrack as requiring source or destination NAT */
50         if (maniptype == NF_NAT_MANIP_SRC)
51             ct->status |= IPS_SRC_NAT;
52         else
53             ct->status |= IPS_DST_NAT;
54 
55         /* A helper is attached: add the seqadj extension, drop if that fails */
56         if (nfct_help(ct))
57             if (!nfct_seqadj_ext_add(ct))
58                 return NF_DROP;
59     }
60 
61     /* SNAT */
62     if (maniptype == NF_NAT_MANIP_SRC) {
63         struct nf_nat_conn_key key = {
64             .net = nf_ct_net(ct),
65             .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
66             .zone = nf_ct_zone(ct),
67         };
68         int err;
69 
70         /* Insert into nf_nat_bysource_table, keyed by net/original tuple/zone; drop on failure */
71         err = rhltable_insert_key(&nf_nat_bysource_table,
72                       &key,
73                       &ct->nat_bysource,
74                       nf_nat_bysource_params);
75         if (err)
76             return NF_DROP;
77     }
78 
79     /* It's done. */
80     /* Record that NAT setup is complete for this manip type */
81     if (maniptype == NF_NAT_MANIP_DST)
82         ct->status |= IPS_DST_NAT_DONE;
83     else
84         ct->status |= IPS_SRC_NAT_DONE;
85 
86     return NF_ACCEPT;
87 }

 

 1 /* Compute the post-NAT tuple from orig_tuple and range; the result is written to *tuple */
 2 static void
 3 get_unique_tuple(struct nf_conntrack_tuple *tuple,
 4          const struct nf_conntrack_tuple *orig_tuple,
 5          const struct nf_nat_range *range,
 6          struct nf_conn *ct,
 7          enum nf_nat_manip_type maniptype)
 8 {
 9     const struct nf_conntrack_zone *zone;
10     const struct nf_nat_l3proto *l3proto;
11     const struct nf_nat_l4proto *l4proto;
12     struct net *net = nf_ct_net(ct);
13 
14     zone = nf_ct_zone(ct);
15 
16     rcu_read_lock();
17 
18     /* Look up the l3proto and l4proto NAT handlers */
19     l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
20     l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
21                     orig_tuple->dst.protonum);
22 
23     /* 1) If this srcip/proto/src-proto-part is currently mapped,
24      * and that same mapping gives a unique tuple within the given
25      * range, use that.
26      *
27      * This is only required for source (ie. NAT/masq) mappings.
28      * So far, we don't do local source mappings, so multiple
29      * manips not an issue.
30      */
31     /* SNAT and the RANDOM_ALL flag is not set */
32     if (maniptype == NF_NAT_MANIP_SRC &&
33         !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
34         /* try the original tuple first */
35         /* Check whether orig_tuple already lies within the range */
36         if (in_range(l3proto, l4proto, orig_tuple, range)) {
37             /* The tuple is not in use yet */
38             if (!nf_nat_used_tuple(orig_tuple, ct)) {
39                 /* Keep the original tuple unchanged */
40                 *tuple = *orig_tuple;
41                 goto out;
42             }
43         }
44         /* orig_tuple is out of range: look for an existing source mapping that fits the range */
45         else if (find_appropriate_src(net, zone, l3proto, l4proto,
46                         orig_tuple, tuple, range)) {
47             pr_debug("get_unique_tuple: Found current src map\n");
48             /* The tuple is not in use yet */
49             if (!nf_nat_used_tuple(tuple, ct))
50                 goto out;
51         }
52     }
53 
54     /* Start from orig_tuple and pick the least-used combination in the given range */
55     /* 2) Select the least-used IP/proto combination in the given range */
56     *tuple = *orig_tuple;
57     find_best_ips_proto(zone, tuple, range, ct, maniptype);
58 
59     /* 3) The per-protocol part of the manip is made to map into
60      * the range to make a unique tuple.
61      */
62 
63     /* Only bother mapping if it's not already in range and unique */
64     /* The RANDOM_ALL flag is not set */
65     if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
66         /* SPECIFIED flag set: check the per-protocol part (e.g. port) */
67         if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
68             /* Already inside the proto range && (min == max || tuple unused) */
69             if (l4proto->in_range(tuple, maniptype,
70                           &range->min_proto,
71                           &range->max_proto) &&
72                 (range->min_proto.all == range->max_proto.all ||
73                  !nf_nat_used_tuple(tuple, ct)))
74                 goto out;
75         } 
76         /* No SPECIFIED flag: keep the proto part as is when the tuple is unused */
77         else if (!nf_nat_used_tuple(tuple, ct)) {
78             goto out;
79         }
80     }
81 
82     /* Last change: get protocol to try to obtain unique tuple. */
83     /* Let the L4 protocol choose a proto part that makes the tuple unique */
84     l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
85 out:
86     rcu_read_unlock();
87 }

 

 1 unsigned int nf_nat_packet(struct nf_conn *ct, /* Rewrite skb per the NAT state in ct; NF_DROP if the rewrite fails, else NF_ACCEPT */
 2                enum ip_conntrack_info ctinfo,
 3                unsigned int hooknum,
 4                struct sk_buff *skb)
 5 {
 6     const struct nf_nat_l3proto *l3proto;
 7     const struct nf_nat_l4proto *l4proto;
 8     /* Direction of this packet within the connection */
 9     enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
10     unsigned long statusbit;
11     /* Whether this hook performs SNAT or DNAT */
12     enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
13 
14     /* Pick the status bit matching the manip type */
15     if (mtype == NF_NAT_MANIP_SRC)
16         statusbit = IPS_SRC_NAT;
17     else
18         statusbit = IPS_DST_NAT;
19 
20     /* Invert if this is reply dir. */
21     /* In the reply direction the bit is flipped within IPS_NAT_MASK */
22     if (dir == IP_CT_DIR_REPLY)
23         statusbit ^= IPS_NAT_MASK;
24 
25     /* Non-atomic: these bits don't change. */
26 
27     /* The conntrack requires this kind of NAT */
28     if (ct->status & statusbit) {
29         struct nf_conntrack_tuple target;
30 
31         /* We are aiming to look like inverse of other direction. */
32         /* Build the target tuple by inverting the opposite direction's tuple */
33         /*
34             // Internal host 10.1 reaches 200.1 through 100.1; tuples after SNAT:
35             tuple SNAT(10.1->200.1, 200.1->100.1)
36
37             // External host 300.1 reaches 20.1 through 100.1; tuples after DNAT:
38             tuple DNAT(300.1->100.1, 20.1->300.1)
39         */
40         nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
41 
42         /* Look up the l3proto and l4proto NAT handlers */
43         l3proto = __nf_nat_l3proto_find(target.src.l3num);
44         l4proto = __nf_nat_l4proto_find(target.src.l3num,
45                         target.dst.protonum);
46 
47         /* Rewrite the packet to match the target tuple; drop if that fails */
48         if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
49             return NF_DROP;
50     }
51     return NF_ACCEPT;
52 }

 

posted @ 2019-10-28 21:41  AlexAlex  阅读(1549)  评论(0)  编辑  收藏  举报