Netfilter 之 iptable_nat
初始化
iptable_nat_table_init函数通过调用ipt_register_table完成NAT表的注册和钩子函数的注册;该注册流程与iptable_filter的注册流程一致,此处不再重复分析,详情请参见《iptable_filter分析》;
1 static int __net_init iptable_nat_table_init(struct net *net) 2 { 3 struct ipt_replace *repl; 4 int ret; 5 6 /* nat表已经初始化过 */ 7 if (net->ipv4.nat_table) 8 return 0; 9 10 /* 分配初始化表,用于下面的注册 */ 11 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); 12 if (repl == NULL) 13 return -ENOMEM; 14 /* 表注册,钩子函数注册 */ 15 ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, 16 nf_nat_ipv4_ops, &net->ipv4.nat_table); 17 kfree(repl); 18 return ret; 19 }
钩子函数分析
钩子函数以及钩子点
nf_nat_ipv4_ops是NAT相关钩子函数的数组,其调用顺序和钩子点见下面注释;其中filter工作在DNAT和SNAT之间;
这几个钩子函数都会调用nf_nat_ipv4_fn来完成NAT转换,本部分最后统一分析该函数;
1 /* 钩子函数数组 */ 2 /* 顺序 DNAT->filter->SNAT */ 3 /* 输入本机 PRE_ROUTING(DNAT)->LOCAL_IN(SNAT) */ 4 /* 转发 PRE_ROUTING(DNAT)->POST_ROUTING(SNAT) */ 5 /* 本机输出 LOCAL_OUT(DNAT)->POST_ROUTING(SNAT) */ 6 static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { 7 /* Before packet filtering, change destination */ 8 { 9 .hook = iptable_nat_ipv4_in, 10 .pf = NFPROTO_IPV4, 11 .hooknum = NF_INET_PRE_ROUTING, 12 .priority = NF_IP_PRI_NAT_DST, /* DNAT */ 13 }, 14 /* After packet filtering, change source */ 15 { 16 .hook = iptable_nat_ipv4_out, 17 .pf = NFPROTO_IPV4, 18 .hooknum = NF_INET_POST_ROUTING, 19 .priority = NF_IP_PRI_NAT_SRC, /* SNAT */ 20 }, 21 /* Before packet filtering, change destination */ 22 { 23 .hook = iptable_nat_ipv4_local_fn, 24 .pf = NFPROTO_IPV4, 25 .hooknum = NF_INET_LOCAL_OUT, 26 .priority = NF_IP_PRI_NAT_DST, /* DNAT */ 27 }, 28 /* After packet filtering, change source */ 29 { 30 .hook = iptable_nat_ipv4_fn, 31 .pf = NFPROTO_IPV4, 32 .hooknum = NF_INET_LOCAL_IN, 33 .priority = NF_IP_PRI_NAT_SRC, /* SNAT */ 34 }, 35 };
iptable_nat_ipv4_in
函数工作在PRE_ROUTING钩子点,进行DNAT转换;
1 /* PRE_ROUTING,DNAT */ 2 static unsigned int iptable_nat_ipv4_in(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); 7 }
nf_nat_ipv4_in函数在进行DNAT转换之前记录了目的地址,在进行转换之后,如果目的地址发生了改变,则需要释放skb中的路由缓存;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;
1 /* PRE_ROUTING, DNAT */ 2 unsigned int 3 nf_nat_ipv4_in(void *priv, struct sk_buff *skb, 4 const struct nf_hook_state *state, 5 unsigned int (*do_chain)(void *priv, 6 struct sk_buff *skb, 7 const struct nf_hook_state *state, 8 struct nf_conn *ct)) 9 { 10 unsigned int ret; 11 /* 获取目的地址 */ 12 __be32 daddr = ip_hdr(skb)->daddr; 13 14 /* DNAT转换 */ 15 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 16 17 /* 转换之后,目的地址发生变化,释放路由缓存 */ 18 if (ret != NF_DROP && ret != NF_STOLEN && 19 daddr != ip_hdr(skb)->daddr) 20 skb_dst_drop(skb); 21 22 return ret; 23 }
iptable_nat_ipv4_fn
函数工作在LOCAL_IN钩子点,进行SNAT转换;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;
1 /* LOCAL_IN,SNAT */ 2 static unsigned int iptable_nat_ipv4_fn(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); 7 }
iptable_nat_ipv4_local_fn
函数工作在LOCAL_OUT钩子点,进行DNAT转换;
1 /* LOCAL_OUT,DNAT */ 2 static unsigned int iptable_nat_ipv4_local_fn(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); 7 }
nf_nat_ipv4_local_fn函数在进行DNAT转换之后,如果地址发生变化,则需要重新进行路由查找;NAT转换过程调用nf_nat_ipv4_fn完成,步骤见下面的该函数分析;
1 unsigned int 2 nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 const struct nf_conn *ct; 10 enum ip_conntrack_info ctinfo; 11 unsigned int ret; 12 int err; 13 14 /* root is playing with raw sockets. */ 15 if (skb->len < sizeof(struct iphdr) || 16 ip_hdrlen(skb) < sizeof(struct iphdr)) 17 return NF_ACCEPT; 18 19 /* DNAT转换 */ 20 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 21 22 /* 转换成功 */ 23 if (ret != NF_DROP && ret != NF_STOLEN && 24 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 25 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 26 27 /* ip地址发生变化 */ 28 if (ct->tuplehash[dir].tuple.dst.u3.ip != 29 ct->tuplehash[!dir].tuple.src.u3.ip) { 30 /* 重新查路由 */ 31 err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); 32 if (err < 0) 33 ret = NF_DROP_ERR(err); 34 } 35 #ifdef CONFIG_XFRM 36 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 37 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && 38 ct->tuplehash[dir].tuple.dst.u.all != 39 ct->tuplehash[!dir].tuple.src.u.all) { 40 err = nf_xfrm_me_harder(state->net, skb, AF_INET); 41 if (err < 0) 42 ret = NF_DROP_ERR(err); 43 } 44 #endif 45 } 46 return ret; 47 }
iptable_nat_ipv4_out
函数工作在POST_ROUTING钩子点,进行SNAT转换;
1 /* POST_ROUTING,SNAT */ 2 static unsigned int iptable_nat_ipv4_out(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); 7 }
1 unsigned int 2 nf_nat_ipv4_out(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 #ifdef CONFIG_XFRM 10 const struct nf_conn *ct; 11 enum ip_conntrack_info ctinfo; 12 int err; 13 #endif 14 unsigned int ret; 15 16 /* root is playing with raw sockets. */ 17 if (skb->len < sizeof(struct iphdr) || 18 ip_hdrlen(skb) < sizeof(struct iphdr)) 19 return NF_ACCEPT; 20 21 /* SNAT转换 */ 22 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 23 #ifdef CONFIG_XFRM 24 if (ret != NF_DROP && ret != NF_STOLEN && 25 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 26 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 27 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 28 29 if ((ct->tuplehash[dir].tuple.src.u3.ip != 30 ct->tuplehash[!dir].tuple.dst.u3.ip) || 31 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && 32 ct->tuplehash[dir].tuple.src.u.all != 33 ct->tuplehash[!dir].tuple.dst.u.all)) { 34 err = nf_xfrm_me_harder(state->net, skb, AF_INET); 35 if (err < 0) 36 ret = NF_DROP_ERR(err); 37 } 38 } 39 #endif 40 return ret; 41 }
公共函数nf_nat_ipv4_fn
nf_nat_ipv4_fn完成具体的SNAT或者DNAT的转换流程,上面的四个钩子函数都会调用该函数;
1 unsigned int 2 nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 struct nf_conn *ct; 10 enum ip_conntrack_info ctinfo; 11 struct nf_conn_nat *nat; 12 /* maniptype == SRC for postrouting. */ 13 /* 获取是进行DNAT还是SNAT,其中PRE_ROUTING和LOCAL_OUT进行DNAT,LOCAL_IN和POST_ROUTING进行SNAT */ 14 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 15 16 /* 获取skb关联的连接跟踪sf_conn */ 17 ct = nf_ct_get(skb, &ctinfo); 18 /* Can't track? It's not due to stress, or conntrack would 19 * have dropped it. Hence it's the user's responsibilty to 20 * packet filter it out, or implement conntrack/NAT for that 21 * protocol. 8) --RR 22 */ 23 /* 没有,返回accpet */ 24 if (!ct) 25 return NF_ACCEPT; 26 27 /* 获取NAT扩展 */ 28 nat = nfct_nat(ct); 29 30 /* 判断连接跟踪状态 */ 31 switch (ctinfo) { 32 /* 关联连接(或者icmp错误)或者关联连接的应答 */ 33 case IP_CT_RELATED: 34 case IP_CT_RELATED_REPLY: 35 /* icmp协议的NAT操作 */ 36 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 37 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, 38 state->hook)) 39 return NF_DROP; 40 else 41 return NF_ACCEPT; 42 } 43 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ 44 case IP_CT_NEW: 45 /* Seen it before? This can happen for loopback, retrans, 46 * or local packets. 47 */ 48 /* 尚未进行过NAT转换 */ 49 if (!nf_nat_initialized(ct, maniptype)) { 50 unsigned int ret; 51 52 /* 进行规则匹配 */ 53 ret = do_chain(priv, skb, state, ct); 54 if (ret != NF_ACCEPT) 55 return ret; 56 57 /* 打NAT转换标记 */ 58 if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) 59 break; 60 61 /* 连接跟踪进行NAT */ 62 ret = nf_nat_alloc_null_binding(ct, state->hook); 63 if (ret != NF_ACCEPT) 64 return ret; 65 } 66 /* 进行过NAT转换 */ 67 else { 68 pr_debug("Already setup manip %s for ct %p\n", 69 maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", 70 ct); 71 /* 出接口发生改变 */ 72 if (nf_nat_oif_changed(state->hook, ctinfo, nat, 73 state->out)) 74 goto oif_changed; 75 } 76 break; 77 78 default: 79 /* ESTABLISHED */ 80 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || 81 ctinfo == IP_CT_ESTABLISHED_REPLY); 82 /* 出接口发生改变 */ 83 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) 84 goto oif_changed; 85 } 86 87 /* skb数据包进行NAT转换修改 */ 88 return nf_nat_packet(ct, ctinfo, state->hook, skb); 89 90 oif_changed: 91 nf_ct_kill_acct(ct, ctinfo, skb); 92 return NF_DROP; 93 }
1 unsigned int 2 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 3 { 4 return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); 5 }
1 static unsigned int 2 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) 3 { 4 /* Force range to this IP; let proto decide mapping for 5 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 6 * Use reply in case it's already been mangled (eg local packet). 7 */ 8 /* 使用应答方向的ip地址,LOCAL_OUT会先经过mangle,可能改变了 */ 9 union nf_inet_addr ip = 10 (manip == NF_NAT_MANIP_SRC ? 11 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : 12 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); 13 14 /* 设置range */ 15 struct nf_nat_range range = { 16 .flags = NF_NAT_RANGE_MAP_IPS, 17 .min_addr = ip, 18 .max_addr = ip, 19 }; 20 21 /* 进行NAT转换 */ 22 return nf_nat_setup_info(ct, &range, manip); 23 }
1 unsigned int 2 nf_nat_setup_info(struct nf_conn *ct, 3 const struct nf_nat_range *range, 4 enum nf_nat_manip_type maniptype) 5 { 6 struct nf_conntrack_tuple curr_tuple, new_tuple; 7 8 /* Can't setup nat info for confirmed ct. */ 9 /* 已经确认的,返回accpet */ 10 if (nf_ct_is_confirmed(ct)) 11 return NF_ACCEPT; 12 13 NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || 14 maniptype == NF_NAT_MANIP_DST); 15 BUG_ON(nf_nat_initialized(ct, maniptype)); 16 17 /* What we've got will look like inverse of reply. Normally 18 * this is what is in the conntrack, except for prior 19 * manipulations (future optimization: if num_manips == 0, 20 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 21 */ 22 /* 从应答tuple反向得到当前tuple */ 23 nf_ct_invert_tuplepr(&curr_tuple, 24 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 25 26 /* 根据当前tuple和range得到NAT转换之后的的tuple */ 27 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); 28 29 /* NAT转换之后和之前的tuple不同 */ 30 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { 31 struct nf_conntrack_tuple reply; 32 33 /* Alter conntrack table so will recognize replies. */ 34 /* 通过新tuple得到reply_tuple */ 35 nf_ct_invert_tuplepr(&reply, &new_tuple); 36 /* 加入到reply hash */ 37 nf_conntrack_alter_reply(ct, &reply); 38 39 /* 此时tuple类似如下 */ 40 /* 41 //内网10.1通过100.1访问200.1,经过SNAT之后得到tuple 42 tuple SNAT(10.1->200.1, 200.1->100.1) 43 44 //外网300.1通过100.1访问20.1,经过DNAT之后,得到tuple 45 tuple DNAT(300.1->100.1, 20.1->300.1) 46 */ 47 48 /* Non-atomic: we own this at the moment. 
*/ 49 /* 更新状态需要做NAT */ 50 if (maniptype == NF_NAT_MANIP_SRC) 51 ct->status |= IPS_SRC_NAT; 52 else 53 ct->status |= IPS_DST_NAT; 54 55 /* 扩展项的调整 */ 56 if (nfct_help(ct)) 57 if (!nfct_seqadj_ext_add(ct)) 58 return NF_DROP; 59 } 60 61 /* SNAT */ 62 if (maniptype == NF_NAT_MANIP_SRC) { 63 struct nf_nat_conn_key key = { 64 .net = nf_ct_net(ct), 65 .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 66 .zone = nf_ct_zone(ct), 67 }; 68 int err; 69 70 /* 加入到nf_nat_bysource_table */ 71 err = rhltable_insert_key(&nf_nat_bysource_table, 72 &key, 73 &ct->nat_bysource, 74 nf_nat_bysource_params); 75 if (err) 76 return NF_DROP; 77 } 78 79 /* It's done. */ 80 /* NAT转换完成 */ 81 if (maniptype == NF_NAT_MANIP_DST) 82 ct->status |= IPS_DST_NAT_DONE; 83 else 84 ct->status |= IPS_SRC_NAT_DONE; 85 86 return NF_ACCEPT; 87 }
1 /* 根据orig_tuple和range得到NAT转换之后的tuple */ 2 static void 3 get_unique_tuple(struct nf_conntrack_tuple *tuple, 4 const struct nf_conntrack_tuple *orig_tuple, 5 const struct nf_nat_range *range, 6 struct nf_conn *ct, 7 enum nf_nat_manip_type maniptype) 8 { 9 const struct nf_conntrack_zone *zone; 10 const struct nf_nat_l3proto *l3proto; 11 const struct nf_nat_l4proto *l4proto; 12 struct net *net = nf_ct_net(ct); 13 14 zone = nf_ct_zone(ct); 15 16 rcu_read_lock(); 17 18 /* 查找l3proto和l4proto */ 19 l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); 20 l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, 21 orig_tuple->dst.protonum); 22 23 /* 1) If this srcip/proto/src-proto-part is currently mapped, 24 * and that same mapping gives a unique tuple within the given 25 * range, use that. 26 * 27 * This is only required for source (ie. NAT/masq) mappings. 28 * So far, we don't do local source mappings, so multiple 29 * manips not an issue. 30 */ 31 /* SNAT && 没有打RANDOM_ALL标记 */ 32 if (maniptype == NF_NAT_MANIP_SRC && 33 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 34 /* try the original tuple first */ 35 /* 查看orig_tuple是否满足范围要求 */ 36 if (in_range(l3proto, l4proto, orig_tuple, range)) { 37 /* tuple尚未被使用 */ 38 if (!nf_nat_used_tuple(orig_tuple, ct)) { 39 /* 使用原tuple */ 40 *tuple = *orig_tuple; 41 goto out; 42 } 43 } 44 /* ori_range不满足要求,则从bysource_table中查找一个满足范围的tuple */ 45 else if (find_appropriate_src(net, zone, l3proto, l4proto, 46 orig_tuple, tuple, range)) { 47 pr_debug("get_unique_tuple: Found current src map\n"); 48 /* tuple尚未被使用 */ 49 if (!nf_nat_used_tuple(tuple, ct)) 50 goto out; 51 } 52 } 53 54 /* 从给定range中选择一个最少使用的组合 */ 55 /* 2) Select the least-used IP/proto combination in the given range */ 56 *tuple = *orig_tuple; 57 find_best_ips_proto(zone, tuple, range, ct, maniptype); 58 59 /* 3) The per-protocol part of the manip is made to map into 60 * the range to make a unique tuple. 
61 */ 62 63 /* Only bother mapping if it's not already in range and unique */ 64 /* 没有打RANDOM_ALL标记 */ 65 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 66 /* 有SPECIFIED标记,对端口号进行检查 */ 67 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 68 /* 端口号已经在范围之内&&(端口最小最大范围相等||tuple没有使用) */ 69 if (l4proto->in_range(tuple, maniptype, 70 &range->min_proto, 71 &range->max_proto) && 72 (range->min_proto.all == range->max_proto.all || 73 !nf_nat_used_tuple(tuple, ct))) 74 goto out; 75 } 76 /* 没有SPECIFIED标记,端口号不变,tuple没有被使用 */ 77 else if (!nf_nat_used_tuple(tuple, ct)) { 78 goto out; 79 } 80 } 81 82 /* Last change: get protocol to try to obtain unique tuple. */ 83 /* 随机选择端口号 */ 84 l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); 85 out: 86 rcu_read_unlock(); 87 }
1 unsigned int nf_nat_packet(struct nf_conn *ct, 2 enum ip_conntrack_info ctinfo, 3 unsigned int hooknum, 4 struct sk_buff *skb) 5 { 6 const struct nf_nat_l3proto *l3proto; 7 const struct nf_nat_l4proto *l4proto; 8 /* 获取方向 */ 9 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 10 unsigned long statusbit; 11 /* 获取进行SNAT还是DNAT */ 12 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); 13 14 /* 设置NAT标记 */ 15 if (mtype == NF_NAT_MANIP_SRC) 16 statusbit = IPS_SRC_NAT; 17 else 18 statusbit = IPS_DST_NAT; 19 20 /* Invert if this is reply dir. */ 21 /* 应答方向需要取反 */ 22 if (dir == IP_CT_DIR_REPLY) 23 statusbit ^= IPS_NAT_MASK; 24 25 /* Non-atomic: these bits don't change. */ 26 27 /* 需要做NAT */ 28 if (ct->status & statusbit) { 29 struct nf_conntrack_tuple target; 30 31 /* We are aiming to look like inverse of other direction. */ 32 /* 获取目标tuple */ 33 /* 34 //内网10.1通过100.1访问200.1,经过SNAT之后得到tuple 35 tuple SNAT(10.1->200.1, 200.1->100.1) 36 37 //外网300.1通过100.1访问20.1,经过DNAT之后,得到tuple 38 tuple DNAT(300.1->100.1, 20.1->300.1) 39 */ 40 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); 41 42 /* 获取l3proto,l4proto */ 43 l3proto = __nf_nat_l3proto_find(target.src.l3num); 44 l4proto = __nf_nat_l4proto_find(target.src.l3num, 45 target.dst.protonum); 46 47 /* 将ip地址和端口的NAT转换结果写入skb */ 48 if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) 49 return NF_DROP; 50 } 51 return NF_ACCEPT; 52 }