Linux xtables
Linux有各种XXtables, 比如 iptables ebtables arptables
用户空间:
iptables ebtables arptables这些用户层的工具会调用setsockopt/getsockopt来和内核通信
nf_sockopts是在iptables进行初始化时通过nf_register_sockopt()函数生成的一个struct nf_sockopt_ops结构
对于ipv4来说,在net/ipv4/netfilter/ip_tables.c中定义了一个ipt_sockopts变量(struct nf_sockopt_ops),其中的set操作指定为do_ipt_set_ctl(),因此,当nf_sockopt()调用对应的set操作时,控制将转入net/ipv4/netfilter/ip_tables.c::do_ipt_set_ctl()中。
对于IPT_SO_SET_REPLACE命令,do_ipt_set_ctl()调用do_replace()来处理,该函数将用户层传入的struct ipt_replace和struct ipt_entry组织到filter(根据struct ipt_replace::name项)表的hook_entry[NF_IP_FORWARD]所指向的区域,如果是添加规则,结果将是filter表的private(struct ipt_table_info)项的hook_entry[NF_IP_FORWARD]和underflow[NF_IP_FORWARD]的差值扩大(用于容纳该规则),private->number加1。
内核空间:
内核版本:3.18.14
结构体struct nf_sockopt_ops
把nf_sockopt_ops注册到全局的链表中(以ipt_sockopts为例)
static int __init ip_tables_init(void) { int ret; ret = register_pernet_subsys(&ip_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); if (ret < 0) goto err2; ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); if (ret < 0) goto err4; /* Register setsockopt */ ret = nf_register_sockopt(&ipt_sockopts); if (ret < 0) goto err5; pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); err4: xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); err2: unregister_pernet_subsys(&ip_tables_net_ops); err1: return ret; }
struct ipt_sockopts定义:
static struct nf_sockopt_ops ipt_sockopts = { .pf = PF_INET, .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, #ifdef CONFIG_COMPAT .compat_set = compat_do_ipt_set_ctl, #endif .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, #ifdef CONFIG_COMPAT .compat_get = compat_do_ipt_get_ctl, #endif .owner = THIS_MODULE, };
对于IPT_SO_SET_REPLACE命令,do_ipt_set_ctl()调用do_replace()来处理,该函数将用户层传入的struct ipt_replace和struct ipt_entry组织到filter(根据struct ipt_replace::name项)表的hook_entry[NF_IP_FORWARD]所指向的区域,如果是添加规则,结果将是filter表的private(struct ipt_table_info)项的hook_entry[NF_IP_FORWARD]和underflow[NF_IP_FORWARD]的差值扩大(用于容纳该规则),private->number加1。
iptables上有四张表(filter、mangle、raw和nat)
分别在
net/ipv4/netfilter/iptable_filter.c
net/ipv4/netfilter/iptable_mangle.c
net/ipv4/netfilter/iptable_nat.c
net/ipv4/netfilter/iptable_raw.c
以其中filter表为例:
在内核文件net/ipv4/netfilter/iptable_filter.c中:
static int __net_init iptable_filter_net_init(struct net *net) { struct ipt_replace *repl; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl); kfree(repl); return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); } ... static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .exit = iptable_filter_net_exit, }; static int __init iptable_filter_init(void) { int ret; ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) return ret; /* Register hooks */ filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); if (IS_ERR(filter_ops)) { ret = PTR_ERR(filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); } return ret; }
调用ipt_register_table注册了一个struct xt_table,那我们介绍一下struct xtable
/* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; };
再看窗帘后面的男人: strcut xt_table_info
/* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; unsigned int __percpu *stackptr; void ***jumpstack; /* ipt_entry tables: one per CPU */ /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ void *entries[1]; };
刚刚说啥来着:setsockopt最后把用户配置的规则加在xt_table_info的hook_entry中...
回到上面出现的 xt_hook_link 函数
先看 packet_filter 的定义和 iptable_filter_hook 的定义
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, }; static unsigned int iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_filter); }
再看xt_hook_link函数的实现
/** * xt_hook_link - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * * This function will take care of creating and registering the necessary * Netfilter hooks for XT tables. */ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; int ret; ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; hook_mask >>= 1, ++hooknum) { if (!(hook_mask & 1)) continue; ops[i].hook = fn; ops[i].owner = table->me; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; ++i; } ret = nf_register_hooks(ops, num_hooks); if (ret < 0) { kfree(ops); return ERR_PTR(ret); } return ops; }
是的,在这里调用了nf_register_hooks,注册了一堆hook,请关注这个filter表的NFPROTO_IPV4协议族中,注册点为:FILTER_VALID_HOOKS
也就是说在 FILTER_VALID_HOOKS 包含的三个HOOK点上都注册了同样的一个hook函数 ip_filter_hook
这个iptable_filter_hook最终调用 ipt_do_table
至于skb的处理怎么进入到这些hook函数,有很多高手写的很清楚,这里给个链接:
来看这个ipt_do_table
net/ipv4/netfilter/ip_tables.c文件中:
带着这个重点去看这个函数:struct xt_table结构体中有一个躲在窗帘后的男人
/* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; unsigned int *stackptr, origptr, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ ip = ip_hdr(skb); indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; acpar.in = in; acpar.out = out; acpar.family = NFPROTO_IPV4; acpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); /* * Ensure we load private-> members after we've fetched the base * pointer. */ smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; e = get_entry(table_base, private->hook_entry[hook]); pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", table->name, hook, origptr, get_entry(table_base, private->underflow[hook])); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, &e->ip, acpar.fragoff)) { no_match: e = ipt_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } ADD_COUNTER(e->counters, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(skb, hook, in, out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { e = jumpstack[--*stackptr]; pr_debug("Pulled %p out from pos %u\n", e, *stackptr); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { if (*stackptr >= private->stacksize) { verdict = NF_DROP; break; } jumpstack[(*stackptr)++] = e; pr_debug("Pushed %p into pos %u\n", e, *stackptr - 1); } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); /* Target might have changed stuff. */ ip = ip_hdr(skb); if (verdict == XT_CONTINUE) e = ipt_next_entry(e); else /* Verdict */ break; } while (!acpar.hotdrop); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; xt_write_recseq_end(addend); local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else if (acpar.hotdrop) return NF_DROP; else return verdict; #endif }
这里只是给出了大概的工作原理,细节都没有研究到,后续更新...