网桥一二三
网桥?交换机的老爸是也。当然,一代要比一代强,交换机是一种多端口,自适应,再加上各种其他更好的性能,但这些形容词之后,仍然只是个“网桥”。
网桥涉及的协议主要是 STP(生成树协议)。由网桥、终端等构成的网络,在拓扑上是一张图,其中很可能存在环路。数据包当然不能无休止地在环路里转圈,所以需要在逻辑上把这张图修剪成无环路的树型网络。
首先从init开始:
/*
 * Module init for the bridge: registers each bridge subsystem in turn and
 * unwinds the ones already registered if a later step fails.
 *
 * Returns 0 on success or the negative error code of the failing step.
 */
static int __init br_init(void)
{
	int err;

	err = stp_proto_register(&br_stp_proto);
	if (err < 0) {
		pr_err("bridge: can't register sap for STP\n");
		return err;
	}

	err = br_fdb_init();	/* kmem_cache_create */
	if (err)
		goto err_out;

	err = register_pernet_subsys(&br_net_ops);
	if (err)
		goto err_out1;

	/* netfilter handling for the bridge */
	err = br_netfilter_init();
	if (err)
		goto err_out2;

	/* register on the netdev_chain notifier list */
	err = register_netdevice_notifier(&br_device_notifier);
	if (err)
		goto err_out3;

	err = br_netlink_init();
	if (err)
		goto err_out4;

	/* handler invoked for bridge ioctls coming from user space; see <b> */
	brioctl_set(br_ioctl_deviceless_stub);

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
	br_fdb_test_addr_hook = br_fdb_test_addr;
#endif

	return 0;

	/* Undo the successful registrations in reverse order. */
err_out4:
	unregister_netdevice_notifier(&br_device_notifier);
err_out3:
	br_netfilter_fini();
err_out2:
	unregister_pernet_subsys(&br_net_ops);
err_out1:
	br_fdb_fini();
err_out:
	stp_proto_unregister(&br_stp_proto);
	return err;
}
<b>
/*
 * Handle bridge ioctls that are not addressed to a specific bridge device.
 *
 * SIOCGIFBR/SIOCSIFBR are passed to old_deviceless(). SIOCBRADDBR and
 * SIOCBRDELBR copy an interface name from user space and create or delete
 * the bridge of that name.
 *
 * Returns the result of the dispatched helper, -EPERM without
 * CAP_NET_ADMIN, -EFAULT if the name cannot be copied, or -EOPNOTSUPP for
 * any other command.
 */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
	switch (cmd) {
	case SIOCGIFBR:
	case SIOCSIFBR:
		return old_deviceless(net, uarg);

	case SIOCBRADDBR:	/* create a bridge */
	case SIOCBRDELBR:	/* delete a bridge */
	{
		char buf[IFNAMSIZ];

		if (!capable(CAP_NET_ADMIN))
			return -EPERM;

		if (copy_from_user(buf, uarg, IFNAMSIZ))
			return -EFAULT;

		/* Force NUL termination of the user-supplied name. */
		buf[IFNAMSIZ - 1] = 0;
		if (cmd == SIOCBRADDBR)
			return br_add_bridge(net, buf);	/* see <c> */

		return br_del_bridge(net, buf);
	}
	}
	return -EOPNOTSUPP;
}
<c>
int br_add_bridge(struct net *net, constchar*name)
{
struct net_device *dev;
int ret;
//为虚拟桥新建一个net_device
dev = new_bridge_dev(net, name); //-->d:
if (!dev)
return-ENOMEM;
rtnl_lock();
if (strchr(dev->name, '%')) {
ret = dev_alloc_name(dev, dev->name); //内核给分配个名字
if (ret <0)
goto out_free;
}
SET_NETDEV_DEVTYPE(dev, &br_type);
ret = register_netdevice(dev); //然后注册该网络设备
if (ret)
goto out_free;
ret = br_sysfs_addbr(dev); //sysfs中建立相关信息
if (ret)
unregister_netdevice(dev);
out:
rtnl_unlock();
return ret;
out_free:
free_netdev(dev);
gotoout;
}
<d>
staticstruct net_device *new_bridge_dev(struct net *net, constchar*name)
{
struct net_bridge *br;
struct net_device *dev;
dev = alloc_netdev(sizeof(struct net_bridge), name,
br_dev_setup); //-->e:
if (!dev)
return NULL;
dev_net_set(dev, net);
br = netdev_priv(dev); //获得私有区间
br->dev = dev;
br->stats = alloc_percpu(struct br_cpu_netstats);
if (!br->stats) {
free_netdev(dev);
return NULL;
}
spin_lock_init(&br->lock);
//队列初始化。在port_list中保存了这个桥上的端口列表
INIT_LIST_HEAD(&br->port_list);
spin_lock_init(&br->hash_lock);
//stp协议相关
br->bridge_id.prio[0] =0x80;
br->bridge_id.prio[1] =0x00;
memcpy(br->group_addr, br_group_address, ETH_ALEN);
br->feature_mask = dev->features;
br->stp_enabled = BR_NO_STP;
br->designated_root = br->bridge_id;
br->root_path_cost =0;
br->root_port =0;
br->bridge_max_age = br->max_age =20* HZ;
br->bridge_hello_time = br->hello_time =2* HZ;
br->bridge_forward_delay = br->forward_delay =15* HZ;
br->topology_change =0;
br->topology_change_detected =0;
br->ageing_time =300* HZ;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
br_multicast_init(br);
return dev;
}
该函数主要是为*br (struct net_bridge) 赋值,但首先要初始化 dev (struct net_device)。
<e>
/*
 * Setup callback passed to alloc_netdev() for bridge devices: initialises
 * the generic net_device fields of a new bridge.
 */
void br_dev_setup(struct net_device *dev)
{
	/* start with a random Ethernet address */
	random_ether_addr(dev->dev_addr);

	/* generic Ethernet initialisation of the device */
	ether_setup(dev);

	/* bridge-specific callbacks */
	dev->netdev_ops = &br_netdev_ops;
	dev->destructor = br_dev_free;
	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);

	dev->tx_queue_len = 0;
	dev->priv_flags = IFF_EBRIDGE;

	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
			NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX;
}
代码中的网络处理函数部分:
将接口添进网桥时,用户空间调用ioctl(br_socket_fd, SIOCBRADDIF, &ifr)
在 dev->netdev_ops = &br_netdev_ops中,回调函数:
staticconststruct net_device_ops br_netdev_ops = {
... ...
.ndo_do_ioctl = br_dev_ioctl,
... ...
}
具体的网桥ioctl :
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct net_bridge *br = netdev_priv(dev); switch(cmd) { case SIOCDEVPRIVATE: return old_dev_ioctl(dev, rq, cmd); //添加一个接口 case SIOCBRADDIF: //删除一个接口 case SIOCBRDELIF: return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); //-->f: } br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd); return -EOPNOTSUPP; }
<f>
/*
 * Add or remove the interface with index @ifindex to/from bridge @br.
 * A non-zero @isadd means add (the caller passes cmd == SIOCBRADDIF).
 *
 * Returns -EPERM without CAP_NET_ADMIN, -EINVAL if no device has that index
 * in the bridge's namespace, otherwise the result of br_add_if() or
 * br_del_if().
 *
 * called with RTNL
 */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
	struct net_device *dev;
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	dev = __dev_get_by_index(dev_net(br->dev), ifindex);
	if (dev == NULL)
		return -EINVAL;

	if (isadd)
		ret = br_add_if(br, dev);	/* see <g> */
	else
		ret = br_del_if(br, dev);

	return ret;
}
<g>
int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; bool changed_addr; /* Don't allow bridging non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) return -EINVAL; /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) return -ELOOP; /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ if (dev->priv_flags & IFF_DONT_BRIDGE) return -EOPNOTSUPP; //为接口创建net_bridge_port p = new_nbp(br, dev); //-->h: if (IS_ERR(p)) return PTR_ERR(p); //设置接口为混杂模式 err = dev_set_promiscuity(dev, 1); if (err) goto put_back; err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err0; //更新port->MAC对应表 err = br_fdb_insert(br, p, dev->dev_addr); //-->i: if (err) goto err1; err = br_sysfs_addif(p); if (err) goto err2; if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) goto err3; err = netdev_rx_handler_register(dev, br_handle_frame, p); //-->k: if (err) goto err4; dev->priv_flags |= IFF_BRIDGE_PORT; dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); //-->j: br_features_recompute(br); if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); dev_set_mtu(br->dev, br_min_mtu(br)); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err4: netdev_set_master(dev, NULL); err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_fdb_delete_by_port(br, p, 1); err1: kobject_put(&p->kobj); p = NULL; /* kobject_put frees */ err0: dev_set_promiscuity(dev, -1); put_back: dev_put(dev); kfree(p); return err; }
<h>
/* 为接口创建net_bridge_port */
staticstruct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
int index;
struct net_bridge_port *p;
index = find_portno(br);
if (index <0)
return ERR_PTR(index);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return ERR_PTR(-ENOMEM);
p->br = br;
dev_hold(dev);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority =0x8000>> BR_PORT_BITS;
p->port_no = index;
p->flags =0;
br_init_port(p);
p->state = BR_STATE_DISABLED;
br_stp_port_timer_init(p);
br_multicast_add_port(p);
return p;
}
之后,把要加入的接口及其对应的 MAC 地址,作为本机静态项加入到 port—MAC 对应表。
<i>
/*
 * Locked wrapper around fdb_insert(): adds @addr for port @source to the
 * forwarding database of @br while holding br->hash_lock.
 *
 * Returns whatever fdb_insert() returns.
 */
int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
		  const unsigned char *addr)
{
	int err;

	spin_lock_bh(&br->hash_lock);
	err = fdb_insert(br, source, addr);
	spin_unlock_bh(&br->hash_lock);

	return err;
}
/*
 * Insert @addr for port @source into the forwarding database of @br.
 *
 * The address is first looked up in its hash bucket:
 *  - if an entry exists and is local, nothing is done;
 *  - if an entry exists but is not local, a warning is logged, the entry
 *    is deleted and a new one is created in its place;
 *  - if no entry exists, a new net_bridge_fdb_entry is created.
 *
 * Returns 0 on success, -EINVAL if @addr is not a valid Ethernet address,
 * or -ENOMEM if the entry cannot be created.
 */
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
		      const unsigned char *addr)
{
	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
	struct net_bridge_fdb_entry *fdb;

	/* reject anything that is not a valid MAC address */
	if (!is_valid_ether_addr(addr))
		return -EINVAL;

	fdb = fdb_find(head, addr);
	if (fdb) {
		/* it is okay to have multiple ports with same
		 * address, just use the first one.
		 */
		if (fdb->is_local)
			return 0;
		br_warn(br, "adding interface %s with same address "
		       "as a received packet\n",
		       source->dev->name);
		fdb_delete(fdb);
	}

	/* last argument presumably marks the entry as local; confirm */
	if (!fdb_create(head, source, addr, 1))
		return -ENOMEM;

	return 0;
}
<j>
bool br_stp_recalculate_bridge_id(struct net_bridge *br)
{
const unsigned char*br_mac_zero =
(const unsigned char*)br_mac_zero_aligned;
const unsigned char*addr = br_mac_zero;
struct net_bridge_port *p;
/* user has chosen a value so keep it */
if (br->flags & BR_SET_MAC_ADDR)
returnfalse;
//遍历桥中所有的端口
list_for_each_entry(p, &br->port_list, list) {
if (addr == br_mac_zero ||
memcmp(p->dev->dev_addr, addr, ETH_ALEN) <0)
addr = p->dev->dev_addr;
}
//如果不与现在桥的MAC相同
if (compare_ether_addr(br->bridge_id.addr, addr) ==0)
returnfalse; /* no change */
br_stp_change_bridge_id(br, addr); //-->
returntrue;
}
遍历桥对应的所有接口,取其中最小的 MAC,然后判断这个最小 MAC 跟桥现在的 MAC 是否相同:相同则不做任何改动;不同则调用 br_stp_change_bridge_id() 更新桥的 MAC。
/*
 * Switch bridge @br to the new MAC address @addr.
 *
 * Updates the bridge id and the bridge device address, rewrites the
 * designated bridge/root address of every port that still carries the old
 * address, then re-runs the STP configuration update and port state
 * selection. If the bridge was not root before but is now, it becomes the
 * root bridge.
 */
void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
{
	/* should be aligned on 2 bytes for compare_ether_addr() */
	unsigned short prev_aligned[ETH_ALEN >> 1];
	unsigned char *prev = (unsigned char *)prev_aligned;
	struct net_bridge_port *port;
	int was_root = br_is_root_bridge(br);

	/* remember the old address, then install the new one */
	memcpy(prev, br->bridge_id.addr, ETH_ALEN);
	memcpy(br->bridge_id.addr, addr, ETH_ALEN);
	/* from here on the bridge device itself carries the new MAC */
	memcpy(br->dev->dev_addr, addr, ETH_ALEN);

	list_for_each_entry(port, &br->port_list, list) {
		if (compare_ether_addr(port->designated_bridge.addr, prev) == 0)
			memcpy(port->designated_bridge.addr, addr, ETH_ALEN);

		if (compare_ether_addr(port->designated_root.addr, prev) == 0)
			memcpy(port->designated_root.addr, addr, ETH_ALEN);
	}

	br_configuration_update(br);
	br_port_state_selection(br);

	if (!was_root && br_is_root_bridge(br))
		br_become_root_bridge(br);
}
以上是大致的网桥配置过程。配置好之后,便是发送和接收数据,这里先瞧一眼网桥接收数据的实现。
<k>
/*
 * Install @rx_handler, with its private data @rx_handler_data, as the
 * receive handler of @dev.
 *
 * Asserts that RTNL is held. Returns -EBUSY if the device already has a
 * receive handler, 0 otherwise.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* the data pointer is stored before the callback pointer */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);	/* the callback */

	return 0;
}
利用回调,实际的数据处理函数便是:br_handle_frame
/*
 * Receive handler for frames arriving on a bridge port.
 *
 * Returns RX_HANDLER_PASS when the frame is left to the rest of the receive
 * path (loopback frames, link-local frames accepted by the LOCAL_IN hook,
 * frames claimed by br_should_route_hook) and RX_HANDLER_CONSUMED when the
 * bridge has taken or dropped it.
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	/* destination MAC address */
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	struct net_bridge_port *p;
	br_should_route_hook_t *rhook;

	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	/* drop frames whose source MAC is not a valid address */
	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
		goto drop;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return RX_HANDLER_CONSUMED;

	p = br_port_get_rcu(skb->dev);

	if (unlikely(is_link_local(dest))) {
		/* Pause frames shouldn't be passed up by driver anyway */
		if (skb->protocol == htons(ETH_P_PAUSE))
			goto drop;

		/* If STP is turned off, then forward */
		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
			goto forward;

		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
			    NULL, br_handle_local_finish))
			return RX_HANDLER_CONSUMED; /* consumed by filter */

		*pskb = skb;
		return RX_HANDLER_PASS;	/* continue processing */
	}

forward:
	switch (p->state) {
	case BR_STATE_FORWARDING:	/* port is forwarding */
		rhook = rcu_dereference(br_should_route_hook);
		if (rhook) {
			if ((*rhook)(skb)) {
				*pskb = skb;
				return RX_HANDLER_PASS;
			}
			dest = eth_hdr(skb)->h_dest;
		}
		/* fall through */
	case BR_STATE_LEARNING:		/* port is learning */
		/* frames addressed to the bridge device itself are for us */
		if (compare_ether_addr(p->br->dev->dev_addr, dest) == 0)
			skb->pkt_type = PACKET_HOST;

		NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
			br_handle_frame_finish);
		break;
	default:
drop:
		kfree_skb(skb);
	}
	return RX_HANDLER_CONSUMED;
}
br_handle_frame_finish: 正常的数据包会流进br_handle_frame_finish()进行处理 :
/*
 * Second stage of bridge input, run after the PRE_ROUTING hook.
 *
 * Learns the source address, then decides where the frame goes: up to the
 * local stack (skb2), out through a known port, or flooded to the other
 * ports. Frames from a missing/disabled port, multicast frames rejected by
 * br_multicast_rcv() and frames arriving on a port that is only learning
 * are dropped.
 *
 * Returns the result of br_pass_frame_up() when the frame is delivered
 * locally, 0 otherwise.
 */
int br_handle_frame_finish(struct sk_buff *skb)
{
	/* destination MAC address */
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	struct net_bridge *br;
	struct net_bridge_fdb_entry *dst;
	struct net_bridge_mdb_entry *mdst;
	struct sk_buff *skb2;

	if (!p || p->state == BR_STATE_DISABLED)
		goto drop;

	/* insert into forwarding database after filtering to avoid spoofing */
	br = p->br;
	br_fdb_update(br, p, eth_hdr(skb)->h_source);

	if (is_multicast_ether_addr(dest) &&
	    br_multicast_rcv(br, p, skb))
		goto drop;

	if (p->state == BR_STATE_LEARNING)
		goto drop;

	BR_INPUT_SKB_CB(skb)->brdev = br->dev;

	/* The packet skb2 goes to the local host (NULL to skip). */
	skb2 = NULL;

	/*
	 * If the bridge device itself is promiscuous, every received frame
	 * is also handed to the local host.
	 */
	if (br->dev->flags & IFF_PROMISC)
		skb2 = skb;

	dst = NULL;

	if (is_multicast_ether_addr(dest)) {
		/* multicast or broadcast destination */
		mdst = br_mdb_get(br, skb);
		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
			if ((mdst && mdst->mglist) ||
			    br_multicast_is_router(br))
				skb2 = skb;
			br_multicast_forward(mdst, skb, skb2);
			skb = NULL;
			if (!skb2)
				goto out;
		} else
			skb2 = skb;

		br->dev->stats.multicast++;
	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
		/* forwarding database says the address is local: pass up */
		skb2 = skb;
		/* Do not forward the packet since it's local. */
		skb = NULL;
	}

	if (skb) {
		if (dst)
			/* known, non-local destination: forward to its port */
			br_forward(dst->dst, skb, skb2);
		else
			/* unknown destination: flood to the other ports */
			br_flood_forward(br, skb, skb2);
	}

	if (skb2)
		return br_pass_frame_up(skb2);

out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}
该函数,通过查找CAM表,取得发送端口,如果当前CAM表里没有到目的MAC的端口,则在其它端口上都发送此数据包。
在这个函数里,我们看到,查询CAM表的函数为:__br_fdb_get()
/*
 * Look up @addr in the forwarding database of @br.
 *
 * Walks the hash bucket selected by br_mac_hash(addr) and returns the entry
 * whose address matches, or NULL if there is no match or the matching entry
 * has expired.
 */
struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
					  const unsigned char *addr)
{
	struct hlist_node *node;
	struct net_bridge_fdb_entry *entry;

	hlist_for_each_entry_rcu(entry, node, &br->hash[br_mac_hash(addr)], hlist) {
		/* skip entries for other addresses in the same bucket */
		if (compare_ether_addr(entry->addr.addr, addr) != 0)
			continue;

		/* an expired match counts as "not found" */
		if (unlikely(has_expired(br, entry)))
			return NULL;

		return entry;
	}

	return NULL;
}
首先根据目的 MAC 计算哈希值,取得对应的哈希桶。
然后遍历桶里的各项,查看是否含有与目的地址匹配的项:找到且未过期则返回该 fdb 项,否则返回 NULL。
如果是送给本机的数据包,则传至上层协议,
如不是,则需要转发。