linux bridge

linux bridge - mac forward

https://www.jianshu.com/nb/46196362

linux bridge在虚拟化场景和docker中用的比较多,之前也知道它的原理,基本上就是类似二层交换机,根据mac地址和vid转发。但是对于vlan的处理网上的文档比较少,所以这次就看一下源码,分析下不配置vlan时如何转发,vlan又如何生效。

不配置vlan时,bridge纯靠mac转发,可通过如下两个命令之一查看mac转发表

//此命令只显示单播转发表,比较符合硬件交换机的显示规范,
//匹配到mac的,从port转发出去(可通过brctl showstp br1查看端
//口号和端口的对应关系)
root@node2:~# brctl showmacs br1
port no mac addr        is local?   ageing timer
  2 12:27:96:8c:f4:58   yes        0.00
  2 12:27:96:8c:f4:58   yes        0.00
  1 66:e6:6f:a8:d4:97   yes        0.00
  1 66:e6:6f:a8:d4:97   yes        0.00

//通过此命令可显示所有的单播和组播表项
root@node2:~# bridge fdb show br br1
33:33:00:00:00:01 dev br1 self permanent
66:e6:6f:a8:d4:97 dev vetha master br1 permanent
66:e6:6f:a8:d4:97 dev vetha vlan 1 master br1 permanent
33:33:00:00:00:01 dev vetha self permanent
01:00:5e:00:00:01 dev vetha self permanent
12:27:96:8c:f4:58 dev vethx master br1 permanent
12:27:96:8c:f4:58 dev vethx vlan 1 master br1 permanent
33:33:00:00:00:01 dev vethx self permanent
01:00:5e:00:00:01 dev vethx self permanent

这篇文档就先介绍不使能vlan的情况,主要分为下面几个部分
a. kernel端bridge module的初始化都做了哪些事
b. 添加网桥时,命令行和kernel端代码流程
c. 给网桥添加端口时,命令行和kernel端代码流程
d. 从端口收到报文后,内部是如何转发的

广播/组播/未知单播报文flood到所有端口。
查找到转发表项的已知单播报文,发送到此表项的出端口。
广播/组播/已知单播并且dst为local的报文,或者网桥设备使能了混杂模式,这几种情况都需要通过网桥设备将报文上送本机协议栈处理。

e. 从网桥br发出去的报文如何转发

广播/组播/未知单播报文,flood到所有端口。
能查找到转发表项的单播报文,从表项的出端口发送出去。

bridge还有如下几个注意的地方


单播flood: 控制未知单播报文flood时是否从此端口发送一份(对应端口标志BR_FLOOD),有两种设置方式,
a. bridge link set dev vnet1 flood on
b. echo 1 > /sys/class/net/br1/brif/vnet1/unicast_flood

hairpin模式:控制从此端口收到的报文是否允许再次从此端口发出(广播/组播/未知单播flood时,以及已知单播的出端口恰好就是入端口时)。出端口不是入端口的已知单播正常转发。
a. bridge link set dev vnet1 hairpin on
b. echo 1 > /sys/class/net/br1/brif/vnet1/hairpin_mode

网桥设备down后,所有端口状态都会变成 disabled, 导致网桥不会正确转发。
vetha (1)
 port id                8001                    state                  disabled
 designated root        8000.3adce07c2043       path cost                  2
 designated bridge      8000.3adce07c2043       message age timer          0.00
 designated port        8001                    forward delay timer        0.00
 designated cost           0                    hold timer                 0.00
 flags

bridge netfilter框架,可使用ebtables设置和查看

1. module初始化流程

#module初始化流程
module_init(br_init)
static int __init br_init(void)
    static const struct stp_proto br_stp_proto = {
        .rcv    = br_stp_rcv,
    };
    //注册stp协议处理函数,防止环路产生,此文不看stp部分
    stp_proto_register(&br_stp_proto);
    
    //初始化fdb表项用到的cache
    br_fdb_init();
        static struct kmem_cache *br_fdb_cache __read_mostly;
        br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
                     sizeof(struct net_bridge_fdb_entry),0,
                     SLAB_HWCACHE_ALIGN, NULL);
                     
        static u32 fdb_salt __read_mostly;
        get_random_bytes(&fdb_salt, sizeof(fdb_salt));
        
    static struct pernet_operations br_net_ops = {
        .exit   = br_net_exit,
    };
    //注册pernet操作,只提供了exit,所以namespace初始化时无操作
    register_pernet_subsys(&br_net_ops);
    
    static struct notifier_block br_device_notifier = {
        .notifier_call = br_device_event
    };
    //注册网络设备事件处理函数
    register_netdevice_notifier(&br_device_notifier);
    
    br_netlink_init();
        br_mdb_init();
            rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
            rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL);
            rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL);
            
        static struct rtnl_af_ops br_af_ops = {
            .family         = AF_BRIDGE,
            .get_link_af_size   = br_get_link_af_size,
        };
        rtnl_af_register(&br_af_ops);
            list_add_tail(&ops->list, &rtnl_af_ops);
            
        struct rtnl_link_ops br_link_ops __read_mostly = {
            .kind           = "bridge",
            .priv_size      = sizeof(struct net_bridge),
            .setup          = br_dev_setup,
            .maxtype        = IFLA_BRPORT_MAX,
            .policy         = br_policy,
            .validate       = br_validate,
            .newlink        = br_dev_newlink,
            .changelink     = br_changelink,
            .dellink        = br_dev_delete,
            .get_size       = br_get_size,
            .fill_info      = br_fill_info,
            .slave_maxtype      = IFLA_BRPORT_MAX,
            .slave_policy       = br_port_policy,
            .slave_changelink   = br_port_slave_changelink,
            .get_slave_size     = br_port_get_slave_size,
            .fill_slave_info    = br_port_fill_slave_info,
        };
        rtnl_link_register(&br_link_ops);
            __rtnl_link_register(ops);
                list_add_tail(&ops->list, &link_ops);
    //注册hook函数到br_ioctl_hook,添加网桥时调用br_ioctl_hook
    brioctl_set(br_ioctl_deviceless_stub);
        static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
        br_ioctl_hook = hook;

2. 创建/删除桥流程

通过strace brctl命令,可知创建/删除桥是通过socket的ioctl调用到kernel端

//添加桥
root@node2:~# strace brctl addbr br1
execve("/usr/sbin/brctl", ["brctl", "addbr", "br1"], 0x7fffd27c39a0 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
ioctl(3, SIOCBRADDBR, "br1")            = 0

//删除桥
root@node2:~# strace brctl delbr br1
execve("/usr/sbin/brctl", ["brctl", "delbr", "br1"], 0x7fff18eceaa0 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
ioctl(3, SIOCBRDELBR, "br1")            = 0

#kernel端代码,ioctl最终会调用 sock_ioctl
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        switch (cmd) {
        case SIOCGIFBR:
        case SIOCSIFBR:
        case SIOCBRADDBR:
        case SIOCBRDELBR:
            err = -ENOPKG;
            if (!br_ioctl_hook)
                request_module("bridge");

            mutex_lock(&br_ioctl_mutex);
            //调用之前注册的 br_ioctl_deviceless_stub
            if (br_ioctl_hook)
                err = br_ioctl_hook(net, cmd, argp); //br_ioctl_deviceless_stub
            mutex_unlock(&br_ioctl_mutex);
            break;
            
/*
 * Bridge ioctl entry point for requests that are not tied to an existing
 * bridge device. This is the hook installed through brioctl_set() and
 * invoked from sock_ioctl() via br_ioctl_hook.
 *
 * net:  network namespace the request was issued in
 * cmd:  ioctl command number
 * uarg: user-space argument; for SIOCBRADDBR/SIOCBRDELBR it points at the
 *       bridge name
 *
 * Returns the result of the selected handler, -EPERM when the caller lacks
 * CAP_NET_ADMIN in the namespace's user namespace, -EFAULT when the name
 * cannot be copied from user space, or -EOPNOTSUPP for any other command.
 */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
    switch (cmd) {
    /* Both of these commands are delegated to old_deviceless(). */
    case SIOCGIFBR:
    case SIOCSIFBR:
        return old_deviceless(net, uarg);

    /* Create or delete a bridge identified by name. */
    case SIOCBRADDBR:
    case SIOCBRDELBR:
    {
        char buf[IFNAMSIZ];

        /* Adding/removing a bridge requires CAP_NET_ADMIN. */
        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;

        /* Always copy a full IFNAMSIZ bytes of name from user space. */
        if (copy_from_user(buf, uarg, IFNAMSIZ))
            return -EFAULT;

        /* Force NUL termination; the user buffer is not trusted to have one. */
        buf[IFNAMSIZ-1] = 0;
        if (cmd == SIOCBRADDBR)
            return br_add_bridge(net, buf);

        return br_del_bridge(net, buf);
    }
    }
    /* Any command not listed above is not handled here. */
    return -EOPNOTSUPP;
}

int br_add_bridge(struct net *net, const char *name)
    struct net_device *dev;
    dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup);
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
            alloc_size = sizeof(struct net_device);
            struct net_device *p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
            struct net_device *dev = PTR_ALIGN(p, NETDEV_ALIGN);
            dev_addr_init(dev)
            dev_mc_init(dev);
            dev_uc_init(dev);

            dev_net_set(dev, &init_net);

            dev->gso_max_size = GSO_MAX_SIZE;
            dev->gso_max_segs = GSO_MAX_SEGS;
            dev->gso_min_segs = 0;

            dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
            setup(dev); //br_dev_setup
                struct net_bridge *br = netdev_priv(dev);
                eth_hw_addr_random(dev);
                    dev->addr_assign_type = NET_ADDR_RANDOM;
                    eth_random_addr(dev->dev_addr);
                ether_setup(dev);
                    dev->header_ops     = &eth_header_ops;
                    dev->type       = ARPHRD_ETHER;
                    dev->hard_header_len    = ETH_HLEN;
                    dev->mtu        = ETH_DATA_LEN;
                    dev->addr_len       = ETH_ALEN;
                    dev->tx_queue_len   = 1000; /* Ethernet wants good queues */
                    dev->flags      = IFF_BROADCAST|IFF_MULTICAST;
                    dev->priv_flags     |= IFF_TX_SKB_SHARING;
                    memset(dev->broadcast, 0xFF, ETH_ALEN);
                dev->netdev_ops = &br_netdev_ops;
                dev->destructor = br_dev_free;
                dev->ethtool_ops = &br_ethtool_ops;
                SET_NETDEV_DEVTYPE(dev, &br_type);
                dev->tx_queue_len = 0;
                dev->priv_flags = IFF_EBRIDGE;

                dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
                        NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
                dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX;
                dev->vlan_features = COMMON_FEATURES;

                br->dev = dev;
                spin_lock_init(&br->lock);
                INIT_LIST_HEAD(&br->port_list);
                spin_lock_init(&br->hash_lock);

                br->bridge_id.prio[0] = 0x80;
                br->bridge_id.prio[1] = 0x00;
                /* Reserved Ethernet Addresses per IEEE 802.1Q */
                static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
                { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
                ether_addr_copy(br->group_addr, eth_reserved_addr_base);

                br->stp_enabled = BR_NO_STP;
                br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
                br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;

                br->designated_root = br->bridge_id;
                br->bridge_max_age = br->max_age = 20 * HZ;
                br->bridge_hello_time = br->hello_time = 2 * HZ;
                br->bridge_forward_delay = br->forward_delay = 15 * HZ;
                br->ageing_time = 300 * HZ;

                br_netfilter_rtable_init(br);
                    struct rtable *rt = &br->fake_rtable;
                    atomic_set(&rt->dst.__refcnt, 1);
                    rt->dst.dev = br->dev;
                    rt->dst.path = &rt->dst;
                    dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
                    rt->dst.flags   = DST_NOXFRM | DST_FAKE_RTABLE;
                    rt->dst.ops = &fake_dst_ops;
                br_stp_timer_init(br);
                    setup_timer(&br->hello_timer, br_hello_timer_expired, (unsigned long) br);
                    setup_timer(&br->tcn_timer, br_tcn_timer_expired, (unsigned long) br);
                    setup_timer(&br->topology_change_timer,br_topology_change_timer_expired,(unsigned long) br);
                    setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
                br_multicast_init(br);
    
            dev->num_tx_queues = txqs;
            dev->real_num_tx_queues = txqs;
            netif_alloc_netdev_queues(dev)
            dev->num_rx_queues = rxqs;
            dev->real_num_rx_queues = rxqs;
            netif_alloc_rx_queues(dev)
            strcpy(dev->name, name);
            dev->name_assign_type = name_assign_type;
            dev->group = INIT_NETDEV_GROUP;
    dev_net_set(dev, net);
    dev->rtnl_link_ops = &br_link_ops;
    register_netdev(dev);
        register_netdevice(dev);
            dev->netdev_ops->ndo_init(dev);//br_dev_init
                struct net_bridge *br = netdev_priv(dev);
                br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                //vlan相关初始化
                br_vlan_init(br);
                    //支持的vlan协议,可以通过(/sys/class/net/br1/bridge/vlan_protocol)修改
                    br->vlan_proto = htons(ETH_P_8021Q);
                    //默认 pvid 为 1
                    br->default_pvid = 1;
                    //将vid 1和网桥mac添加到fdb中
                    br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED);
                        struct net_port_vlans *pv = NULL;
                        pv = rtnl_dereference(br->vlan_info);
                        if (pv)
                            return __vlan_add(pv, vid, flags);
                        pv = kzalloc(sizeof(*pv), GFP_KERNEL);
                        pv->parent.br = br;
                        __vlan_add(pv, vid, flags);
                            if (v->port_idx) {
                                p = v->parent.port;
                                br = p->br;
                                dev = p->dev;
                            } else {//vlan属于网桥自身(port_idx为0),取出网桥和网桥设备
                                br = v->parent.br;
                                dev = br->dev;
                            }
                            if (p) {
                                vlan_vid_add(dev, br->vlan_proto, vid);
                                    vlan_info = rtnl_dereference(dev->vlan_info);
                                    vid_info = vlan_vid_info_get(vlan_info, proto, vid);
                                    if (!vid_info) {
                                        __vlan_vid_add(vlan_info, proto, vid, &vid_info);
                                            vid_info = vlan_vid_info_alloc(proto, vid);
                                            //如果硬件支持vlan filter,则设置到硬件
                                            if (vlan_hw_filter_capable(dev, vid_info)) {
                                                ops->ndo_vlan_rx_add_vid(dev, proto, vid);
                                            list_add(&vid_info->list, &vlan_info->vid_list);
                                            vlan_info->nr_vids++;
                                            *pvid_info = vid_info;
                                    vid_info->refcount++;
                            //插入fdb表项
                            br_fdb_insert(br, p, dev->dev_addr, vid);
                            //设置到 vlan_bitmap 中
                            set_bit(vid, v->vlan_bitmap);
                            v->num_vlans++;
                            __vlan_add_flags(v, vid, flags);
                        rcu_assign_pointer(br->vlan_info, pv);

3. 添加/删除接口流程

#添加接口
root@node2:~# strace brctl addif br1 vetha
execve("/usr/sbin/brctl", ["brctl", "addif", "br1", "vetha"], 0x7fff20137ba8 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
access("/proc/net", R_OK)               = 0
access("/proc/net/unix", R_OK)          = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
close(4)                                = 0
ioctl(3, SIOCBRADDIF)                   = 0

#删除接口
root@node2:~# strace brctl delif br1 vetha
execve("/usr/sbin/brctl", ["brctl", "delif", "br1", "vetha"], 0x7ffe8db2f1a8 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
close(4)                                = 0
ioctl(3, SIOCBRDELIF)                   = 0

static const struct net_device_ops br_netdev_ops = {
    .ndo_do_ioctl        = br_dev_ioctl,
    ...
    .ndo_fix_features        = br_fix_features,
    .ndo_fdb_add         = br_fdb_add,
    .ndo_fdb_del         = br_fdb_delete,
    .ndo_fdb_dump        = br_fdb_dump,
    .ndo_bridge_getlink  = br_getlink,
    .ndo_bridge_setlink  = br_setlink,
    .ndo_bridge_dellink  = br_dellink,
};

#kernel端代码,添加接口
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
    struct net_bridge *br = netdev_priv(dev);

    switch (cmd) {
    case SIOCDEVPRIVATE:
        return old_dev_ioctl(dev, rq, cmd);

    case SIOCBRADDIF:
    case SIOCBRDELIF:
        return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);

    }

    br_debug(br, "Bridge does not support ioctl 0x%x
", cmd);
    return -EOPNOTSUPP;
}
/*
 * Add an interface to, or remove it from, a bridge.
 * Called with RTNL held.
 *
 * br:      the bridge
 * ifindex: index of the interface, looked up in the bridge device's
 *          network namespace
 * isadd:   non-zero to add the interface, zero to remove it
 *
 * Returns -EPERM without CAP_NET_ADMIN, -EINVAL when no device has that
 * ifindex, otherwise whatever br_add_if()/br_del_if() returns.
 */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
    struct net *net = dev_net(br->dev);
    struct net_device *dev;
    int ret;

    /* Changing bridge membership requires CAP_NET_ADMIN. */
    if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
        return -EPERM;

    /* Resolve the ifindex within the bridge's own namespace. */
    dev = __dev_get_by_index(net, ifindex);
    if (dev == NULL)
        return -EINVAL;

    if (isadd)
        ret = br_add_if(br, dev);
    else
        ret = br_del_if(br, dev);

    return ret;
}
int br_add_if(struct net_bridge *br, struct net_device *dev)
    int err = 0;
    bool changed_addr;

    /* Don't allow bridging non-ethernet like devices */
    if ((dev->flags & IFF_LOOPBACK) ||
        dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
        !is_valid_ether_addr(dev->dev_addr))
        return -EINVAL;
    //bridge接口不能加入另一个bridge
    /* No bridging of bridges */
    if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
        return -ELOOP;
    //加入bridge的接口不能加入另一个bridge,即一个接口不能同时加到两个bridge
    /* Device is already being bridged */
    if (br_port_exists(dev)) //#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
        return -EBUSY;

    /* No bridging devices that dislike that (e.g. wireless) */
    if (dev->priv_flags & IFF_DONT_BRIDGE)
        return -EOPNOTSUPP;

    struct net_bridge_port *p;
    p = new_nbp(br, dev);
        //找到最小可用的端口号。0保留不用,最大端口号为(1<<10)-1
        index = find_portno(br);
        p = kzalloc(sizeof(*p), GFP_KERNEL);
        p->br = br;
        dev_hold(dev);
        p->dev = dev;
        p->path_cost = port_cost(dev);
        p->priority = 0x8000 >> BR_PORT_BITS;
        //保存端口号
        p->port_no = index;
        p->flags = BR_LEARNING | BR_FLOOD;
        br_init_port(p);
            //优先级左移10位或上port_no作为端口id(port_id)
            p->port_id = br_make_port_id(p->priority, p->port_no);
                return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1));
            br_become_designated_port(p);
                struct net_bridge *br;
                br = p->br;
                p->designated_root = br->designated_root;
                p->designated_cost = br->root_path_cost;
                p->designated_bridge = br->bridge_id;
                p->designated_port = p->port_id;
            //初始状态为 BR_STATE_BLOCKING
            br_set_state(p, BR_STATE_BLOCKING);
                p->state = state;
            p->topology_change_ack = 0;
            p->config_pending = 0;
        //设置状态为 BR_STATE_DISABLED
        br_set_state(p, BR_STATE_DISABLED);
        br_stp_port_timer_init(p);
            setup_timer(&p->message_age_timer, br_message_age_timer_expired, (unsigned long) p);
            setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired, (unsigned long) p);
            setup_timer(&p->hold_timer, br_hold_timer_expired, (unsigned long) p);
        br_multicast_add_port(p);
    call_netdevice_notifiers(NETDEV_JOIN, dev);
    //使能组播
    dev_set_allmulti(dev, 1);
        __dev_set_allmulti(dev, inc, true);
            dev->flags |= IFF_ALLMULTI;
            dev->allmulti += inc;
            dev_change_rx_flags(dev, IFF_ALLMULTI);
            dev_set_rx_mode(dev);
    kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR);
    //将接口信息添加到 sys 文件系统中:/sys/class/net/br1/brif/vnet1(桥br1上的接口vnet1)
    br_sysfs_addif(p);
    br_netpoll_enable(p);
    //注册br_handle_frame到协议栈入口处
    netdev_rx_handler_register(dev, br_handle_frame, p);
    //设置flag IFF_BRIDGE_PORT,表示此接口已经加入桥
    dev->priv_flags |= IFF_BRIDGE_PORT;
    netdev_master_upper_dev_link(dev, br->dev);
    //关闭 lro 功能
    dev_disable_lro(dev);
    //将接口加入桥的端口链表 br->port_list
    list_add_rcu(&p->list, &br->port_list);
    nbp_update_port_count(br);
        list_for_each_entry(p, &br->port_list, list) {
            //#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
            //#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
            //上面初始化时,p->flags = BR_LEARNING | BR_FLOOD,所以此处成立,cnt加1
            if (br_auto_port(p))
                cnt++;
        }
        if (br->auto_cnt != cnt) {
            br->auto_cnt = cnt;
            br_manage_promisc(br);
                //如果bridge接口使能了混杂模式或者bridge接口没有使能vlan filter,则设置桥上所有接口使能混杂模式
                if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br))
                    set_all = true;
                        list_for_each_entry(p, &br->port_list, list) {
                if (set_all) {
                    br_port_set_promisc(p);
                        //使能接口混杂模式
                        dev_set_promiscuity(p->dev, 1);
                        //将fdb中静态表项从接口的单播地址列表删除
                        br_fdb_unsync_static(p->br, p);
                            for (i = 0; i < BR_HASH_SIZE; i++) {
                                hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
                                    /* We only care for static entries */
                                    if (!fdb->is_static)
                                        continue;
                                    dev_uc_del(p->dev, fdb->addr.addr);
                                }
                            }
                        p->flags |= BR_PROMISC;
                } else {
                    if (br->auto_cnt == 0 ||
                        (br->auto_cnt == 1 && br_auto_port(p)))
                        br_port_clear_promisc(p);
                            //如果接口已经不是混杂模式则返回
                            //或者接口不支持单播过滤,此时也返回,不用关闭混杂模式,因为不支持单播过滤的接口
                            //最终都会使能混杂模式
                            if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
                                return;
                            br_fdb_sync_static(p->br, p);
                                struct net_bridge_fdb_entry *fdb, *tmp;
                                //将fdb中静态表项添加到接口的单播地址列表
                                for (i = 0; i < BR_HASH_SIZE; i++) {
                                    hlist_for_each_entry(fdb, &br->hash[i], hlist) {
                                        /* We only care for static entries */
                                        if (!fdb->is_static)
                                            continue;

                                        err = dev_uc_add(p->dev, fdb->addr.addr);
                                        if (err)
                                            goto rollback;
                                    }
                                }
                            dev_set_promiscuity(p->dev, -1);
                    else
                        br_port_set_promisc(p);
                }
        }
    netdev_update_features(br->dev);
    //给fdb插入一个表项,vid为 0
    br_fdb_insert(br, p, dev->dev_addr, 0)
        fdb_insert(br, source, addr, vid);
            struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
            struct net_bridge_fdb_entry *fdb;
            //根据mac地址和vid查找是否已经存在
            fdb = fdb_find(head, addr, vid);
            if (fdb) {
                /* it is okay to have multiple ports with same
                 * address, just use the first one.
                 */
                //已经存在相同的mac,如果已存在的也是local是允许的。
                //使用已存在的即可。这样正在添加的接口就不能根据fdb转发了
                if (fdb->is_local)
                    return 0;
                br_warn(br, "adding interface %s with same address "
                       "as a received packet\n",
                       source ? source->dev->name : br->dev->name);
                //如果已经存在的fdb表项不是local的,则删除这个fdb,创建一个新的静态fdb
                fdb_delete(br, fdb);
            }
            fdb = fdb_create(head, source, addr, vid);
                struct net_bridge_fdb_entry *fdb;
                fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
                if (fdb) {
                    memcpy(fdb->addr.addr, addr, ETH_ALEN);
                    //source作为fdb表项的出接口
                    fdb->dst = source;
                    fdb->vlan_id = vid;
                    fdb->is_local = 0;
                    fdb->is_static = 0;
                    fdb->added_by_user = 0;
                    fdb->updated = fdb->used = jiffies;
                    //将fdb添加到链表
                    hlist_add_head_rcu(&fdb->hlist, head);
                }
                return fdb;
            fdb->is_local = fdb->is_static = 1;
            fdb_add_hw(br, addr);
                //将此接口地址添加到bridge中不是混杂模式的接口上
                list_for_each_entry(p, &br->port_list, list) {
                    if (!br_promisc_port(p)) {
                        err = dev_uc_add(p->dev, addr);
                        if (err)
                            goto undo;
                    }
                }
    nbp_vlan_init(p)
        //default_pvid默认为1
        p->br->default_pvid ? nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED) : 0;
            struct net_port_vlans *pv = NULL;
            pv = rtnl_dereference(port->vlan_info);
            if (pv)
                return __vlan_add(pv, vid, flags);
            pv = kzalloc(sizeof(*pv), GFP_KERNEL);
            pv->port_idx = port->port_no;
            pv->parent.port = port;
            __vlan_add(pv, vid, flags);
                br_fdb_insert(br, p, dev->dev_addr, vid);
                    set_bit(vid, v->vlan_bitmap);
                    v->num_vlans++;
                    __vlan_add_flags(v, vid, flags);
                        if (flags & BRIDGE_VLAN_INFO_PVID)
                            __vlan_add_pvid(v, vid);
                        else
                            __vlan_delete_pvid(v, vid);

                        if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
                            set_bit(vid, v->untagged_bitmap);
                        else
                            clear_bit(vid, v->untagged_bitmap);
            rcu_assign_pointer(port->vlan_info, pv);
    changed_addr = br_stp_recalculate_bridge_id(br);
    if (netif_running(dev) && netif_oper_up(dev) &&
        (br->dev->flags & IFF_UP))
        br_stp_enable_port(p);
            br_init_port(p);
                p->port_id = br_make_port_id(p->priority, p->port_no);
                br_become_designated_port(p);
                br_set_state(p, BR_STATE_BLOCKING);
                p->topology_change_ack = 0;
                p->config_pending = 0;
            br_port_state_selection(p->br);
            br_log_state(p);
                br_info(p->br, "port %u(%s) entered %s state\n",(unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]);
    if (changed_addr)
        call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
    //将桥上所有接口mtu的最小值设置到bridge接口上
    dev_set_mtu(br->dev, br_min_mtu(br));
    kobject_uevent(&p->kobj, KOBJ_ADD);

4. 接收报文处理流程

在协议栈入口函数 __netif_receive_skb_core 调用添加接口时注册的回调函数 br_handle_frame
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
    struct net_bridge_port *p;
    struct sk_buff *skb = *pskb;
    const unsigned char *dest = eth_hdr(skb)->h_dest;
    br_should_route_hook_t *rhook;
    
    //不处理loopback报文
    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;
    //如果源mac地址为全0,或者为组播地址,则drop此报文
    if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
        goto drop;
    //取出net_bridge_port结构
    p = br_port_get_rcu(skb->dev); //rcu_dereference(dev->rx_handler_data);
    //如果目的mac为链路本地保留地址01-80-C2-00-00-0x,则需要特殊处理此种报文
    if (unlikely(is_link_local_ether_addr(dest))) {
        /*
         * See IEEE 802.1D Table 7-10 Reserved addresses
         *
         * Assignment               Value
         * Bridge Group Address     01-80-C2-00-00-00
         * (MAC Control) 802.3      01-80-C2-00-00-01
         * (Link Aggregation) 802.3 01-80-C2-00-00-02
         * 802.1X PAE address       01-80-C2-00-00-03
         *
         * 802.1AB LLDP         01-80-C2-00-00-0E
         *
         * Others reserved for future standardization
         */
        switch (dest[5]) {
        case 0x00:  /* Bridge Group Address */
            /* If STP is turned off,
               then must forward to keep loop detection */
            if (p->br->stp_enabled == BR_NO_STP ||
                fwd_mask & (1u << dest[5]))
                goto forward;
            break;

        case 0x01:  /* IEEE MAC (Pause) */
            goto drop;

        default:
            /* Allow selective forwarding for most other protocols */
            fwd_mask |= p->br->group_fwd_mask;
            if (fwd_mask & (1u << dest[5]))
                goto forward;
        }

        /* Deliver packet to local host only */
        if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
                NULL, br_handle_local_finish)) {
            return RX_HANDLER_CONSUMED; /* consumed by filter */
        } else {
            *pskb = skb;
            return RX_HANDLER_PASS; /* continue processing */
        }
    }
forward:
    switch (p->state) {
    case BR_STATE_FORWARDING:
        //如果支持 broute
        rhook = rcu_dereference(br_should_route_hook); //ebt_broute
        if (rhook) {
            if ((*rhook)(skb)) {
                *pskb = skb;
                return RX_HANDLER_PASS;
            }
            dest = eth_hdr(skb)->h_dest;
        }
        /* fall through */
    case BR_STATE_LEARNING:
        //如果报文目的mac是br接口的mac,则设置 PACKET_HOST
        if (ether_addr_equal(p->br->dev->dev_addr, dest))
            skb->pkt_type = PACKET_HOST;
        //netfilter处理
        NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
            br_handle_frame_finish);
        break;
    default:
drop:
        kfree_skb(skb);
    }
    return RX_HANDLER_CONSUMED;
    
//广播/组播/未知单播报文flood到所有端口。
//查找到fdb表项的已知单播报文,发送到此表项的出端口。
//广播/组播/已知单播并且dst为local的报文,或者网桥设备使能了混杂模式,这几种情况都需要通过网桥设备将报文上送本机协议栈处理。
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct sk_buff *skb)
{
    const unsigned char *dest = eth_hdr(skb)->h_dest;
    struct net_bridge_port *p = br_port_get_rcu(skb->dev);
    struct net_bridge *br;
    struct net_bridge_fdb_entry *dst;
    struct net_bridge_mdb_entry *mdst;
    struct sk_buff *skb2;
    bool unicast = true;
    u16 vid = 0;

    if (!p || p->state == BR_STATE_DISABLED)
        goto drop;

    if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
        goto out;
        
    /* insert into forwarding database after filtering to avoid spoofing */
    br = p->br;
    //更新fdb表项,如果之前没有就新创建
    if (p->flags & BR_LEARNING)
        br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
    //处理组播报文
    if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
        br_multicast_rcv(br, p, skb, vid))
        goto drop;

    if (p->state == BR_STATE_LEARNING)
        goto drop;
    //将网桥设备保存到 skb 中
    BR_INPUT_SKB_CB(skb)->brdev = br->dev;

    /* The packet skb2 goes to the local host (NULL to skip). */
    //如果skb2不为空,则需要上送本地协议栈
    skb2 = NULL;
    //如果网桥设备打开了混杂模式,则设置 skb2=skb,说明需要上送本地协议栈
    if (br->dev->flags & IFF_PROMISC)
        skb2 = skb;

    dst = NULL;
    //如果是广播报文,则也设置skb2=skb,说明需要上送本地协议栈
    if (is_broadcast_ether_addr(dest)) {
        skb2 = skb;
        unicast = false;
    } else if (is_multicast_ether_addr(dest)) {
        //组播报文处理
        mdst = br_mdb_get(br, skb, vid);
        if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
            br_multicast_querier_exists(br, eth_hdr(skb))) {
            if ((mdst && mdst->mglist) ||
                br_multicast_is_router(br))
                skb2 = skb;
            br_multicast_forward(mdst, skb, skb2);
            skb = NULL;
            if (!skb2)
                goto out;
        } else
            skb2 = skb;
        unicast = false;
        br->dev->stats.multicast++;
    //根据mac和vid查找fdb,如果目的地为local,则也要设置skb2=skb,说明需要上送本地协议栈
    } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {
        skb2 = skb;
        /* Do not forward the packet since it's local. */
        skb = NULL;
    }
    //以下四种情况时,skb不为空
    //a. 广播报文
    //b. 组播报文
    //c. 单播报文,查找到了dst并且dst为非local
    //d. 单播报文,查找不到dst,未知单播
    if (skb) {
        if (dst) {
            dst->used = jiffies;
            //查找到了dst并且dst为非local的单播报文。
            //如果网桥设备没有使能混杂模式,则此时skb2为NULL
            br_forward(dst->dst, skb, skb2);
                if (should_deliver(to, skb)) {
                    if (skb0)
                        deliver_clone(to, skb, __br_forward);
                    else
                        __br_forward(to, skb);
                    return;
                }
        } else
            //广播,组播和查找不到dst的单播报文
            br_flood_forward(br, skb, skb2, unicast);
                br_flood(br, skb, skb2, __br_forward, unicast);
                    //遍历网桥上所有端口,如果端口满足条件则给此端口发送一份报文
                    list_for_each_entry_rcu(p, &br->port_list, list) {
                        /* Do not flood unicast traffic to ports that turn it off */
                        //单播报文并且端口没有使能flood(BR_FLOOD),则跳过此端口,不给它发送报文
                        if (unicast && !(p->flags & BR_FLOOD))
                            continue;
                        prev = maybe_deliver(prev, p, skb, __packet_hook);
                            if (!should_deliver(p, skb))
                                //此处判断是否应该发给此端口,满足下面三个条件
                                //此端口不是接收报文端口。或者此端口使能了 hairpin 模式(针对此端口为接收报文端口来说)
                                //并且报文满足vlan过滤条件或者vlan过滤功能关闭
                                //并且端口状态为BR_STATE_FORWARDING
                                return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
                                    br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) &&
                                    p->state == BR_STATE_FORWARDING;
                                return prev;

                            if (!prev)
                                goto out;

                            err = deliver_clone(prev, skb, __packet_hook);
                            if (err)
                                return ERR_PTR(err);
                        out:
                            return p;
                        if (IS_ERR(prev))
                            goto out;
                    }

                    if (!prev)
                        goto out;

                    if (skb0)
                        deliver_clone(prev, skb, __packet_hook);
                    else
                        __packet_hook(prev, skb);
                    return;     
    }
    //网桥设备使能了混杂模式,skb2肯定不为NULL
    //广播/组播报文
    //单播报文,查找到了dst并且dst为local
    if (skb2)
        return br_pass_frame_up(skb2);

out:
    return 0;
drop:
    kfree_skb(skb);
    goto out;
}

//将报文发送给指定出端口
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
    struct net_device *indev;

    if (skb_warn_if_lro(skb)) {
        kfree_skb(skb);
        return;
    }

    skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
    if (!skb)
        return;

    indev = skb->dev;
    //将skb中的dev换成出端口的dev
    skb->dev = to->dev;
    skb_forward_csum(skb);

    NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
        br_forward_finish);
}
/*
 * br_forward_finish - continuation used after the forward/local-out hooks.
 * @skb: frame being transmitted
 *
 * Passes the frame to the NF_BR_POST_ROUTING hook with
 * br_dev_queue_push_xmit() as the continuation and returns whatever
 * NF_HOOK() returns.
 */
int br_forward_finish(struct sk_buff *skb)
{
    int verdict;

    verdict = NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL,
              skb->dev, br_dev_queue_push_xmit);
    return verdict;
}
/*
 * br_dev_queue_push_xmit - last step of the bridge transmit path.
 * @skb: frame to transmit on skb->dev
 *
 * Frees the frame when nf_bridge_maybe_copy_header() returns non-zero or
 * when is_skb_forwardable() rejects it for skb->dev. Otherwise pushes
 * ETH_HLEN bytes back onto the skb, drops the fake rtable and queues the
 * frame with dev_queue_xmit().
 *
 * Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
    /* ip_fragment doesn't copy the MAC header */
    if (nf_bridge_maybe_copy_header(skb) ||
        !is_skb_forwardable(skb->dev, skb)) {
        kfree_skb(skb);
        return 0;
    }

    skb_push(skb, ETH_HLEN);
    br_drop_fake_rtable(skb);
    /* Send the frame out through the network device. */
    dev_queue_xmit(skb);
    return 0;
}

/*
 * br_pass_frame_up - hand a frame to the local protocol stack via the
 * bridge device.
 * @skb: frame to deliver; the bridge device is read from
 *       BR_INPUT_SKB_CB(skb)->brdev
 *
 * Returns NET_RX_DROP when the frame is rejected by the egress VLAN check
 * or when br_handle_vlan() returns NULL; otherwise returns the result of
 * the NF_BR_LOCAL_IN hook, whose continuation is netif_receive_skb().
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
    struct net_device *brdev = BR_INPUT_SKB_CB(skb)->brdev;
    struct net_bridge *br = netdev_priv(brdev);
    struct pcpu_sw_netstats *stats = this_cpu_ptr(br->stats);
    struct net_port_vlans *vlans;
    struct net_device *rx_dev;

    /* Account the frame in this CPU's receive counters of the bridge. */
    u64_stats_update_begin(&stats->syncp);
    stats->rx_packets += 1;
    stats->rx_bytes += skb->len;
    u64_stats_update_end(&stats->syncp);

    /* Bridge is just like any other port.  Make sure the
     * packet is allowed except in promisc mode when someone
     * may be running packet capture.
     */
    vlans = br_get_vlan_info(br);
    if (!((brdev->flags & IFF_PROMISC) ||
          br_allowed_egress(br, vlans, skb))) {
        kfree_skb(skb);
        return NET_RX_DROP;
    }

    rx_dev = skb->dev;
    /*
     * Replace the skb's device with the bridge device. Per the original
     * author's note, the bridge device has no br_handle_frame registered,
     * so netif_receive_skb() can pass the frame up to the protocol stack.
     */
    skb->dev = brdev;
    skb = br_handle_vlan(br, vlans, skb);
    if (skb == NULL)
        return NET_RX_DROP;

    return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, rx_dev, NULL,
               netif_receive_skb);
}

5. 网桥设备发送报文流程

处理比较简单,广播/组播/未知单播报文,flood到所有端口。
能查找到fdb表项的单播报文,从表项的出端口发送出去。
/*
 * br_dev_xmit - transmit handler for frames sent through the bridge device
 * itself (annotated walk-through, not compilable as written).
 *
 * NOTE(review): the function's own braces are omitted in this listing, and
 * the lines indented beneath a call (br_flood_deliver, br_deliver,
 * __br_deliver, NF_HOOK) appear to be abridged excerpts of that callee's
 * body rather than statements of br_dev_xmit - confirm against the kernel
 * source before reusing.
 *
 * Flow shown here:
 *   - with CONFIG_BRIDGE_NETFILTER, frames flagged BRNF_BRIDGED_DNAT are
 *     handed to br_nf_pre_routing_finish_bridge_slow() and the function
 *     returns immediately;
 *   - the per-CPU tx counters are updated and the bridge device is stored
 *     in BR_INPUT_SKB_CB(skb)->brdev;
 *   - the MAC header is reset and ETH_HLEN bytes are pulled;
 *   - if br_allowed_ingress() rejects the frame, jump to out;
 *   - broadcast destination: br_flood_deliver(..., false);
 *   - multicast destination: br_multicast_deliver() or br_flood_deliver();
 *   - unicast with an FDB entry: br_deliver() to that entry's port;
 *   - any other unicast: br_flood_deliver(..., true).
 *
 * Every path shown returns NETDEV_TX_OK.
 */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
    struct net_bridge *br = netdev_priv(dev);
    const unsigned char *dest = skb->data;
    struct net_bridge_fdb_entry *dst;
    struct net_bridge_mdb_entry *mdst;
    struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
    u16 vid = 0;

    rcu_read_lock();
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
    if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
        br_nf_pre_routing_finish_bridge_slow(skb);
        rcu_read_unlock();
        return NETDEV_TX_OK;
    }
#endif

    u64_stats_update_begin(&brstats->syncp);
    brstats->tx_packets++;
    brstats->tx_bytes += skb->len;
    u64_stats_update_end(&brstats->syncp);
    //Save the bridge device in the skb's control block
    BR_INPUT_SKB_CB(skb)->brdev = dev;

    skb_reset_mac_header(skb);
    skb_pull(skb, ETH_HLEN);
    //Does the frame pass the VLAN filter (or is VLAN filtering disabled)? If not, go to out
    if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
        goto out;

    if (is_broadcast_ether_addr(dest))
        //Broadcast frame: send it to all ports on the bridge
        br_flood_deliver(br, skb, false);
            br_flood(br, skb, NULL, __br_deliver, unicast);
    else if (is_multicast_ether_addr(dest)) {
        //Multicast handling; not analysed in detail here
        if (unlikely(netpoll_tx_running(dev))) {
            br_flood_deliver(br, skb, false);
            goto out;
        }
        if (br_multicast_rcv(br, NULL, skb, vid)) {
            kfree_skb(skb);
            goto out;
        }

        mdst = br_mdb_get(br, skb, vid);
        if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
            br_multicast_querier_exists(br, eth_hdr(skb)))
            br_multicast_deliver(mdst, skb);
        else
            br_flood_deliver(br, skb, false);
    } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
        br_deliver(dst->dst, skb);
            //FDB entry found: after the netfilter hooks, the frame is finally sent out of the NIC via dev_queue_xmit
            if (to && should_deliver(to, skb)) {
                __br_deliver(to, skb);
                    skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
                    skb->dev = to->dev;
                    NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish);
                        NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit);
                            dev_queue_xmit(skb);
    else
        //Unknown unicast
        br_flood_deliver(br, skb, true);

out:
    rcu_read_unlock();
    return NETDEV_TX_OK;


=========== End
 
原文地址:https://www.cnblogs.com/lsgxeva/p/14256458.html