switchdev (Part 3) + flower

static int bnxt_setup_flower(struct net_device *dev,
                 struct tc_cls_flower_offload *cls_flower)
{
    struct bnxt *bp = netdev_priv(dev);

    if (BNXT_VF(bp))
        return -EOPNOTSUPP;

    return bnxt_tc_setup_flower(bp, bp->pf.fw_fid, cls_flower);
}
int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid,
             struct flow_cls_offload *cls_flower)
{
    switch (cls_flower->command) {
    case FLOW_CLS_REPLACE:
        return bnxt_tc_add_flow(bp, src_fid, cls_flower);
    case FLOW_CLS_DESTROY:
        return bnxt_tc_del_flow(bp, cls_flower);
    case FLOW_CLS_STATS:
        return bnxt_tc_get_flow_stats(bp, cls_flower);
    default:
        return -EOPNOTSUPP;
    }
}
config BNXT_FLOWER_OFFLOAD
	bool "TC Flower offload support for NetXtreme-C/E"
	depends on BNXT
	default y
	---help---
	  This configuration parameter enables TC Flower packet classifier
	  offload for eswitch.  This option enables SR-IOV switchdev eswitch
	  offload.


static int bnxt_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
                  void *cb_priv)
{
    struct bnxt *bp = cb_priv;

    if (!bnxt_tc_flower_enabled(bp) ||
        !tc_cls_can_offload_and_chain0(bp->dev, type_data))
        return -EOPNOTSUPP;

    switch (type) {
    case TC_SETUP_CLSFLOWER:
        return bnxt_tc_setup_flower(bp, bp->pf.fw_fid, type_data);
    default:
        return -EOPNOTSUPP;
    }
}
static int mlx5e_rep_ndo_setup_tc(struct net_device *dev, u32 handle,
                  __be16 proto, struct tc_to_netdev *tc)
{
    struct mlx5e_priv *priv = netdev_priv(dev);

    if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
        return -EOPNOTSUPP;

    switch (tc->type) {
    case TC_SETUP_CLSFLOWER:
        switch (tc->cls_flower->command) {
        case TC_CLSFLOWER_REPLACE:
            return mlx5e_configure_flower(priv, proto, tc->cls_flower);
        case TC_CLSFLOWER_DESTROY:
            return mlx5e_delete_flower(priv, tc->cls_flower);
        case TC_CLSFLOWER_STATS:
            return mlx5e_stats_flower(priv, tc->cls_flower);
        }
    default:
        return -EOPNOTSUPP;
    }
}
static const struct net_device_ops mlx5e_netdev_ops_rep = {
    .ndo_open                = mlx5e_open,
    .ndo_stop                = mlx5e_close,
    .ndo_start_xmit          = mlx5e_xmit,
    .ndo_get_phys_port_name  = mlx5e_rep_get_phys_port_name,
    .ndo_setup_tc            = mlx5e_rep_ndo_setup_tc,
    .ndo_get_stats64         = mlx5e_get_stats,
};
static void mlx5e_build_rep_netdev(struct net_device *netdev)
{
    netdev->netdev_ops = &mlx5e_netdev_ops_rep;

    netdev->watchdog_timeo    = 15 * HZ;

    netdev->ethtool_ops      = &mlx5e_rep_ethtool_ops;

#ifdef CONFIG_NET_SWITCHDEV
    netdev->switchdev_ops = &mlx5e_rep_switchdev_ops;
#endif

    netdev->features     |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC;
    netdev->hw_features      |= NETIF_F_HW_TC;

    eth_hw_addr_random(netdev);
}

-EOPNOTSUPP (Operation not supported): the error code these handlers return to tell the TC/switchdev core that a requested offload cannot be handled.

/* called with RTNL; excerpt showing only the switchdev mark setup */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
    ...
    err = nbp_switchdev_mark_set(p);
    ...
}

int nbp_switchdev_mark_set(struct net_bridge_port *p)
{
    struct netdev_phys_item_id ppid = { };
    int err;

    ASSERT_RTNL();

    err = dev_get_port_parent_id(p->dev, &ppid, true);
    if (err) {
        if (err == -EOPNOTSUPP)
            return 0;
        return err;
    }

    p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev);

    return 0;
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/switchdev.h>

#include "br_private.h"

static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev)
{
    struct net_bridge_port *p;

    /* dev is yet to be added to the port list. */
    list_for_each_entry(p, &br->port_list, list) {
        if (switchdev_port_same_parent_id(dev, p->dev))
            return p->offload_fwd_mark;
    }

    return ++br->offload_fwd_mark;
}

int nbp_switchdev_mark_set(struct net_bridge_port *p)
{
    struct switchdev_attr attr = {
        .orig_dev = p->dev,
        .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
    };
    int err;

    ASSERT_RTNL();

    err = switchdev_port_attr_get(p->dev, &attr);
    if (err) {
        if (err == -EOPNOTSUPP)
            return 0;
        return err;
    }

    p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev);

    return 0;
}


/**
 *	switchdev_port_attr_get - Get port attribute
 *
 *	@dev: port device
 *	@attr: attribute to get
 */
int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr)
{
	const struct switchdev_ops *ops = dev->switchdev_ops;
	struct net_device *lower_dev;
	struct list_head *iter;
	struct switchdev_attr first = {
		.id = SWITCHDEV_ATTR_ID_UNDEFINED
	};
	int err = -EOPNOTSUPP;

	if (ops && ops->switchdev_port_attr_get)
		return ops->switchdev_port_attr_get(dev, attr);

	if (attr->flags & SWITCHDEV_F_NO_RECURSE)
		return err;

	/* Switch device port(s) may be stacked under
	 * bond/team/vlan dev, so recurse down to get attr on
	 * each port.  Return -ENODATA if attr values don't
	 * compare across ports.
	 */

	netdev_for_each_lower_dev(dev, lower_dev, iter) {
		err = switchdev_port_attr_get(lower_dev, attr);
		if (err)
			break;
		if (first.id == SWITCHDEV_ATTR_ID_UNDEFINED)
			first = *attr;
		else if (memcmp(&first, attr, sizeof(*attr)))
			return -ENODATA;
	}

	return err;
}
EXPORT_SYMBOL_GPL(switchdev_port_attr_get);

/* the hook lives in struct net_device (include/linux/netdevice.h): */
#ifdef CONFIG_NET_SWITCHDEV
    const struct switchdev_ops *switchdev_ops;
#endif
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
    {
        struct net_device *dev = skb->dev;          /* the transmitting net device */
        struct netdev_queue *txq;                   /* TX queue that will be chosen for this skb */
        struct Qdisc *q;                            /* queueing discipline attached to that queue */
        int rc = -ENOMEM;
        skb_reset_mac_header(skb);                  /* mark the current data offset as the MAC header */
        /* unlikely(): hint that a scheduling timestamp is rarely requested */
        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
            __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();
        skb_update_prio(skb);
        /* If device/qdisc don't need skb->dst, release it right now while
         * its hot in this cpu cache.
         */
        /* Check the netdevice flags to decide whether the skb's dst entry can be
         * dropped now; IFF_XMIT_DST_RELEASE is normally set by default in
         * alloc_netdev_mqs().
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
            skb_dst_drop(skb);
        else
            skb_dst_force(skb);
    #ifdef CONFIG_NET_SWITCHDEV
        /* Don't forward if offload device already forwarded */
        if (skb->offload_fwd_mark &&
            skb->offload_fwd_mark == dev->offload_fwd_mark) {
            consume_skb(skb);
            rc = NET_XMIT_SUCCESS;
            goto out;
        }
    #endif
        /* Pick the TX queue of this netdevice and look up its Qdisc. The Qdisc is
         * mainly there for congestion handling: normally the packet goes straight
         * to the driver, and the Qdisc only comes into play when the queue is busy.
         */
        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);
    #ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
    #endif
        trace_net_dev_queue(skb);
        /* If the Qdisc has an enqueue method, call __dev_xmit_skb() and take the
         * congestion-controlled path. Note that even on this path the skb is not
         * necessarily enqueued; the Qdisc enqueue only happens when the device is busy.
         */
        if (q->enqueue) {
            rc = __dev_xmit_skb(skb, q, dev, txq);
            goto out;
        }
        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...
           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.
           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
        */
        /* There is no Qdisc enqueue method (common for loopback/tunnel interfaces);
         * just check whether the device is up.
         */
        if (dev->flags & IFF_UP) {
           int cpu = smp_processor_id(); /* ok because BHs are off */
            if (txq->xmit_lock_owner != cpu) {
                if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                    goto recursion_alert;
                skb = validate_xmit_skb(skb, dev);
                if (!skb)
                    goto drop;
                HARD_TX_LOCK(dev, txq, cpu);
                /* If the TX queue is not stopped, hand the skb to the driver via dev_hard_start_xmit() */
                if (!netif_xmit_stopped(txq)) {
                    __this_cpu_inc(xmit_recursion);
                    skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                    __this_cpu_dec(xmit_recursion);
                    if (dev_xmit_complete(rc)) {
                        HARD_TX_UNLOCK(dev, txq);
                        goto out;
                    }
                }
                HARD_TX_UNLOCK(dev, txq);
                net_crit_ratelimited("Virtual device %s asks to queue packet!
",
                             dev->name);
            } else {
                /* Recursion is detected! It is possible,
                 * unfortunately
                 */
    recursion_alert:
                net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!
",
                             dev->name);
            }
        }
        rc = -ENETDOWN;
    drop:
        rcu_read_unlock_bh();
        atomic_long_inc(&dev->tx_dropped);
        kfree_skb_list(skb);
        return rc;
    out:
        rcu_read_unlock_bh();
        return rc;
    }
Let's start with a few important APIs:

netif_napi_add   -- the driver tells the kernel it wants to use the NAPI mechanism, initializes the relevant parameters, and registers the poll callback.

napi_schedule    -- the driver tells the kernel to start scheduling NAPI; the poll callback will be invoked shortly afterwards.

napi_complete    -- the driver tells the kernel that its workload is light (interrupts are infrequent and there is not much data); this updates the NAPI state machine so that subsequent packets are handled in pure interrupt mode again.

net_rx_action    -- the softirq handler the kernel registers at initialization; it is what ends up calling the poll callback the driver registered.

Used like this, the API is not particularly complicated; a minimal driver-side sketch follows.
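Below is a minimal sketch of how a driver normally wires these calls together. The mydrv_* names, the private structure and the helper stubs are hypothetical placeholders for real hardware access; only netif_napi_add/napi_enable/napi_schedule/napi_complete_done are real kernel API, shown here in the same 4-argument netif_napi_add() form the rocker code below uses.

#include <linux/netdevice.h>
#include <linux/interrupt.h>

struct mydrv_priv {                      /* hypothetical per-device private data */
    struct napi_struct napi;
    struct net_device *netdev;
};

/* Hypothetical hardware helpers -- stand-ins for the real register accesses. */
static bool mydrv_rx_one_packet(struct mydrv_priv *priv) { return false; }
static void mydrv_enable_rx_irq(struct mydrv_priv *priv) { }
static void mydrv_disable_rx_irq(struct mydrv_priv *priv) { }

static int mydrv_poll(struct napi_struct *napi, int budget)
{
    struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
    int done = 0;

    /* Drain up to 'budget' packets from the RX ring. */
    while (done < budget && mydrv_rx_one_packet(priv))
        done++;

    if (done < budget) {
        /* Ring drained: leave polling mode and re-enable the RX interrupt. */
        napi_complete_done(napi, done);
        mydrv_enable_rx_irq(priv);
    }
    return done;
}

static irqreturn_t mydrv_isr(int irq, void *dev_id)
{
    struct mydrv_priv *priv = dev_id;

    mydrv_disable_rx_irq(priv);          /* mask further RX interrupts */
    napi_schedule(&priv->napi);          /* enqueue this NAPI instance, raise NET_RX_SOFTIRQ */
    return IRQ_HANDLED;
}

static void mydrv_setup_napi(struct mydrv_priv *priv)
{
    /* Register the poll callback and enable NAPI, typically at probe time. */
    netif_napi_add(priv->netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
    napi_enable(&priv->napi);
}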

[Figure: rough cause-and-effect sketch of the NAPI receive path -- not reproduced here]
Note that the figure above is not a software flow chart, only a rough picture of cause and effect; here is what actually happens, step by step.

1. The hardware ISR fires -- in practice a DMA-completion interrupt telling the CPU that the data has been moved. The interrupt eventually invokes the handler the driver registered with the kernel, for example its PCI interrupt handler.

2. In that IRQ handler the driver usually disables and clears the IRQ (for example, masking the PCI interrupt) and then calls napi_schedule(). All napi_schedule() does is enqueue the NAPI instance and raise a NET_RX_SOFTIRQ event (see the kernel-internal helper sketched right after this list).

3. Shortly afterwards, once the interrupt top half finishes and the bottom half runs, __do_softirq() finds the previously raised NET_RX_SOFTIRQ pending and invokes its callback, net_rx_action(), which in turn calls the poll function the driver registered.
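For reference, steps 2 and 3 hinge on this small kernel-internal helper in net/core/dev.c (shown roughly as it appeared in kernels of this era); napi_schedule() ends up here:

static inline void ____napi_schedule(struct softnet_data *sd,
                     struct napi_struct *napi)
{
    /* step 2: put this NAPI instance on the per-CPU poll list */
    list_add_tail(&napi->poll_list, &sd->poll_list);
    /* step 2: mark NET_RX_SOFTIRQ pending so net_rx_action() runs in the bottom half */
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}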



#define ROCKER_PORT_MIN_MTU    ETH_MIN_MTU
#define ROCKER_PORT_MAX_MTU    9000
static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
{
    struct pci_dev *pdev = rocker->pdev;
    struct rocker_port *rocker_port;
    struct net_device *dev;
    int err;

    dev = alloc_etherdev(sizeof(struct rocker_port));
    if (!dev)
        return -ENOMEM;
    SET_NETDEV_DEV(dev, &pdev->dev);
    rocker_port = netdev_priv(dev);
    rocker_port->dev = dev;
    rocker_port->rocker = rocker;
    rocker_port->port_number = port_number;
    rocker_port->pport = port_number + 1;

    err = rocker_world_check_init(rocker_port);
    if (err) {
        dev_err(&pdev->dev, "world init failed
");
        goto err_world_check_init;
    }

    rocker_port_dev_addr_init(rocker_port);
    dev->netdev_ops = &rocker_port_netdev_ops;
    dev->ethtool_ops = &rocker_port_ethtool_ops;
    netif_tx_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx,
              NAPI_POLL_WEIGHT);
    netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx,
               NAPI_POLL_WEIGHT);
    rocker_carrier_init(rocker_port);

    dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG;

    /* MTU range: 68 - 9000 */
    dev->min_mtu = ROCKER_PORT_MIN_MTU;
    dev->max_mtu = ROCKER_PORT_MAX_MTU;

    err = rocker_world_port_pre_init(rocker_port);
    if (err) {
        dev_err(&pdev->dev, "port world pre-init failed
");
        goto err_world_port_pre_init;
    }
    err = register_netdev(dev);
    if (err) {
        dev_err(&pdev->dev, "register_netdev failed
");
        goto err_register_netdev;
    }
    rocker->ports[port_number] = rocker_port;

    err = rocker_world_port_init(rocker_port);
    if (err) {
        dev_err(&pdev->dev, "port world init failed
");
        goto err_world_port_init;
    }

    return 0;

err_world_port_init:
    rocker->ports[port_number] = NULL;
    unregister_netdev(dev);
err_register_netdev:
    rocker_world_port_post_fini(rocker_port);
err_world_port_pre_init:
err_world_check_init:
    free_netdev(dev);
    return err;
}
static int rocker_port_poll_rx(struct napi_struct *napi, int budget)
{
    struct rocker_port *rocker_port = rocker_port_napi_rx_get(napi);
    const struct rocker *rocker = rocker_port->rocker;
    struct rocker_desc_info *desc_info;
    u32 credits = 0;
    int err;

    /* Process rx descriptors */
    while (credits < budget &&
           (desc_info = rocker_desc_tail_get(&rocker_port->rx_ring))) {
        err = rocker_desc_err(desc_info);
        if (err) {
            if (net_ratelimit())
                netdev_err(rocker_port->dev, "rx desc received with err %d
",
                       err);
        } else {
            err = rocker_port_rx_proc(rocker, rocker_port,
                          desc_info);
            if (err && net_ratelimit())
                netdev_err(rocker_port->dev, "rx processing failed with err %d
",
                       err);
        }
        if (err)
            rocker_port->dev->stats.rx_errors++;

        rocker_desc_gen_clear(desc_info);
        rocker_desc_head_set(rocker, &rocker_port->rx_ring, desc_info);
        credits++;
    }

    if (credits < budget)
        napi_complete_done(napi, credits);

    rocker_dma_ring_credits_set(rocker, &rocker_port->rx_ring, credits);

    return credits;
}
static int rocker_port_rx_proc(const struct rocker *rocker,
                   const struct rocker_port *rocker_port,
                   struct rocker_desc_info *desc_info)
{
    const struct rocker_tlv *attrs[ROCKER_TLV_RX_MAX + 1];
    struct sk_buff *skb = rocker_desc_cookie_ptr_get(desc_info);
    size_t rx_len;
    u16 rx_flags = 0;

    if (!skb)
        return -ENOENT;

    rocker_tlv_parse_desc(attrs, ROCKER_TLV_RX_MAX, desc_info);
    if (!attrs[ROCKER_TLV_RX_FRAG_LEN])
        return -EINVAL;
    if (attrs[ROCKER_TLV_RX_FLAGS])
        rx_flags = rocker_tlv_get_u16(attrs[ROCKER_TLV_RX_FLAGS]);

    rocker_dma_rx_ring_skb_unmap(rocker, attrs);

    rx_len = rocker_tlv_get_u16(attrs[ROCKER_TLV_RX_FRAG_LEN]);
    skb_put(skb, rx_len);
    skb->protocol = eth_type_trans(skb, rocker_port->dev);

    if (rx_flags & ROCKER_RX_FLAGS_FWD_OFFLOAD)
        skb->offload_fwd_mark = 1;

    rocker_port->dev->stats.rx_packets++;
    rocker_port->dev->stats.rx_bytes += skb->len;

    netif_receive_skb(skb);

    return rocker_dma_rx_ring_skb_alloc(rocker_port, desc_info);
}
https://gitlab.com/kalilinux/packages/linux/-/blob/a17bad0db9da44cd73f594794a58cc5646393b13/drivers/net/ethernet/rocker/rocker_main.c
Original article: https://www.cnblogs.com/dream397/p/12842097.html