mbuf multi-segment TX + offload

Reference: DPDK Technology Stack Best Practices in Telecom Cloud (Part 3)

 https://kkutysllb.cn/2019/05/01/DPDK%E6%8A%80%E6%9C%AF%E6%A0%88%E5%9C%A8%E7%94%B5%E4%BF%A1%E4%BA%91%E4%B8%AD%E7%9A%84%E6%9C%80%E4%BD%B3%E5%AE%9E%E8%B7%B5%EF%BC%88%E4%B8%89%EF%BC%89/

First, check which offload features the host NIC advertises with ethtool:

ethtool -k enp6s0
Features for enp6s0:
rx-checksumming: off [fixed]
tx-checksumming: off
        tx-checksum-ipv4: off [fixed]
        tx-checksum-ip-generic: off [fixed]
        tx-checksum-ipv6: off [fixed]
        tx-checksum-fcoe-crc: off [fixed]
        tx-checksum-sctp: off [fixed]
scatter-gather: on
        tx-scatter-gather: on
        tx-scatter-gather-fraglist: off [fixed]
tcp-segmentation-offload: off
        tx-tcp-segmentation: off [fixed]
        tx-tcp-ecn-segmentation: off [fixed]
        tx-tcp-mangleid-segmentation: off [fixed]
        tx-tcp6-segmentation: off [fixed]
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: on
large-receive-offload: off [fixed]
rx-vlan-offload: off [fixed]
tx-vlan-offload: off [fixed]
ntuple-filters: off [fixed]
receive-hashing: off [fixed]
highdma: on
rx-vlan-filter: on [fixed]
vlan-challenged: off [fixed]
tx-lockless: off [fixed]
netns-local: off [fixed]
tx-gso-robust: off [fixed]
tx-fcoe-segmentation: off [fixed]
tx-gre-segmentation: off [fixed]
tx-gre-csum-segmentation: off [fixed]
tx-ipxip4-segmentation: off [fixed]
tx-ipxip6-segmentation: off [fixed]
tx-udp_tnl-segmentation: off [fixed]
tx-udp_tnl-csum-segmentation: off [fixed]
tx-gso-partial: off [fixed]
tx-sctp-segmentation: off [fixed]
tx-esp-segmentation: off [fixed]
fcoe-mtu: off [fixed]
tx-nocache-copy: off
loopback: off [fixed]
rx-fcs: off [fixed]
rx-all: off [fixed]
tx-vlan-stag-hw-insert: off [fixed]
rx-vlan-stag-hw-parse: off [fixed]
rx-vlan-stag-filter: off [fixed]
l2-fwd-offload: off [fixed]
hw-tc-offload: off [fixed]
esp-hw-offload: off [fixed]
esp-tx-csum-hw-offload: off [fixed]
rx-udp_tunnel-port-offload: off [fixed]

To transmit chained (multi-segment) mbufs, the port's TX mode must request the multi-segment offload:

.txmode = {
    .offloads = DEV_TX_OFFLOAD_MULTI_SEGS,
},

A brief overview of how DMA works

During a DMA transfer, the DMA controller takes direct control of the bus, so there is a bus-ownership handover: before the transfer starts, the CPU hands bus control to the DMA controller, and as soon as the transfer finishes, the DMA controller must hand bus control back to the CPU. A complete DMA transfer goes through four steps: DMA request, DMA acknowledge, DMA transfer, and DMA completion.

Scatter-gather DMA vs. block DMA

A traditional block DMA can only transfer one physically contiguous block of data per operation and raises an interrupt when that transfer completes. Scatter-gather DMA can transfer several physically non-contiguous blocks in a single operation and raises only one interrupt when the whole transfer is done.

A traditional block DMA transfer looks like this: (figure omitted)

An advanced scatter-gather DMA transfer looks like this: (figure omitted)

The benefit is obvious: the number of interrupts drops sharply, which improves data-transfer efficiency.
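Conceptually, the hardware consumes a list of (address, length) descriptors and signals completion once for the whole list. The struct below is purely illustrative; it is not the layout of any real controller:

#include <stdint.h>

/* Illustrative scatter-gather descriptor (hypothetical layout, not any real
 * device): each entry describes one physically contiguous buffer; the
 * controller walks the list and raises a single interrupt after the entry
 * flagged as the last one completes. */
struct sg_desc {
    uint64_t buf_phys_addr;   /* physical address of this segment */
    uint32_t length;          /* bytes to transfer from this segment */
    uint32_t flags;           /* e.g. a "last descriptor" marker */
};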

Applications of scatter-gather DMA

In its IP fragmentation handling, DPDK uses what is often described as a zero-copy technique, and the mechanism that supports it underneath is scatter-gather DMA. DPDK manages fragmented packets as a chain: the data of a single packet is scattered across non-contiguous blocks (mbuf structures), so one DMA operation has to gather data from several non-contiguous blocks. The transmit path of the e1000 driver illustrates this:

uint16_t
eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
        uint16_t nb_pkts)
{
    // excerpt from the e1000 PMD transmit path
    ...
    m_seg = tx_pkt;
    do {
        txd = &txr[tx_id];
        txn = &sw_ring[txe->next_id];
 
        if (txe->mbuf != NULL)
            rte_pktmbuf_free_seg(txe->mbuf);
        txe->mbuf = m_seg;
 
        /*
        * Set up Transmit Data Descriptor.
        */
        slen = m_seg->data_len;
        buf_dma_addr = rte_mbuf_data_iova(m_seg);
 
        txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
        txd->lower.data = rte_cpu_to_le_32(cmd_type_len | slen);
        txd->upper.data = rte_cpu_to_le_32(popts_spec);
 
        txe->last_id = tx_last;
        tx_id = txe->next_id;
        txe = txn;
        m_seg = m_seg->next;
    } while (m_seg != NULL);
 
    /*
    * The last packet data descriptor needs End Of Packet (EOP)
    */
    cmd_type_len |= E1000_TXD_CMD_EOP;
    txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
    txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
    ...
}
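On the application side, the m_seg->next chain that the driver walks above is built by chaining mbufs. A minimal sketch, assuming an already-created mempool mp; build_two_segment_pkt() is a hypothetical helper, while rte_pktmbuf_alloc(), rte_pktmbuf_append() and rte_pktmbuf_chain() are standard mbuf APIs:

#include <string.h>
#include <rte_mbuf.h>

/* Sketch: build a two-segment packet. The PMD TX loop above then emits one
 * descriptor per segment by following m_seg->next. */
static struct rte_mbuf *
build_two_segment_pkt(struct rte_mempool *mp,
                      const void *hdr, uint16_t hdr_len,
                      const void *payload, uint16_t pay_len)
{
    struct rte_mbuf *head = rte_pktmbuf_alloc(mp);
    struct rte_mbuf *tail = rte_pktmbuf_alloc(mp);
    char *p;

    if (head == NULL || tail == NULL)
        goto err;

    /* Copy each piece of data into its own segment. */
    p = rte_pktmbuf_append(head, hdr_len);
    if (p == NULL)
        goto err;
    memcpy(p, hdr, hdr_len);

    p = rte_pktmbuf_append(tail, pay_len);
    if (p == NULL)
        goto err;
    memcpy(p, payload, pay_len);

    /* Link tail after head; head's pkt_len and nb_segs are updated. */
    if (rte_pktmbuf_chain(head, tail) != 0)
        goto err;

    return head;

err:
    rte_pktmbuf_free(head);   /* NULL-safe */
    rte_pktmbuf_free(tail);
    return NULL;
}

Handing such a chained mbuf to rte_eth_tx_burst() is exactly why the txmode fragment earlier requests DEV_TX_OFFLOAD_MULTI_SEGS.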
Moving on to checksum offload: enabling DEV_TX_OFFLOAD_IPV4_CKSUM only at TX queue setup fails, because the requested offload is not within the per-queue offload capabilities the PMD reports:

  txq_conf.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
    ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
                                 rte_eth_dev_socket_id(portid),
                                 &txq_conf);
Ethdev port_id=0 tx_queue_id=0, new added offloads 0x2 must be within per-queue offload capabilities 0x0 in rte_eth_tx_queue_setup()
EAL: Error - exiting with code: 1
  Cause: rte_eth_tx_queue_setup:err=-22, port=0

The fix is to request the offload at device-configure time instead, and to set the checksum flag on each transmitted mbuf:

local_port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);

// software checksum is no longer needed once the NIC computes it
//ip_h->hdr_checksum = ipv4_hdr_cksum(ip_h);
ip_h->hdr_checksum = 0;
pkt->ol_flags |= PKT_TX_IP_CKSUM;
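Putting those pieces together, here is a hedged sketch of the per-packet side of the IPv4 checksum offload, using DPDK 19.11 flag and struct names; prepare_ipv4_cksum_offload() and the assumption of an untagged Ethernet/IPv4 frame are mine, not from the original code:

#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

/* Sketch: mark one outgoing IPv4 packet so that a port configured with
 * DEV_TX_OFFLOAD_IPV4_CKSUM fills in the IP header checksum.
 * Assumes a plain (untagged) Ethernet + IPv4 frame. */
static void
prepare_ipv4_cksum_offload(struct rte_mbuf *pkt)
{
    struct rte_ipv4_hdr *ip_h;

    /* The PMD needs the L2/L3 header lengths to locate the IP header. */
    pkt->l2_len = sizeof(struct rte_ether_hdr);
    ip_h = rte_pktmbuf_mtod_offset(pkt, struct rte_ipv4_hdr *, pkt->l2_len);
    pkt->l3_len = (ip_h->version_ihl & RTE_IPV4_HDR_IHL_MASK) *
                  RTE_IPV4_IHL_MULTIPLIER;

    /* Zero the checksum field and let the hardware compute it. */
    ip_h->hdr_checksum = 0;
    pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
}

The gdb session below, with a breakpoint on hinic_tx_offload_pkt_prepare(), shows the hinic PMD consuming these flags.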
Breakpoint 1, hinic_tx_offload_pkt_prepare (m=0x13e82a480, off_info=0xffffbd40cd28)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:794
794             u16 eth_type = 0;
(gdb) bt
#0  hinic_tx_offload_pkt_prepare (m=0x13e82a480, off_info=0xffffbd40cd28)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:794
#1  0x000000000078a5cc in hinic_get_sge_txoff_info (mbuf_pkt=0x13e82a480, sqe_info=0xffffbd40cd38, 
    off_info=0xffffbd40cd28) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:991
#2  0x000000000078a890 in hinic_xmit_pkts (tx_queue=0x13e7e7000, tx_pkts=0xffffbd40ce08, nb_pkts=1)
(gdb) s
796             uint64_t ol_flags = m->ol_flags;
(gdb) list
791             struct rte_udp_hdr *udp_hdr;
792             struct rte_ether_hdr *eth_hdr;
793             struct rte_vlan_hdr *vlan_hdr;
794             u16 eth_type = 0;
795             uint64_t inner_l3_offset;
796             uint64_t ol_flags = m->ol_flags;
797
798             /* Check if the packets set available offload flags */
799             if (!(ol_flags & HINIC_TX_CKSUM_OFFLOAD_MASK))
800                     return 0;
(gdb) n
799             if (!(ol_flags & HINIC_TX_CKSUM_OFFLOAD_MASK))
(gdb) n
800                     return 0;
(gdb) n
978     }
(gdb) n
hinic_get_sge_txoff_info (mbuf_pkt=0x13e82a480, sqe_info=0xffffbd40cd38, off_info=0xffffbd40cd28)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:992
992             if (unlikely(ret))
(gdb) n
995             sqe_info->cpy_mbuf_cnt = 0;
(gdb) n
998             if (likely(!(mbuf_pkt->ol_flags & PKT_TX_TCP_SEG))) {
(gdb) n
999                     if (unlikely(mbuf_pkt->pkt_len > MAX_SINGLE_SGE_SIZE)) {
(gdb) n
1002                    } else if (unlikely(HINIC_NONTSO_SEG_NUM_INVALID(sge_cnt))) {
(gdb) n
1024                    sqe_info->sge_cnt = sge_cnt;
(gdb) n
1037            return true;
(gdb) n
1038    }
(gdb) n
hinic_xmit_pkts (tx_queue=0x13e7e7000, tx_pkts=0xffffbd40ce08, nb_pkts=1)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:1093
1093                    wqe_wqebb_cnt = HINIC_SQ_WQEBB_CNT(sqe_info.sge_cnt);
(gdb) n
1094                    free_wqebb_cnt = HINIC_GET_SQ_FREE_WQEBBS(txq);
(gdb) n
1095                    if (unlikely(wqe_wqebb_cnt > free_wqebb_cnt)) {
(gdb) n
1108                    sq_wqe = hinic_get_sq_wqe(txq, wqe_wqebb_cnt, &sqe_info);
(gdb) n
1111                    if (unlikely(!hinic_mbuf_dma_map_sge(txq, mbuf_pkt,
(gdb) n
1121                    task = &sq_wqe->task;
(gdb) n
1124                    hinic_fill_tx_offload_info(mbuf_pkt, task, &queue_info,
(gdb) n
1128                    tx_info = &txq->tx_info[sqe_info.pi];
(gdb) n
1129                    tx_info->mbuf = mbuf_pkt;
(gdb) n
1130                    tx_info->wqebb_cnt = wqe_wqebb_cnt;
(gdb) n
1133                    hinic_fill_sq_wqe_header(&sq_wqe->ctrl, queue_info,
(gdb) n
1134                                             sqe_info.sge_cnt, sqe_info.owner);
(gdb) n
1133                    hinic_fill_sq_wqe_header(&sq_wqe->ctrl, queue_info,
(gdb) c
Continuing.
Breakpoint 1, hinic_tx_offload_pkt_prepare (m=0x13e82ac00, off_info=0xffffc357c558)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:794
794             u16 eth_type = 0;
(gdb) s
796             uint64_t ol_flags = m->ol_flags;
(gdb) n
799             if (!(ol_flags & HINIC_TX_CKSUM_OFFLOAD_MASK))
(gdb) n
803             if ((ol_flags & PKT_TX_TUNNEL_MASK) &&
(gdb) n
812             if (ol_flags & PKT_TX_TUNNEL_VXLAN) {
(gdb) n
847                     inner_l3_offset = m->l2_len;
(gdb) n
848                     off_info->inner_l2_len = m->l2_len;
(gdb) n
849                     off_info->inner_l3_len = m->l3_len;
(gdb) n
850                     off_info->inner_l4_len = m->l4_len;
(gdb) n
851                     off_info->tunnel_type = NOT_TUNNEL;
(gdb) n
853                     hinic_get_pld_offset(m, off_info,
(gdb) n
858             if (unlikely(off_info->payload_offset > MAX_PLD_OFFSET))
(gdb) n
862             if ((ol_flags & PKT_TX_TUNNEL_VXLAN) && ((ol_flags & PKT_TX_TCP_SEG) ||
(gdb) n
901             } else if (ol_flags & PKT_TX_OUTER_IPV4) {
(gdb) n
907             if (ol_flags & PKT_TX_IPV4)
(gdb) n
908                     off_info->inner_l3_type = (ol_flags & PKT_TX_IP_CKSUM) ?
(gdb) n
915             if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
(gdb) n
942             } else if (((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM) ||
(gdb) n
943                             (ol_flags & PKT_TX_TCP_SEG)) {
(gdb) n
942             } else if (((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM) ||
(gdb) n
971             } else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_SCTP_CKSUM) {
(gdb) n
977             return 0;
(gdb) n
978     }
(gdb) n
hinic_get_sge_txoff_info (mbuf_pkt=0x13e82ac00, sqe_info=0xffffc357c568, off_info=0xffffc357c558)
    at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:992
992             if (unlikely(ret))
(gdb) 

In this trace, the first packet carries no checksum-offload flags, so hinic_tx_offload_pkt_prepare() returns 0 early at line 800; the second packet has PKT_TX_IP_CKSUM set, so it takes the full offload path, filling in inner_l3_type and the payload offset before the send-queue WQE is built.

Original source: https://www.cnblogs.com/dream397/p/13677950.html