remap_pfn_range: mapping BAR space into user space (pci_map_device)

Most of the Linux PCI driver tutorials online are not very useful: they go on about PCI configuration space and then stop. But configuration space is the easiest part to access; it mostly matters when the kernel scans PCI devices at boot. What a PCI driver actually uses, far more often, is the device's I/O space and memory space.
I used to know only that BAR0-BAR5 in a device's configuration space hold the base addresses of its I/O or memory regions, but how do you tell whether a given BAR describes I/O space or memory address space?
The PCI NIC sample driver (pci-skeleton.c) does it like this:

pio_start = pci_resource_start(pdev, 0);
pio_end = pci_resource_end(pdev, 0);
pio_flags = pci_resource_flags(pdev, 0);
pio_len = pci_resource_len(pdev, 0);

mmio_start = pci_resource_start(pdev, 1);
mmio_end = pci_resource_end(pdev, 1);
mmio_flags = pci_resource_flags(pdev, 1);
mmio_len = pci_resource_len(pdev, 1);

/* make sure PCI base addr 0 is PIO */
if (!(pio_flags & IORESOURCE_IO)) {
        dev_err(&pdev->dev, "region #0 not a PIO resource, aborting\n");
        rc = -ENODEV;
        goto err_out;
}

/* make sure PCI base addr 1 is MMIO */
if (!(mmio_flags & IORESOURCE_MEM)) {
        dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n");
        rc = -ENODEV;
        goto err_out;
}

As you can see, if all you are writing is the driver, the kernel has already worked out each BAR's attributes while scanning the PCI bus. And of course you can always ask the hardware engineer who built the device how many BARs it has and whether each one is I/O or memory space.
So how does the kernel obtain these flags? I traced the source for quite a while without finding it. What I do know is that the PCI specification defines the probing procedure: reading a BAR directly returns the region's base address, while writing all 1s to the BAR and reading it back yields the region's size and attributes. Find the lowest set bit: if you read back 0xFFFFFF00, the region is 0x100 bytes. Bit 0 gives the type: 0 means a memory region, 1 means the BAR is I/O space.
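As a minimal sketch of that procedure (not the kernel's actual probe code, which must also disable decoding while sizing the BAR), assuming a valid struct pci_dev *pdev:

u32 bar, size_mask, sz;
u64 bar_size;
int pos = PCI_BASE_ADDRESS_0;                   /* BAR0 sits at config offset 0x10 */

pci_read_config_dword(pdev, pos, &bar);         /* save the original value */
pci_write_config_dword(pdev, pos, 0xFFFFFFFF);  /* write all 1s */
pci_read_config_dword(pdev, pos, &size_mask);   /* read back the size mask */
pci_write_config_dword(pdev, pos, bar);         /* restore the original */

if (bar & PCI_BASE_ADDRESS_SPACE_IO)            /* bit 0 == 1: I/O space */
        sz = size_mask & (u32)PCI_BASE_ADDRESS_IO_MASK;
else                                            /* bit 0 == 0: memory space */
        sz = size_mask & (u32)PCI_BASE_ADDRESS_MEM_MASK;

bar_size = (u64)~sz + 1;                        /* e.g. 0xFFFFFF00 -> 0x100 bytes */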

Another crucial point: the value read from a BAR is an address in PCI (bus) address space, which is not the same thing as a memory address as the CPU sees it. On x86 without an IOMMU the two usually happen to be equal, but on other architectures such as PowerPC they can differ.
So the correct way to use a BAR region is:

pciaddr = pci_resource_start(pdev, 1);  /* CPU physical address */
if (pciaddr != 0)
        vaddr = ioremap(pciaddr, xx_SIZE);      /* vaddr: void __iomem * */

The wrong way:

pci_read_config_dword(pdev, PCI_BASE_ADDRESS_1, &pciaddr);      /* raw bus address! */
ioremap(pciaddr, xx_SIZE);
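Putting the two together, a minimal hedged probe() fragment (BAR index 1 and the xx_* names are placeholders for a hypothetical device, not any real driver):

static void __iomem *xx_regs;

static int xx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        resource_size_t pciaddr;
        int rc;

        rc = pci_enable_device(pdev);
        if (rc)
                return rc;

        if (!(pci_resource_flags(pdev, 1) & IORESOURCE_MEM))
                return -ENODEV;

        pciaddr = pci_resource_start(pdev, 1);  /* CPU physical address */
        xx_regs = ioremap(pciaddr, pci_resource_len(pdev, 1));
        if (!xx_regs)
                return -ENOMEM;

        return 0;
}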


The same BAR and DMA plumbing can be driven entirely from user space through VFIO. The canonical flow, from the kernel's VFIO documentation, is: open a container, add an IOMMU group to it, select an IOMMU model, map DMA memory, then obtain a device fd:
int container, group, device, i;
struct vfio_group_status group_status =
                { .argsz = sizeof(group_status) };
struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };

/* Create a new container */
container = open("/dev/vfio/vfio", O_RDWR);

if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
    /* Unknown API version */

if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
    /* Doesn't support the IOMMU driver we want. */

/* Open the group */
group = open("/dev/vfio/26", O_RDWR);

/* Test the group is viable and available */
ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);

if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE))
    /* Group is not viable (ie, not all devices bound for vfio) */

/* Add the group to the container */
ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

/* Enable the IOMMU model we want */
ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

/* Get addition IOMMU info */
ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);

/* Allocate some space and setup a DMA mapping */
dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
dma_map.size = 1024 * 1024;
dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);

/* Get a file descriptor for the device */
device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

/* Test and setup the device */
ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);

for (i = 0; i < device_info.num_regions; i++) {
    struct vfio_region_info reg = { .argsz = sizeof(reg) };

    reg.index = i;

    ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);

    /* Setup mappings... read/write offsets, mmaps
     * For PCI devices, config space is a region */
}

for (i = 0; i < device_info.num_irqs; i++) {
    struct vfio_irq_info irq = { .argsz = sizeof(irq) };

    irq.index = i;

    ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);

    /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */
}

/* Gratuitous device reset and go... */
ioctl(device, VFIO_DEVICE_RESET);


The code above allocates and maps IOMMU DMA memory. That memory is only meaningful once the device can use it, so the example first obtains a VFIO device file descriptor, then queries the device's PCI BAR and IRQ information through that descriptor; the same fd can also be used to reset the device.
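The example stops at "Setup IRQs..." without showing the wiring itself. A hedged sketch of that step, attaching an eventfd to interrupt index 0 with VFIO_DEVICE_SET_IRQS (the index and the single-vector layout are assumptions, not from the original):

#include <string.h>
#include <sys/eventfd.h>

struct vfio_irq_set *irq_set;
char irq_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
int efd = eventfd(0, 0);

irq_set = (struct vfio_irq_set *)irq_buf;
irq_set->argsz = sizeof(irq_buf);
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = 0;             /* e.g. VFIO_PCI_MSIX_IRQ_INDEX for MSI-X */
irq_set->start = 0;
irq_set->count = 1;
memcpy(irq_set->data, &efd, sizeof(int));

ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
/* a driver thread can now block in read(efd, ...) to wait for interrupts */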

librte_eal/linux/eal/eal_vfio.h:27:#define RTE_VFIO_TYPE1    VFIO_TYPE1_IOMMU



VFIO_SET_IOMMU

const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
        unsigned idx;
        for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
                const struct vfio_iommu_type *t = &iommu_types[idx];

                int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
                                t->type_id);
                if (!ret) {
                        RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
                                        t->type_id, t->name);
                        return t;
                }
                /* not an error, there may be more supported IOMMU types */
                RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
                                "error %i (%s)\n", t->type_id, t->name, errno,
                                strerror(errno));
        }
        /* if we didn't find a suitable IOMMU type, fail */
        return NULL;
}




dma_mem_map -------------VFIO_IOMMU_MAP_DMA
static int
vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        uint64_t len, int do_map)
{
    struct vfio_iommu_type1_dma_map dma_map;
    struct vfio_iommu_type1_dma_unmap dma_unmap;
    int ret;

    if (do_map != 0) {
        memset(&dma_map, 0, sizeof(dma_map));
        dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
        dma_map.vaddr = vaddr;
        dma_map.size = len;
        dma_map.iova = iova;
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
                VFIO_DMA_MAP_FLAG_WRITE;

        /* VFIO_IOMMU_MAP_DMA maps iova, through the IOMMU, onto the
         * physical pages backing vaddr. */
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
        if (ret) {
            /**
             * In case the mapping was already done EEXIST will be
             * returned from kernel.
             */
            if (errno == EEXIST) {
                RTE_LOG(DEBUG, EAL,
                    " Memory segment is already mapped,"
                    " skipping\n");
            } else {
                RTE_LOG(ERR, EAL,
                    "  cannot set up DMA remapping,"
                    " error %i (%s)\n",
                    errno, strerror(errno));
                return -1;
            }
        }
    } else {
        memset(&dma_unmap, 0, sizeof(dma_unmap));
        dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
        dma_unmap.size = len;
        dma_unmap.iova = iova;

        ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
                &dma_unmap);
        if (ret) {
            RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
                    errno, strerror(errno));
            return -1;
        }
    }

    return 0;
}
On the kernel side, vfio-pci implements the mmap of an MMIO BAR with remap_pfn_range:

static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
    struct vfio_pci_device *vdev = device_data;
    struct pci_dev *pdev = vdev->pdev;
    unsigned int index;
    u64 phys_len, req_len, pgoff, req_start;
    int ret;
    index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
    if (vma->vm_end < vma->vm_start)
        return -EINVAL;
    if ((vma->vm_flags & VM_SHARED) == 0)
        return -EINVAL;
    if (index >= VFIO_PCI_ROM_REGION_INDEX)
        return -EINVAL;
    if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
        return -EINVAL;
    phys_len = pci_resource_len(pdev, index);
    req_len = vma->vm_end - vma->vm_start;
    pgoff = vma->vm_pgoff &
        ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
    req_start = pgoff << PAGE_SHIFT;
    if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
        return -EINVAL;
    if (index == vdev->msix_bar) {
        /*
         * Disallow mmaps overlapping the MSI-X table; users don't
         * get to touch this directly.  We could find somewhere
         * else to map the overlap, but page granularity is only
         * a recommendation, not a requirement, so the user needs
         * to know which bits are real.  Requiring them to mmap
         * around the table makes that clear.
         */
        /* If neither entirely above nor below, then it overlaps */
        if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
              req_start + req_len <= vdev->msix_offset))
            return -EINVAL;
    }
    /*
     * Even though we don't make use of the barmap for the mmap,
     * we need to request the region and the barmap tracks that.
     */
    if (!vdev->barmap[index]) {
        ret = pci_request_selected_regions(pdev,
                           1 << index, "vfio-pci");
        if (ret)
            return ret;
        vdev->barmap[index] = pci_iomap(pdev, index, 0);
    }
    vma->vm_private_data = vdev;
    vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
    return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                   req_len, vma->vm_page_prot);
}
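Seen from the other side, a hedged userspace sketch of the call that lands in vfio_pci_mmap() above (BAR0 and the omitted error handling are assumptions for illustration):

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map BAR0 of a vfio-pci device; "device" is a fd obtained with
 * VFIO_GROUP_GET_DEVICE_FD. */
static volatile uint32_t *map_bar0(int device)
{
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        void *p;

        reg.index = VFIO_PCI_BAR0_REGION_INDEX;
        if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg) < 0)
                return NULL;
        if (!(reg.flags & VFIO_REGION_INFO_FLAG_MMAP))
                return NULL;    /* e.g. an I/O-port BAR */

        /* served by vfio_pci_mmap() above, which ends in remap_pfn_range() */
        p = mmap(NULL, reg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                 device, reg.offset);
        return p == MAP_FAILED ? NULL : (volatile uint32_t *)p;
}

Loads and stores through the returned pointer then hit the device's BAR0 registers directly.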

Mapping hardware resources with vfio-pci and igb_uio

In DPDK (version 20.02), the function rte_pci_map_device maps a PCI device's resources into user space:

/* Map pci device, only skeleton code retained */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
    switch (dev->kdrv) {
    case RTE_KDRV_VFIO:
        pci_vfio_map_resource(dev);
        break;
    case RTE_KDRV_IGB_UIO:
        pci_uio_map_resource(dev);
        break;
    default:
        break;
    }

    return 0;
}

 

1 vfio-pci
When the device is bound to vfio-pci, rte_pci_map_device calls pci_vfio_map_resource.

1.1 pci_vfio_map_resource

In the primary process this lands in pci_vfio_map_resource_primary; its main parts are analyzed below.

static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char pci_addr[PATH_MAX] = {0};
    int vfio_dev_fd;
    struct rte_pci_addr *loc = &dev->addr;
    int i, ret;
    struct mapped_pci_resource *vfio_res = NULL;
    struct mapped_pci_res_list *vfio_res_list =
        RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

    struct pci_map *maps;

    dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
    dev->vfio_req_intr_handle.fd = -1;
#endif

    /* store PCI address string */
    snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
            loc->domain, loc->bus, loc->devid, loc->function);

    ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
                    &vfio_dev_fd, &device_info);
    if (ret)
        return ret;

    /* allocate vfio_res and get region info */
    vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
    if (vfio_res == NULL) {
        RTE_LOG(ERR, EAL,
            "%s(): cannot store vfio mmap details\n", __func__);
        goto err_vfio_dev_fd;
    }
    memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

    /* get number of registers (up to BAR5) */
    vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
            VFIO_PCI_BAR5_REGION_INDEX + 1);

    /* map BARs */
    maps = vfio_res->maps;

    vfio_res->msix_table.bar_index = -1;
    /* get MSI-X BAR, if any (we have to know where it is because we can't
     * easily mmap it when using VFIO)
     */
    ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
    if (ret < 0) {
        RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!
",
                pci_addr);
        goto err_vfio_res;
    }
    /* if we found our MSI-X BAR region, check if we can mmap it */
    if (vfio_res->msix_table.bar_index != -1) {
        int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
                vfio_res->msix_table.bar_index);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable
");
            goto err_vfio_res;
        } else if (ret != 0) {
            /* we can map it, so we don't care where it is */
            RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable
");
            vfio_res->msix_table.bar_index = -1;
        }
    }

    for (i = 0; i < (int) vfio_res->nb_maps; i++) {
        struct vfio_region_info *reg = NULL;
        void *bar_addr;

        ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s cannot get device region info "
                "error %i (%s)
", pci_addr, errno,
                strerror(errno));
            goto err_vfio_res;
        }

        /* chk for io port region */
        ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
        if (ret < 0) {
            free(reg);
            goto err_vfio_res;
        } else if (ret) {
            RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)
",
                    i);
            free(reg);
            continue;
        }

        /* skip non-mmapable BARs */
        if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
            free(reg);
            continue;
        }

        /* try mapping somewhere close to the end of hugepages */
        if (pci_map_addr == NULL)
            pci_map_addr = pci_find_max_end_va();

        bar_addr = pci_map_addr;
        pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

        maps[i].addr = bar_addr;
        maps[i].offset = reg->offset;
        maps[i].size = reg->size;
        maps[i].path = NULL; /* vfio doesn't have per-resource paths */

        ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s
",
                    pci_addr, i, strerror(errno));
            free(reg);
            goto err_vfio_res;
        }

        dev->mem_resource[i].addr = maps[i].addr;

        free(reg);
    }

    if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
        RTE_LOG(ERR, EAL, "  %s setup device failed
", pci_addr);
        goto err_vfio_res;
    }

    /* ... error-handling labels and cleanup elided in this excerpt ... */
    return 0;
}

1.1.1 rte_vfio_setup_device
Its main work is as follows:

First, find the iommu_group the device belongs to, get its iommu_group id, and open the matching character device (a hedged sketch of the sysfs lookup follows the prototype below):
/* obtains the iommu_group id via the sysfs filesystem */
int
rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num)

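A hedged sketch of what that sysfs lookup amounts to (the helper name and the parsing are illustrative, not DPDK's exact code):

#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* The iommu group number is the basename of the symlink
 * /sys/bus/pci/devices/<domain:bus:dev.fn>/iommu_group. */
static int get_group_num(const char *pci_addr, int *group_num)
{
        char link[PATH_MAX], target[PATH_MAX];
        ssize_t len;

        snprintf(link, sizeof(link),
                 "/sys/bus/pci/devices/%s/iommu_group", pci_addr);
        len = readlink(link, target, sizeof(target) - 1);
        if (len < 0)
                return -1;
        target[len] = '\0';

        /* target looks like "../../../kernel/iommu_groups/26" */
        return sscanf(strrchr(target, '/') + 1, "%d", group_num) == 1 ? 0 : -1;
}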
/* opens the character device /dev/vfio/{iommu_group} and returns its fd */
int
rte_vfio_get_group_fd(int iommu_group_num)
{
    struct vfio_config *vfio_cfg;

    /* get the vfio_config it belongs to */
    vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
    vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

    return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
Next, get the vfio_config that this device's iommu_group belongs to:
struct vfio_config {
    int vfio_enabled;
    int vfio_container_fd;
    int vfio_active_groups;
    const struct vfio_iommu_type *vfio_iommu_type;
    struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
    struct user_mem_maps mem_maps;
};

/* get the vfio_config it belongs to */
struct vfio_config *vfio_cfg;
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
vfio_container_fd = vfio_cfg->vfio_container_fd;
user_mem_maps = &vfio_cfg->mem_maps;
Then the freshly opened group fd is added to the container and the IOMMU memory mappings are set up. On Intel platforms this calls vfio_type1_dma_map, and it appears that all memory managed by DPDK gets mapped this way.

Finally, get the device fd and the device info, and return them:

/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);

/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);

================VFIO_GROUP_GET_STATUS  VFIO_GROUP_SET_CONTAINER===============

int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
        int *vfio_dev_fd, struct vfio_device_info *device_info)
{
    struct vfio_group_status group_status = {
            .argsz = sizeof(group_status)
    };
    struct vfio_config *vfio_cfg;
    struct user_mem_maps *user_mem_maps;
    int vfio_container_fd;
    int vfio_group_fd;
    int iommu_group_num;
    int i, ret;

    /* get group number */
    ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
 

    /* if negative, something failed */
    if (ret < 0)
        return -1;

    /* get the actual group fd */
    vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
    if (vfio_group_fd < 0)
        return -1;

    /* if group_fd == 0, that means the device isn't managed by VFIO */
    if (vfio_group_fd == 0) {
        RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping
",
                dev_addr);
        return 1;
    }

    /*
     * at this point, we know that this group is viable (meaning, all devices
     * are either bound to VFIO or not bound to anything)
     */

    /* check if the group is viable */
    ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
     

    /* get the vfio_config it belongs to */
    vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
    vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
    vfio_container_fd = vfio_cfg->vfio_container_fd;
    user_mem_maps = &vfio_cfg->mem_maps;

    /* check if group does not have a container yet */
    if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {

        /* add group to a container */
        ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
                &vfio_container_fd);
     

        /*
         * pick an IOMMU type and set up DMA mappings for container
         *
         * needs to be done only once, only when first group is
         * assigned to a container and only in primary process.
         * Note this can happen several times with the hotplug
         * functionality.
         */
        if (internal_config.process_type == RTE_PROC_PRIMARY &&
                vfio_cfg->vfio_active_groups == 1 &&
                vfio_group_device_count(vfio_group_fd) == 0) {
            const struct vfio_iommu_type *t;

            /* select an IOMMU type which we will be using */
            t = vfio_set_iommu_type(vfio_container_fd);
            if (!t) {
                RTE_LOG(ERR, EAL,
                    "  %s failed to select IOMMU type
",
                    dev_addr);
                close(vfio_group_fd);
                rte_vfio_clear_group(vfio_group_fd);
                return -1;
            }
            /* lock memory hotplug before mapping and release it
             * after registering callback, to prevent races
             */
            rte_mcfg_mem_read_lock();
            if (vfio_cfg == default_vfio_cfg)
                ret = t->dma_map_func(vfio_container_fd);
            else
                ret = 0;
            if (ret) {
                RTE_LOG(ERR, EAL,
                    "  %s DMA remapping failed, error %i (%s)
",
                    dev_addr, errno, strerror(errno));
                close(vfio_group_fd);
                rte_vfio_clear_group(vfio_group_fd);
                rte_mcfg_mem_read_unlock();
                return -1;
            }

            vfio_cfg->vfio_iommu_type = t;

            /* re-map all user-mapped segments */
            rte_spinlock_recursive_lock(&user_mem_maps->lock);

            /* this IOMMU type may not support DMA mapping, but
             * if we have mappings in the list - that means we have
             * previously mapped something successfully, so we can
             * be sure that DMA mapping is supported.
             */
            for (i = 0; i < user_mem_maps->n_maps; i++) {
                struct user_mem_map *map;
                map = &user_mem_maps->maps[i];

                ret = t->dma_user_map_func(
                        vfio_container_fd,
                        map->addr, map->iova, map->len,
                        1);
                if (ret) {
                    RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
                            "va: 0x%" PRIx64 " "
                            "iova: 0x%" PRIx64 " "
                            "len: 0x%" PRIu64 "
",
                            map->addr, map->iova,
                            map->len);
                    rte_spinlock_recursive_unlock(
                            &user_mem_maps->lock);
                    rte_mcfg_mem_read_unlock();
                    return -1;
                }
            }
            rte_spinlock_recursive_unlock(&user_mem_maps->lock);

            /* register callback for mem events */
            if (vfio_cfg == default_vfio_cfg)
                ret = rte_mem_event_callback_register(
                    VFIO_MEM_EVENT_CLB_NAME,
                    vfio_mem_event_callback, NULL);
            else
                ret = 0;
            /* unlock memory hotplug */
            rte_mcfg_mem_read_unlock();

            if (ret && rte_errno != ENOTSUP) {
                RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO
");
                return -1;
            }
            if (ret)
                RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported
");
            else
                RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO
");
        }
    }  

    /* get a file descriptor for the device */
    *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
    if (*vfio_dev_fd < 0) {
        /* if we cannot get a device fd, this implies a problem with
         * the VFIO group or the container not having IOMMU configured.
         */

        RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed
",
                dev_addr);
        close(vfio_group_fd);
        rte_vfio_clear_group(vfio_group_fd);
        return -1;
    }
 
    vfio_group_device_get(vfio_group_fd);

    return 0;
}


1.1.2 pci_vfio_get_msix_bar
Using the device fd obtained in the previous step, this reads the device's PCI configuration space to find where the MSI-X table lives (which BAR and at what offset), and saves the result in the vfio_res structure; a hedged sketch of the capability walk follows the snippet below.

/* get MSI-X BAR, if any (we have to know where it is because we can't
* easily mmap it when using VFIO)
*/
ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
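A hedged sketch of that capability walk (not DPDK's exact code; cfg_off is the offset of the config region, obtained via VFIO_DEVICE_GET_REGION_INFO with index VFIO_PCI_CONFIG_REGION_INDEX):

#include <stdint.h>
#include <unistd.h>
#include <linux/pci_regs.h>

static int find_msix_bar(int vfio_dev_fd, uint64_t cfg_off)
{
        uint8_t pos = 0, cap_id, next;
        uint32_t table;

        pread(vfio_dev_fd, &pos, 1, cfg_off + PCI_CAPABILITY_LIST);
        while (pos) {
                pread(vfio_dev_fd, &cap_id, 1, cfg_off + pos + PCI_CAP_LIST_ID);
                if (cap_id == PCI_CAP_ID_MSIX) {
                        /* table register: low 3 bits BAR index, rest offset */
                        pread(vfio_dev_fd, &table, 4,
                              cfg_off + pos + PCI_MSIX_TABLE);
                        return table & PCI_MSIX_TABLE_BIR;
                }
                pread(vfio_dev_fd, &next, 1,
                      cfg_off + pos + PCI_CAP_LIST_NEXT);
                pos = next;
        }
        return -1;      /* no MSI-X capability */
}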

1.1.3 pci_vfio_get_region_info & pci_vfio_mmap_bar
These fetch the device's BAR region info (registers, interrupts, and so on) and mmap the registers, so user-space code can access the PCI device's registers directly.

1.1.4 pci_rte_vfio_setup_device
This function first sets up interrupts, adding the first interrupt to the EAL's interrupt-polling list.
It then enables bus mastering for the device and resets it.


static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
    if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
        RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
        return -1;
    }

    /* set bus mastering for the device */
    if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
        RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
        return -1;
    }

    /*
     * Reset the device. If the device is not capable of resetting,
     * then it updates errno as EINVAL.
     */
    if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
        RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
                errno, strerror(errno));
        return -1;
    }

    return 0;
}

pci_vfio_mmap_bar: how a BAR gets mmap'ed

static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
        int bar_index, int additional_flags)
{
    struct memreg {
        uint64_t offset;
        size_t   size;
    } memreg[2] = {};
    void *bar_addr;
    struct pci_msix_table *msix_table = &vfio_res->msix_table;
    struct pci_map *bar = &vfio_res->maps[bar_index];

    if (bar->size == 0) {
        RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d
", bar_index);
        return 0;
    }

    if (msix_table->bar_index == bar_index) {
        /*
         * VFIO will not let us map the MSI-X table,
         * but we can map around it.
         */
        uint32_t table_start = msix_table->offset;
        uint32_t table_end = table_start + msix_table->size;
        table_end = RTE_ALIGN(table_end, PAGE_SIZE);
        table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE);

        /* If page-aligned start of MSI-X table is less than the
         * actual MSI-X table start address, reassign to the actual
         * start address.
         */
        if (table_start < msix_table->offset)
            table_start = msix_table->offset;

        if (table_start == 0 && table_end >= bar->size) {
            /* Cannot map this BAR */
            RTE_LOG(DEBUG, EAL, "Skipping BAR%d
", bar_index);
            bar->size = 0;
            bar->addr = 0;
            return 0;
        }

        memreg[0].offset = bar->offset;
        memreg[0].size = table_start;
        if (bar->size < table_end) {
            /*
             * If MSI-X table end is beyond BAR end, don't attempt
             * to perform second mapping.
             */
            memreg[1].offset = 0;
            memreg[1].size = 0;
        } else {
            memreg[1].offset = bar->offset + table_end;
            memreg[1].size = bar->size - table_end;
        }

        RTE_LOG(DEBUG, EAL,
            "Trying to map BAR%d that contains the MSI-X "
            "table. Trying offsets: "
            "0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx
",
            bar_index,
            memreg[0].offset, memreg[0].size,
            memreg[1].offset, memreg[1].size);
    } else {
        memreg[0].offset = bar->offset;
        memreg[0].size = bar->size;
    }

    /* reserve the address using an inaccessible mapping */
    bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
            MAP_ANONYMOUS | additional_flags, -1, 0);
    if (bar_addr != MAP_FAILED) {
        void *map_addr = NULL;
        if (memreg[0].size) {
            /* actual map of first part */
            map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
                            memreg[0].offset,
                            memreg[0].size,
                            MAP_FIXED);
        }

        /* if there's a second part, try to map it */
        if (map_addr != MAP_FAILED
            && memreg[1].offset && memreg[1].size) {
            void *second_addr = RTE_PTR_ADD(bar_addr,
                        (uintptr_t)(memreg[1].offset -
                        bar->offset));
            map_addr = pci_map_resource(second_addr,
                            vfio_dev_fd,
                            memreg[1].offset,
                            memreg[1].size,
                            MAP_FIXED);
        }

        if (map_addr == MAP_FAILED || !map_addr) {
            munmap(bar_addr, bar->size);
            bar_addr = MAP_FAILED;
            RTE_LOG(ERR, EAL, "Failed to map pci BAR%d
",
                    bar_index);
            return -1;
        }
    } else {
        RTE_LOG(ERR, EAL,
                "Failed to create inaccessible mapping for BAR%d
",
                bar_index);
        return -1;
    }

    bar->addr = bar_addr;
    return 0;
}
void *
pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
                 int additional_flags)
{
        void *mapaddr;

        /* Map the PCI memory resource of device */
        mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
                        MAP_SHARED | additional_flags, fd, offset); /* <-- the actual vfio BAR mmap */
        if (mapaddr == MAP_FAILED) {
                RTE_LOG(ERR, EAL,
                        "%s(): cannot mmap(%d, %p, 0x%zx, 0x%llx): %s (%p)
",
                        __func__, fd, requested_addr, size,
                        (unsigned long long)offset,
                        strerror(errno), mapaddr);
        } else
                RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p
", mapaddr);

        return mapaddr;
}

VFIO_IOMMU_MAP_DMA

First, mmap 1MB of virtual address space: physical addresses are invisible to user space, so physical memory can only be reached through virtual addresses. Then issue the VFIO_IOMMU_MAP_DMA ioctl, whose main parameters are vaddr and iova: iova is the address the device will put on the bus when it issues DMA requests, i.e. the address before IOMMU translation, and vaddr is the address returned by mmap. VFIO_IOMMU_MAP_DMA finds the physical pages backing vaddr and pins them (device DMA is asynchronous and can happen at any time, so the pages must never be swapped out), then finds the group's context entry and installs page-table entries that translate iova to those pinned physical pages. Physical addresses are thus completely hidden from the user program, which is what makes a user-space driver possible. In the example, the IOVA range 0~0x100000 maps to a 1MB DRAM range such as 0x10000000~0x10100000 (size 1024 * 1024). In one sentence: VFIO_IOMMU_MAP_DMA maps iova, through the IOMMU, onto the physical address backing vaddr.

Original article: https://www.cnblogs.com/dream397/p/13563978.html