vhost control plane + handle_kick + VhostOps

vhost is an alternative virtio backend scheme: it bypasses QEMU on the data path and cuts the context-switch overhead between QEMU and the kernel, which is especially noticeable for network I/O. vhost currently has two implementations, kernel-space and user-space; this article focuses on the kernel-space vhost.

The vhost kernel module handles only the data plane; the control plane stays in QEMU. The QEMU-side data structures look like this:

struct vhost_dev {
    MemoryListener memory_listener;  /* MemoryListener: callbacks invoked when the guest physical memory map changes */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;  /* array of vhost_virtqueues; nvqs is its length */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;  /* features supported by the vhost device */
    unsigned long long acked_features;  /* features acked by the guest */
    unsigned long long backend_features;  /* features supported by the backend, e.g. the tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops; /* VhostOps has separate kernel and user implementations; the kernel one ends up issuing ioctls on /dev/vhost-net */
    void *opaque;
};
 
struct vhost_virtqueue {
    int kick;
    int call;
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;

    /* The routine to call when the guest kicks us, or on poll timeout. */
    vhost_work_fn_t handle_kick;
};

The vhost memory layout handed to the kernel is likewise a set of vhost_memory_region entries:

struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size; /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding; /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

struct vhost_memory {
    __u32 nregions;
    __u32 padding;
    struct vhost_memory_region regions[0];
};
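As a rough illustration of how the control plane hands this layout to the kernel, the sketch below fills in a single-region vhost_memory and pushes it with the VHOST_SET_MEM_TABLE ioctl. This is a minimal sketch, not QEMU code: vhost_fd is assumed to be an already-opened /dev/vhost-net descriptor, and the address/size values are placeholders.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Minimal sketch: describe one guest RAM region to the vhost kernel module. */
static int set_mem_table_example(int vhost_fd, void *hva,
                                 uint64_t gpa, uint64_t size)
{
    struct vhost_memory *mem;
    int r;

    mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
    if (!mem)
        return -1;
    mem->nregions = 1;
    mem->regions[0].guest_phys_addr = gpa;                     /* guest physical base */
    mem->regions[0].memory_size     = size;                    /* region length in bytes */
    mem->regions[0].userspace_addr  = (uint64_t)(uintptr_t)hva; /* where QEMU mapped it */

    r = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);             /* hand the layout to vhost */
    free(mem);
    return r;
}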

1. /dev/vhost-net implements the ioctl interface; vhost_kernel_call issues ioctls on /dev/vhost-net to configure, among other things, the ring kick and ring call eventfds.

2. How do we get the equivalent of kvm_vm_ioctl(..., KVM_IOEVENTFD, ...)? (see the eventfd sketch after this list)

3. How are vhost_dev's VhostOps callbacks vhost_set_vring_kick and vhost_set_vring_call implemented?

In the kernel vhost case, vhost_vring_ioctl handles both: VHOST_SET_VRING_CALL installs the call eventfd (also registered with KVM as an irqfd) used to inject interrupts into the guest, and VHOST_SET_VRING_KICK installs the kick eventfd (also registered with KVM as an ioeventfd) used to receive guest notifications.

4. Programming the irqfd itself is not a vhost operation: QEMU's kvm_irqchip_assign_irqfd ultimately calls kvm_vm_ioctl to set up the irqfd in the KVM module.
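To make point 2 concrete, here is a simplified, hypothetical sketch of the two pairs of registrations made with the same eventfds: the kick eventfd is handed both to KVM (KVM_IOEVENTFD, so a guest write to the queue-notify register signals it) and to vhost (VHOST_SET_VRING_KICK, so the vhost worker polls it); symmetrically, the call eventfd is handed to vhost (VHOST_SET_VRING_CALL) and to KVM as an irqfd (KVM_IRQFD). All field values (notify address, length, gsi) are illustrative placeholders, not taken from a real device.

#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/vhost.h>

/* Sketch only: vm_fd is a KVM VM fd, vhost_fd is /dev/vhost-net. */
static int wire_vring_eventfds(int vm_fd, int vhost_fd,
                               uint64_t notify_addr, uint32_t gsi, int vq_index)
{
    int kick_fd = eventfd(0, EFD_NONBLOCK);
    int call_fd = eventfd(0, EFD_NONBLOCK);

    /* Guest write to the (MMIO) notify address -> KVM signals kick_fd ... */
    struct kvm_ioeventfd ioev = { .addr = notify_addr, .len = 2, .fd = kick_fd };
    ioctl(vm_fd, KVM_IOEVENTFD, &ioev);

    /* ... and the vhost worker is woken by the same kick_fd. */
    struct vhost_vring_file kick = { .index = vq_index, .fd = kick_fd };
    ioctl(vhost_fd, VHOST_SET_VRING_KICK, &kick);

    /* vhost signals call_fd when buffers have been used ... */
    struct vhost_vring_file call = { .index = vq_index, .fd = call_fd };
    ioctl(vhost_fd, VHOST_SET_VRING_CALL, &call);

    /* ... and KVM turns a call_fd signal into a guest interrupt (irqfd). */
    struct kvm_irqfd irqfd = { .fd = call_fd, .gsi = gsi };
    return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}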

 

 

 handle_kick 

/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
             __poll_t mask, struct vhost_dev *dev)
{
    init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
    init_poll_funcptr(&poll->table, vhost_poll_func);
    poll->mask = mask;
    poll->dev = dev;
    poll->wqh = NULL;

    vhost_work_init(&poll->work, fn);
}
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
    clear_bit(VHOST_WORK_QUEUED, &work->flags);
    work->fn = fn;
}
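When the kick eventfd is signalled, the wait-queue callback installed above fires and queues handle_kick on the vhost worker thread. A simplified view of that path, abridged from drivers/vhost/vhost.c (exact signatures vary by kernel version):

/* Abridged from drivers/vhost/vhost.c */
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode,
                             int sync, void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!(key_to_poll(key) & poll->mask))
        return 0;

    /* Queue poll->work (i.e. handle_kick) on the vhost worker thread. */
    vhost_poll_queue(poll);
    return 0;
}

void vhost_poll_queue(struct vhost_poll *poll)
{
    vhost_work_queue(poll->dev, &poll->work);
}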

 

static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    int r;
 
    if (!n)
        return -ENOMEM;
 
    dev = &n->dev;
    n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* kick callback for the TX virtqueue */
    n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* kick callback for the RX virtqueue */
    r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 
    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);  /* init vhost_net's TX vhost_poll */
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);   /* init vhost_net's RX vhost_poll */
 
 
    return 0;
}

void vhost_dev_init(struct vhost_dev *dev,
            struct vhost_virtqueue **vqs, int nvqs,
            int iov_limit, int weight, int byte_weight,
            bool use_worker,
            int (*msg_handler)(struct vhost_dev *dev,
                       struct vhost_iotlb_msg *msg))
{
    struct vhost_virtqueue *vq;
    int i;

    /* ... (other initialisation elided) ... */
    for (i = 0; i < dev->nvqs; ++i) {
        vq = dev->vqs[i];
        vq->log = NULL;
        vq->indirect = NULL;
        vq->heads = NULL;
        vq->dev = dev;
        mutex_init(&vq->mutex);
        vhost_vq_reset(dev, vq);
        if (vq->handle_kick)
            vhost_poll_init(&vq->poll, vq->handle_kick,
                    EPOLLIN, dev);
    }
}
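The handle_kick callbacks registered in vhost_net_open are plain vhost_work functions; when the worker runs them, they recover the owning virtqueue and net device via container_of. Abridged from drivers/vhost/net.c (newer kernels wrap the queue in struct vhost_net_virtqueue, but the idea is the same):

/* Abridged from drivers/vhost/net.c */
static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);   /* drain the TX ring and push packets into the tap fd */
}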

VhostOps

vhost-backend.c    kernel_ops   

vhost-user.c   user_ops 

As the names suggest, one set is for the kernel backend and the other for the user-space backend (vhost-user, e.g. OVS + DPDK).

static const VhostOps kernel_ops = {
    .backend_type = VHOST_BACKEND_TYPE_KERNEL,
    ....
    .vhost_set_mem_table = vhost_kernel_set_mem_table,
    .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
    ....
};
 
const VhostOps user_ops = {
    .backend_type = VHOST_BACKEND_TYPE_USER,
    ...
    .vhost_set_mem_table = vhost_user_set_mem_table,
    .vhost_set_vring_addr = vhost_user_set_vring_addr,
    ...
};
vhost_set_mem_table and vhost_set_vring_addr are crucial: they establish the shared view of guest memory. That sharing is the foundation of vhost-user, where the vhost-user process and the QEMU process communicate through shared memory.

virtio_net_vhost_status --> vhost_net_start --> vhost_net_start_one --> vhost_dev_start --> vhost_set_mem_table
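On the kernel backend, the last hop of this chain is just another ioctl; the vhost-user variant instead serialises the table (plus the memory fds) over the vhost-user socket. Abridged from hw/virtio/vhost-backend.c:

/* Abridged from hw/virtio/vhost-backend.c */
static int vhost_kernel_set_mem_table(struct vhost_dev *dev,
                                      struct vhost_memory *mem)
{
    return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem);
}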

 hw/virtio/vhost-backend.c:294:static const VhostOps kernel_ops = {

hw/virtio/vhost-user.c:2357:const VhostOps user_ops = {
include/hw/virtio/vhost-backend.h:175:extern const VhostOps user_ops;

QEMU side -- eventfd setup

ioctl command definitions

/* Set eventfd to poll for added buffers */
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
/* Set eventfd to signal when buffers have been used */
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
/* Set eventfd to signal an error */
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
/* Set busy loop timeout (in us) */
#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \
                     struct vhost_vring_state)
/* Get busy loop timeout (in us) */
#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \
                     struct vhost_vring_state)
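For orientation, a minimal user-space handshake against /dev/vhost-net might look like the sketch below. This is not a literal QEMU excerpt (QEMU drives the same ioctls through VhostOps), error handling is elided, and the feature tweak is just an example:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Minimal sketch of the control-plane handshake with the vhost kernel module. */
static int vhost_net_handshake(void)
{
    uint64_t features;
    int fd = open("/dev/vhost-net", O_RDWR);

    ioctl(fd, VHOST_SET_OWNER, NULL);          /* bind this fd to the caller, spawn the vhost worker */
    ioctl(fd, VHOST_GET_FEATURES, &features);  /* what the kernel backend offers */
    features &= ~(1ULL << VHOST_F_LOG_ALL);    /* e.g. drop dirty logging unless migrating */
    ioctl(fd, VHOST_SET_FEATURES, &features);  /* commit the negotiated set */

    return fd;                                 /* VRING / MEM_TABLE setup follows */
}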

vhost_net is enabled by passing vhost=on in the -netdev tap,... command line option. The initialisation flow:

  1. As described in "Qemu之Network Device全虚拟方案一:前端网络流的建立", creating the tap device ends up calling net_init_tap();
  2. net_init_tap() checks whether vhost=on was specified; if so, it calls vhost_net_init() to initialise the vhost backend;
  3. vhost_net_init() opens the vhost driver with open("/dev/vhost-net", O_RDWR) and performs a series of initialisation steps via ioctl(vhost_fd, ...);
  4. ioctl VHOST_SET_VRING_KICK sets the kick fd (guest -> vhost), i.e. VirtQueue.host_notifier.fd;
  5. ioctl VHOST_SET_VRING_CALL sets the call fd (vhost -> guest), i.e. VirtQueue.guest_notifier.fd (see the abridged vhost_virtqueue_start excerpt after this list).
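In current QEMU, step 4 happens when the device is started: vhost_virtqueue_start reads the fd of the virtqueue's host notifier and pushes it down through VhostOps. Abridged from hw/virtio/vhost.c:

static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    ...
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    ...
    /* VirtQueue.host_notifier is the ioeventfd KVM signals on a guest kick */
    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
        ...
    }
    ...
}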

vhost_vring_ioctl----eventfd_ctx_fdget

long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    struct file *eventfp, *filep = NULL;
    bool pollstart = false, pollstop = false;
    struct eventfd_ctx *ctx = NULL;
    u32 __user *idxp = argp;
    struct vhost_virtqueue *vq;
    struct vhost_vring_state s;
    struct vhost_vring_file f;
    u32 idx;
    long r;

    r = get_user(idx, idxp);
    if (r < 0)
        return r;
    if (idx >= d->nvqs)
        return -ENOBUFS;

    idx = array_index_nospec(idx, d->nvqs);
    vq = d->vqs[idx];

    if (ioctl == VHOST_SET_VRING_NUM ||
        ioctl == VHOST_SET_VRING_ADDR) {
        return vhost_vring_set_num_addr(d, vq, ioctl, argp);
    }

    mutex_lock(&vq->mutex);

    switch (ioctl) {
    case VHOST_SET_VRING_BASE:
        /* Moving base with an active backend?
         * You don't want to do that. */
        if (vq->private_data) {
            r = -EBUSY;
            break;
        }
        if (copy_from_user(&s, argp, sizeof s)) {
            r = -EFAULT;
            break;
        }
        if (s.num > 0xffff) {
            r = -EINVAL;
            break;
        }
        vq->last_avail_idx = s.num;
        /* Forget the cached index value. */
        vq->avail_idx = vq->last_avail_idx;
        break;
    case VHOST_GET_VRING_BASE:
        s.index = idx;
        s.num = vq->last_avail_idx;
        if (copy_to_user(argp, &s, sizeof s))
            r = -EFAULT;
        break;
    case VHOST_SET_VRING_KICK:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
        if (IS_ERR(eventfp)) {
            r = PTR_ERR(eventfp);
            break;
        }
        if (eventfp != vq->kick) {
            pollstop = (filep = vq->kick) != NULL;
            pollstart = (vq->kick = eventfp) != NULL;
        } else
            filep = eventfp;
        break;
    case VHOST_SET_VRING_CALL:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }

        swap(ctx, vq->call_ctx.ctx);
        break;
    case VHOST_SET_VRING_ERR:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }
        swap(ctx, vq->error_ctx);
        break;
    case VHOST_SET_VRING_ENDIAN:
        r = vhost_set_vring_endian(vq, argp);
        break;
    case VHOST_GET_VRING_ENDIAN:
        r = vhost_get_vring_endian(vq, idx, argp);
        break;
    case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
        if (copy_from_user(&s, argp, sizeof(s))) {
            r = -EFAULT;
            break;
        }
        vq->busyloop_timeout = s.num;
        break;
    case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
        s.index = idx;
        s.num = vq->busyloop_timeout;
        if (copy_to_user(argp, &s, sizeof(s)))
            r = -EFAULT;
        break;
    default:
        r = -ENOIOCTLCMD;
    }

    if (pollstop && vq->handle_kick)
        vhost_poll_stop(&vq->poll);

    if (!IS_ERR_OR_NULL(ctx))
        eventfd_ctx_put(ctx);
    if (filep)
        fput(filep);

    if (pollstart && vq->handle_kick)
        r = vhost_poll_start(&vq->poll, vq->kick);

    mutex_unlock(&vq->mutex);

    if (pollstop && vq->handle_kick)
        vhost_poll_flush(&vq->poll);
    return r;
}
EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
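The pollstart/pollstop pair above is what actually attaches handle_kick to the new kick eventfd: vhost_poll_start registers the poll table on the eventfd's wait queue, and if the eventfd is already signalled it queues the work immediately. Abridged and slightly simplified from drivers/vhost/vhost.c (details vary by kernel version):

/* Abridged from drivers/vhost/vhost.c */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
    __poll_t mask;

    if (poll->wqh)
        return 0;

    /* hook poll->wait onto the eventfd's wait queue via vhost_poll_func */
    mask = vfs_poll(file, &poll->table);
    if (mask)
        vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
    if (mask & EPOLLERR) {
        vhost_poll_stop(poll);
        return -EINVAL;
    }

    return 0;
}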

QEMU implementation

 hw/virtio/vhost-backend.c:23:static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request

vhost_kernel_call issues the ioctl on /dev/vhost-net:

static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
                             void *arg)
{
    int fd = (uintptr_t) dev->opaque;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);

    return ioctl(fd, request, arg);  
}
static int vhost_kernel_set_vring_kick(struct vhost_dev *dev,
                                       struct vhost_vring_file *file)
{
    return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
                                       struct vhost_vring_file *file)
{
    return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
}

 

 

vhost_set_vring_call
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
        r = -errno;
        goto fail_call;
    }

    vq->dev = dev;

    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}
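Note that the call fd installed above is only the masked_notifier. At run time QEMU swaps it depending on whether the guest has masked the interrupt: when masked, vhost signals the masked notifier (which QEMU checks on unmask); when unmasked, vhost signals the real guest notifier, which is wired to a KVM irqfd. Abridged from hw/virtio/vhost.c:

/* Abridged from hw/virtio/vhost.c */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev,
                          int n, bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int index = n - hdev->vq_index;
    struct vhost_vring_file file;
    int r;

    if (mask) {
        /* point vhost at the masked notifier */
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        /* point vhost at the guest notifier (irqfd) for direct injection */
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
    }
}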

vhost-net --- the /dev/vhost-net kernel module

[root@localhost qemu]# ls /dev/vhost-net 
/dev/vhost-net
[root@localhost qemu]# 
[root@localhost dpdk-19.11]# lsof /dev/vhost-net 
COMMAND     PID USER   FD   TYPE DEVICE SIZE/OFF  NODE NAME
qemu-syst 49786 root   19u   CHR 10,238      0t0 83987 /dev/vhost-net
[root@localhost dpdk-19.11]# ps -elf | grep 49786
7 S root      49786      1 53  80   0 - 74701 poll_s 02:26 ?        00:00:09 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 2 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0  root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -drive file=vhuser-test1.qcow2 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,id=hostnet1,script=qemu-ifup,vnet_hdr=on,vhost=on -vnc :10
1 S root      49806      2  0  80   0 -     0 vhost_ 02:26 ?        00:00:00 [vhost-49786]
0 S root      49846  49322  0  80   0 -  1729 pipe_w 02:27 pts/4    00:00:00 grep --color=auto 49786
[root@localhost dpdk-19.11]# 

DPDK vhost (vhost-user) does not use /dev/vhost-net:

[root@localhost ~]# lsof /dev/vhost-net
[root@localhost ~]# ps -elf | grep qemu
3 S root      49916      1 24  80   0 - 94022 poll_s 02:29 ?        00:00:19 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0  root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10
0 S root      49991  49249  0  80   0 -  1729 pipe_w 02:30 pts/3    00:00:00 grep --color=auto qemu
[root@localhost ~]# 

When a QEMU process is started with -netdev tap,vhost=on, QEMU initialises this file descriptor through a handful of ioctl commands and then negotiates features, which establishes the relationship between the host's vhost-net driver and the guest. The QEMU call chain is:

vhost_net_init -> vhost_dev_init 

The vhost kernel module moves the virtio backend's data plane into the kernel, while the control plane stays in QEMU. QEMU therefore has to register the relevant state with the kernel: the guest memory layout, the eventfds associated with the device, and so on. KVM already knows the guest memory layout, but vhost is a separate kernel module rather than part of KVM, so QEMU must hand that information over separately. At the time this was written, vhost only covered networking; block devices and other device classes were not yet supported. Two kernel files matter here: vhost.c, the device-independent vhost core, and vhost-net.c, the network-specific part. Module load boils down to initialising vhost-net, starting from the vhost_net_init() function in vhost/net.c.

static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
    .llseek        = noop_llseek,
};

Two entries in this file_operations table deserve attention: vhost_net_open and vhost_net_ioctl. In short, the former initialises and the latter controls, with QEMU of course driving the control path through ioctl.
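vhost_net_init itself is little more than registering a misc character device that exposes these file operations as /dev/vhost-net. Abridged from drivers/vhost/net.c:

/* Abridged from drivers/vhost/net.c */
static struct miscdevice vhost_net_misc = {
    .minor = VHOST_NET_MINOR,     /* fixed minor so /dev/vhost-net gets a stable node */
    .name  = "vhost-net",
    .fops  = &vhost_net_fops,
};

static int __init vhost_net_init(void)
{
    if (experimental_zcopytx)
        vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
    return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);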

ioctl implementation

Note first that since kernel 2.6.36 the .ioctl file operation no longer exists; the unlocked_ioctl and compat_ioctl callbacks together implement what the .ioctl callback did in earlier versions.

vhost_net_ioctl--->vhost_dev_ioctl
vhost_net_ioctl--->vhost_vring_ioctl
 
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                unsigned long arg)
{
    struct vhost_net *n = f->private_data;
    void __user *argp = (void __user *)arg;
    u64 __user *featurep = argp;
    struct vhost_vring_file backend;
    u64 features;
    int r;

    switch (ioctl) {
    case VHOST_NET_SET_BACKEND:
        if (copy_from_user(&backend, argp, sizeof backend))
            return -EFAULT;
        return vhost_net_set_backend(n, backend.index, backend.fd);
    case VHOST_GET_FEATURES:
        features = VHOST_NET_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        if (features & ~VHOST_NET_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
    case VHOST_GET_BACKEND_FEATURES:
        features = VHOST_NET_BACKEND_FEATURES;
        if (copy_to_user(featurep, &features, sizeof(features)))
            return -EFAULT;
        return 0;
    case VHOST_SET_BACKEND_FEATURES:
        if (copy_from_user(&features, featurep, sizeof(features)))
            return -EFAULT;
        if (features & ~VHOST_NET_BACKEND_FEATURES)
            return -EOPNOTSUPP;
        vhost_set_backend_features(&n->dev, features);
        return 0;
    case VHOST_RESET_OWNER:
        return vhost_net_reset_owner(n);
    case VHOST_SET_OWNER:
        return vhost_net_set_owner(n);
    default:
        mutex_lock(&n->dev.mutex);
        r = vhost_dev_ioctl(&n->dev, ioctl, argp);
        if (r == -ENOIOCTLCMD)
            r = vhost_vring_ioctl(&n->dev, ioctl, argp);
        else
            vhost_net_flush(n);
        mutex_unlock(&n->dev.mutex);
        return r;
    }
}
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    struct eventfd_ctx *ctx;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
    case VHOST_SET_LOG_BASE:
        if (copy_from_user(&p, argp, sizeof p)) {
            r = -EFAULT;
            break;
        }
        if ((u64)(unsigned long)p != p) {
            r = -EFAULT;
            break;
        }
        for (i = 0; i < d->nvqs; ++i) {
            struct vhost_virtqueue *vq;
            void __user *base = (void __user *)(unsigned long)p;
            vq = d->vqs[i];
            mutex_lock(&vq->mutex);
            /* If ring is inactive, will check when it's enabled. */
            if (vq->private_data && !vq_log_access_ok(vq, base))
                r = -EFAULT;
            else
                vq->log_base = base;
            mutex_unlock(&vq->mutex);
        }
        break;
    case VHOST_SET_LOG_FD:
        r = get_user(fd, (int __user *)argp);
        if (r < 0)
            break;
        ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }
        swap(ctx, d->log_ctx);
        for (i = 0; i < d->nvqs; ++i) {
            mutex_lock(&d->vqs[i]->mutex);
            d->vqs[i]->log_ctx = d->log_ctx;
            mutex_unlock(&d->vqs[i]->mutex);
        }
        if (ctx)
            eventfd_ctx_put(ctx);
        break;
    default:
        r = -ENOIOCTLCMD;
        break;
    }
done:
    return r;
}

https://blog.csdn.net/majieyue/article/details/51262510

Original article: https://www.cnblogs.com/dream397/p/13936103.html