
Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ


From: Eugenio Perez Martin
Subject: Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
Date: Tue, 1 Mar 2022 09:50:08 +0100

On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/27 下午9:41, Eugenio Pérez 写道:
> > Use translations added in VhostIOVATree in SVQ.
> >
> > Only introduce usage here, not allocation and deallocation. As with
> > previous patches, we use the dead code paths of shadow_vqs_enabled to
> > avoid committing too many changes at once. These are impossible to take
> > at the moment.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
> >   include/hw/virtio/vhost-vdpa.h     |   3 +
> >   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
> >   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
> >   4 files changed, 187 insertions(+), 26 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h 
> > b/hw/virtio/vhost-shadow-virtqueue.h
> > index 04c67685fd..b2f722d101 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -13,6 +13,7 @@
> >   #include "qemu/event_notifier.h"
> >   #include "hw/virtio/virtio.h"
> >   #include "standard-headers/linux/vhost_types.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> >
> >   /* Shadow virtqueue to relay notifications */
> >   typedef struct VhostShadowVirtqueue {
> > @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
> >       /* Virtio device */
> >       VirtIODevice *vdev;
> >
> > +    /* IOVA mapping */
> > +    VhostIOVATree *iova_tree;
> > +
> >       /* Map for use the guest's descriptors */
> >       VirtQueueElement **ring_id_maps;
> >
> > @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, 
> > VirtIODevice *vdev,
> >                        VirtQueue *vq);
> >   void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >
> > -VhostShadowVirtqueue *vhost_svq_new(void);
> > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
> >
> >   void vhost_svq_free(gpointer vq);
> >   G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
> > diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> > index 009a9f3b6b..ee8e939ad0 100644
> > --- a/include/hw/virtio/vhost-vdpa.h
> > +++ b/include/hw/virtio/vhost-vdpa.h
> > @@ -14,6 +14,7 @@
> >
> >   #include <gmodule.h>
> >
> > +#include "hw/virtio/vhost-iova-tree.h"
> >   #include "hw/virtio/virtio.h"
> >   #include "standard-headers/linux/vhost_types.h"
> >
> > @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
> >       MemoryListener listener;
> >       struct vhost_vdpa_iova_range iova_range;
> >       bool shadow_vqs_enabled;
> > +    /* IOVA mapping used by the Shadow Virtqueue */
> > +    VhostIOVATree *iova_tree;
> >       GPtrArray *shadow_vqs;
> >       struct vhost_dev *dev;
> >       VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
> > b/hw/virtio/vhost-shadow-virtqueue.c
> > index a38d313755..7e073773d1 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -11,6 +11,7 @@
> >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> >
> >   #include "qemu/error-report.h"
> > +#include "qemu/log.h"
> >   #include "qemu/main-loop.h"
> >   #include "qemu/log.h"
> >   #include "linux-headers/linux/vhost.h"
> > @@ -84,7 +85,58 @@ static void 
> > vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> >       }
> >   }
> >
> > +/**
> > + * Translate addresses between the qemu's virtual address and the SVQ IOVA
> > + *
> > + * @svq    Shadow VirtQueue
> > + * @vaddr  Translated IOVA addresses
> > + * @iovec  Source qemu's VA addresses
> > + * @num    Length of iovec and minimum length of vaddr
> > + */
> > +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > +                                     void **addrs, const struct iovec 
> > *iovec,
> > +                                     size_t num)
> > +{
> > +    if (num == 0) {
> > +        return true;
> > +    }
> > +
> > +    for (size_t i = 0; i < num; ++i) {
> > +        DMAMap needle = {
> > +            .translated_addr = (hwaddr)iovec[i].iov_base,
> > +            .size = iovec[i].iov_len,
> > +        };
> > +        size_t off;
> > +
> > +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, 
> > &needle);
> > +        /*
> > +         * Map cannot be NULL since iova map contains all guest space and
> > +         * qemu already has a physical address mapped
> > +         */
> > +        if (unlikely(!map)) {
> > +            qemu_log_mask(LOG_GUEST_ERROR,
> > +                          "Invalid address 0x%"HWADDR_PRIx" given by 
> > guest",
> > +                          needle.translated_addr);
> > +            return false;
> > +        }
> > +
> > +        off = needle.translated_addr - map->translated_addr;
> > +        addrs[i] = (void *)(map->iova + off);
> > +
> > +        if (unlikely(int128_gt(int128_add(needle.translated_addr,
> > +                                          iovec[i].iov_len),
> > +                               map->translated_addr + map->size))) {
> > +            qemu_log_mask(LOG_GUEST_ERROR,
> > +                          "Guest buffer expands over iova range");
> > +            return false;
> > +        }
> > +    }
> > +
> > +    return true;
> > +}
> > +
> >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > +                                    void * const *vaddr_sg,
>
>
> Nit: it looks to me we are not passing vaddr but iova here, so it might
> be better to use "sg"?
>

Sure, this is a leftover of the pre-IOVA translations. I agree it is
better to write it as you suggest.

>
> >                                       const struct iovec *iovec,
> >                                       size_t num, bool more_descs, bool 
> > write)
> >   {
> > @@ -103,7 +155,7 @@ static void 
> > vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> >           } else {
> >               descs[i].flags = flags;
> >           }
> > -        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> > +        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
> >           descs[i].len = cpu_to_le32(iovec[n].iov_len);
> >
> >           last = i;
> > @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue 
> > *svq,
> >   {
> >       unsigned avail_idx;
> >       vring_avail_t *avail = svq->vring.avail;
> > +    bool ok;
> > +    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, 
> > elem->in_num));
> >
> >       *head = svq->free_head;
> >
> > @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue 
> > *svq,
> >           return false;
> >       }
> >
> > -    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > +    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
> > +    if (unlikely(!ok)) {
> > +        return false;
> > +    }
> > +    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
> >                               elem->in_num > 0, false);
> > -    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > +
> > +
> > +    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
> > +    if (unlikely(!ok)) {
> > +        return false;
> > +    }
> > +
> > +    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, 
> > true);
> >
> >       /*
> >        * Put the entry in the available array (but don't update avail->idx 
> > until
> > @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >    * Creates vhost shadow virtqueue, and instructs the vhost device to use 
> > the
> >    * shadow methods and file descriptors.
> >    *
> > + * @iova_tree Tree to perform descriptors translations
> > + *
> >    * Returns the new virtqueue or NULL.
> >    *
> >    * In case of error, reason is reported through error_report.
> >    */
> > -VhostShadowVirtqueue *vhost_svq_new(void)
> > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
> >   {
> >       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 
> > 1);
> >       int r;
> > @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> >
> >       event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
> >       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> > +    svq->iova_tree = iova_tree;
> >       return g_steal_pointer(&svq);
> >
> >   err_init_hdev_call:
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 435b9c2e9e..56f9f125cd 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -209,6 +209,21 @@ static void 
> > vhost_vdpa_listener_region_add(MemoryListener *listener,
> >                                            vaddr, section->readonly);
> >
> >       llsize = int128_sub(llend, int128_make64(iova));
> > +    if (v->shadow_vqs_enabled) {
> > +        DMAMap mem_region = {
> > +            .translated_addr = (hwaddr)vaddr,
> > +            .size = int128_get64(llsize) - 1,
> > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > +        };
> > +
> > +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
> > +        if (unlikely(r != IOVA_OK)) {
> > +            error_report("Can't allocate a mapping (%d)", r);
> > +            goto fail;
> > +        }
> > +
> > +        iova = mem_region.iova;
> > +    }
> >
> >       vhost_vdpa_iotlb_batch_begin_once(v);
> >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > @@ -261,6 +276,20 @@ static void 
> > vhost_vdpa_listener_region_del(MemoryListener *listener,
> >
> >       llsize = int128_sub(llend, int128_make64(iova));
> >
> > +    if (v->shadow_vqs_enabled) {
> > +        const DMAMap *result;
> > +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> > +            section->offset_within_region +
> > +            (iova - section->offset_within_address_space);
> > +        DMAMap mem_region = {
> > +            .translated_addr = (hwaddr)vaddr,
> > +            .size = int128_get64(llsize) - 1,
> > +        };
> > +
> > +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
> > +        iova = result->iova;
> > +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
> > +    }
> >       vhost_vdpa_iotlb_batch_begin_once(v);
> >       ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
> >       if (ret) {
> > @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, 
> > struct vhost_vdpa *v,
> >
> >       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
> >       for (unsigned n = 0; n < hdev->nvqs; ++n) {
> > -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
> > +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
> >
> >           if (unlikely(!svq)) {
> >               error_setg(errp, "Cannot create svq %u", n);
> > @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev 
> > *dev,
> >   /**
> >    * Unmap a SVQ area in the device
> >    */
> > -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> > -                                      hwaddr size)
> > +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
> > +                                      const DMAMap *needle)
> >   {
> > +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
> > +    hwaddr size;
> >       int r;
> >
> > -    size = ROUND_UP(size, qemu_real_host_page_size);
> > -    r = vhost_vdpa_dma_unmap(v, iova, size);
> > +    if (unlikely(!result)) {
> > +        error_report("Unable to find SVQ address to unmap");
> > +        return false;
> > +    }
> > +
> > +    size = ROUND_UP(result->size, qemu_real_host_page_size);
> > +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
> >       return r == 0;
> >   }
> >
> >   static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> >                                          const VhostShadowVirtqueue *svq)
> >   {
> > +    DMAMap needle;
> >       struct vhost_vdpa *v = dev->opaque;
> >       struct vhost_vring_addr svq_addr;
> > -    size_t device_size = vhost_svq_device_area_size(svq);
> > -    size_t driver_size = vhost_svq_driver_area_size(svq);
> >       bool ok;
> >
> >       vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, 
> > driver_size);
> > +    needle = (DMAMap) {
> > +        .translated_addr = svq_addr.desc_user_addr,
> > +    };
>
>
> Let's simply initialize the member to zero at the start of this function,
> then we can use needle->translated_addr = XXX here.
>

Sure
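
Something like this, if I got the suggestion right (reusing only the
names already in this patch; just an untested sketch):

static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    /* Zero-initialized once; only translated_addr changes per ring */
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}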

>
> > +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
> >       if (unlikely(!ok)) {
> >           return false;
> >       }
> >
> > -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, 
> > device_size);
> > +    needle = (DMAMap) {
> > +        .translated_addr = svq_addr.used_user_addr,
> > +    };
> > +    return vhost_vdpa_svq_unmap_ring(v, &needle);
> > +}
> > +
> > +/**
> > + * Map the SVQ area in the device
> > + *
> > + * @v          Vhost-vdpa device
> > + * @needle     The area to search iova
> > + * @errorp     Error pointer
> > + */
> > +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
> > +                                    Error **errp)
> > +{
> > +    int r;
> > +
> > +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
> > +    if (unlikely(r != IOVA_OK)) {
> > +        error_setg(errp, "Cannot allocate iova (%d)", r);
> > +        return false;
> > +    }
> > +
> > +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
> > +                           (void *)needle->translated_addr,
> > +                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));
>
>
> Let's simply use needle->perm == IOMMU_RO here?
>

The motivation for doing it this way is to be more resilient to future
changes, for example if a new flag is added.

But I'm totally ok with comparing against IOMMU_RO; I see that scenario
as unlikely at the moment.
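
To make the two options explicit (an illustrative snippet only; the
map_is_ro_* helper names are made up, the macros are the existing ones
from include/exec/memory.h):

#include "exec/memory.h"        /* IOMMU_RO, IOMMU_WO, IOMMU_ACCESS_FLAG */
#include "qemu/iova-tree.h"     /* DMAMap */

/* Current form in the patch: read-only unless the write bit is present.
 * IOMMU_ACCESS_FLAG(0, 1) expands to IOMMU_WO, so this keeps working if
 * more permission bits are added in the future. */
static bool map_is_ro_by_flag(const DMAMap *needle)
{
    return !(needle->perm & IOMMU_ACCESS_FLAG(0, 1));
}

/* Suggested form: exact comparison against the read-only value. */
static bool map_is_ro_by_value(const DMAMap *needle)
{
    return needle->perm == IOMMU_RO;
}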

>
> > +    if (unlikely(r != 0)) {
> > +        error_setg_errno(errp, -r, "Cannot map region to device");
> > +        vhost_iova_tree_remove(v->iova_tree, needle);
> > +    }
> > +
> > +    return r == 0;
> >   }
> >
> >   /**
> > - * Map shadow virtqueue rings in device
> > + * Map the shadow virtqueue rings in the device
> >    *
> >    * @dev   The vhost device
> >    * @svq   The shadow virtqueue
> > @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev 
> > *dev,
> >                                        struct vhost_vring_addr *addr,
> >                                        Error **errp)
> >   {
> > +    DMAMap device_region, driver_region;
> > +    struct vhost_vring_addr svq_addr;
> >       struct vhost_vdpa *v = dev->opaque;
> >       size_t device_size = vhost_svq_device_area_size(svq);
> >       size_t driver_size = vhost_svq_driver_area_size(svq);
> > -    int r;
> > +    size_t avail_offset;
> > +    bool ok;
> >
> >       ERRP_GUARD();
> > -    vhost_svq_get_vring_addr(svq, addr);
> > +    vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
> > -                           (void *)addr->desc_user_addr, true);
> > -    if (unlikely(r != 0)) {
> > -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
> > +    driver_region = (DMAMap) {
> > +        .translated_addr = svq_addr.desc_user_addr,
> > +        .size = driver_size - 1,
>
>
> Any reason for the "-1" here? I see several places doing things like that;
> it's probably a hint of a wrong API somewhere.
>

The "problem" is the api mismatch between _end and _last, to include
the last member in the size or not.

IOVA tree needs to use _end so we can allocate the last page in case
of available range ending in (uint64_t)-1 [1]. But If we change
vhost_svq_{device,driver}_area_size to make it inclusive, we need to
use "+1" in calls like qemu_memalign and memset at vhost_svq_start.
Probably in more places too

QEMU's emulated Intel iommu code solves it using the address mask as
the size, something that does not fit 100% with vhost devices since
they can allocate an arbitrary address of arbitrary size when using
vIOMMU. It's not a problem for vhost-vdpa at this moment since we make
sure we expose unaligned and whole pages with vrings, but I feel it
would only be to move the problem somewhere else.
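
As a standalone illustration of that corner case (plain C, not QEMU
code): with an exclusive end, the last page of the address space cannot
be described because start + size wraps to 0, while the inclusive form
(the "size - 1" in the patch, "_last" style) still fits in 64 bits:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* A 4 KiB region placed at the very top of the 64-bit address space */
    uint64_t start = UINT64_MAX - 4095;
    uint64_t size = 4096;

    uint64_t end_exclusive = start + size;      /* wraps around to 0 */
    uint64_t last_inclusive = start + size - 1; /* UINT64_MAX, still valid */

    printf("end (exclusive):  0x%" PRIx64 "\n", end_exclusive);
    printf("last (inclusive): 0x%" PRIx64 "\n", last_inclusive);
    return 0;
}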

Thanks!

[1] There are alternatives: to use Int128_t, etc. But I think it's
better not to change that in this patch series.

> Thanks
>
>
> > +        .perm = IOMMU_RO,
> > +    };
> > +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> > +    if (unlikely(!ok)) {
> > +        error_prepend(errp, "Cannot create vq driver region: ");
> >           return false;
> >       }
> > +    addr->desc_user_addr = driver_region.iova;
> > +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> > +    addr->avail_user_addr = driver_region.iova + avail_offset;
> >
> > -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
> > -                           (void *)addr->used_user_addr, false);
> > -    if (unlikely(r != 0)) {
> > -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
> > +    device_region = (DMAMap) {
> > +        .translated_addr = svq_addr.used_user_addr,
> > +        .size = device_size - 1,
> > +        .perm = IOMMU_RW,
> > +    };
> > +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
> > +    if (unlikely(!ok)) {
> > +        error_prepend(errp, "Cannot create vq device region: ");
> > +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
> >       }
> > +    addr->used_user_addr = device_region.iova;
> >
> > -    return r == 0;
> > +    return ok;
> >   }
> >
> >   static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>



