Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
From: Eugenio Perez Martin
Subject: Re: [PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
Date: Tue, 1 Mar 2022 09:50:08 +0100
On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2022/2/27 9:41 PM, Eugenio Pérez wrote:
> > Use translations added in VhostIOVATree in SVQ.
> >
> > Only introduce usage here, not allocation and deallocation. As with
> > previous patches, we use the dead code paths of shadow_vqs_enabled to
> > avoid committing too many changes at once. These are impossible to take
> > at the moment.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> > hw/virtio/vhost-shadow-virtqueue.h | 6 +-
> > include/hw/virtio/vhost-vdpa.h | 3 +
> > hw/virtio/vhost-shadow-virtqueue.c | 76 ++++++++++++++++-
> > hw/virtio/vhost-vdpa.c | 128 ++++++++++++++++++++++++-----
> > 4 files changed, 187 insertions(+), 26 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index 04c67685fd..b2f722d101 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -13,6 +13,7 @@
> > #include "qemu/event_notifier.h"
> > #include "hw/virtio/virtio.h"
> > #include "standard-headers/linux/vhost_types.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> >
> > /* Shadow virtqueue to relay notifications */
> > typedef struct VhostShadowVirtqueue {
> > @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
> > /* Virtio device */
> > VirtIODevice *vdev;
> >
> > + /* IOVA mapping */
> > + VhostIOVATree *iova_tree;
> > +
> > /* Map for use the guest's descriptors */
> > VirtQueueElement **ring_id_maps;
> >
> > @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > VirtQueue *vq);
> > void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >
> > -VhostShadowVirtqueue *vhost_svq_new(void);
> > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
> >
> > void vhost_svq_free(gpointer vq);
> > G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
> > diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> > index 009a9f3b6b..ee8e939ad0 100644
> > --- a/include/hw/virtio/vhost-vdpa.h
> > +++ b/include/hw/virtio/vhost-vdpa.h
> > @@ -14,6 +14,7 @@
> >
> > #include <gmodule.h>
> >
> > +#include "hw/virtio/vhost-iova-tree.h"
> > #include "hw/virtio/virtio.h"
> > #include "standard-headers/linux/vhost_types.h"
> >
> > @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
> > MemoryListener listener;
> > struct vhost_vdpa_iova_range iova_range;
> > bool shadow_vqs_enabled;
> > + /* IOVA mapping used by the Shadow Virtqueue */
> > + VhostIOVATree *iova_tree;
> > GPtrArray *shadow_vqs;
> > struct vhost_dev *dev;
> > VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index a38d313755..7e073773d1 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -11,6 +11,7 @@
> > #include "hw/virtio/vhost-shadow-virtqueue.h"
> >
> > #include "qemu/error-report.h"
> > +#include "qemu/log.h"
> > #include "qemu/main-loop.h"
> > #include "qemu/log.h"
> > #include "linux-headers/linux/vhost.h"
> > @@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > }
> > }
> >
> > +/**
> > + * Translate addresses between the qemu's virtual address and the SVQ IOVA
> > + *
> > + * @svq Shadow VirtQueue
> > + * @vaddr Translated IOVA addresses
> > + * @iovec Source qemu's VA addresses
> > + * @num Length of iovec and minimum length of vaddr
> > + */
> > +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > + void **addrs, const struct iovec *iovec,
> > + size_t num)
> > +{
> > + if (num == 0) {
> > + return true;
> > + }
> > +
> > + for (size_t i = 0; i < num; ++i) {
> > + DMAMap needle = {
> > + .translated_addr = (hwaddr)iovec[i].iov_base,
> > + .size = iovec[i].iov_len,
> > + };
> > + size_t off;
> > +
> > + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
> > + /*
> > + * Map cannot be NULL since iova map contains all guest space and
> > + * qemu already has a physical address mapped
> > + */
> > + if (unlikely(!map)) {
> > + qemu_log_mask(LOG_GUEST_ERROR,
> > + "Invalid address 0x%"HWADDR_PRIx" given by
> > guest",
> > + needle.translated_addr);
> > + return false;
> > + }
> > +
> > + off = needle.translated_addr - map->translated_addr;
> > + addrs[i] = (void *)(map->iova + off);
> > +
> > + if (unlikely(int128_gt(int128_add(needle.translated_addr,
> > + iovec[i].iov_len),
> > + map->translated_addr + map->size))) {
> > + qemu_log_mask(LOG_GUEST_ERROR,
> > + "Guest buffer expands over iova range");
> > + return false;
> > + }
> > + }
> > +
> > + return true;
> > +}
> > +
> > static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > + void * const *vaddr_sg,
>
>
> Nit: it looks to me like we are not passing vaddr but iova here, so it
> might be better to use "sg"?
>
Sure, this is a leftover from the pre-IOVA-translation code. Writing it
the way you suggest is better.
>
> > const struct iovec *iovec,
> > size_t num, bool more_descs, bool write)
> > {
> > @@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > } else {
> > descs[i].flags = flags;
> > }
> > - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> > + descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
> > descs[i].len = cpu_to_le32(iovec[n].iov_len);
> >
> > last = i;
> > @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > {
> > unsigned avail_idx;
> > vring_avail_t *avail = svq->vring.avail;
> > + bool ok;
> > + g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
> >
> > *head = svq->free_head;
> >
> > @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > return false;
> > }
> >
> > - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
> > elem->in_num > 0, false);
> > - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > +
> > +
> > + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > +
> > + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
> >
> > /*
> > * Put the entry in the available array (but don't update avail->idx until
> > @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > * Creates vhost shadow virtqueue, and instructs the vhost device to use the
> > * shadow methods and file descriptors.
> > *
> > + * @iova_tree Tree to perform descriptors translations
> > + *
> > * Returns the new virtqueue or NULL.
> > *
> > * In case of error, reason is reported through error_report.
> > */
> > -VhostShadowVirtqueue *vhost_svq_new(void)
> > +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
> > {
> > g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> > int r;
> > @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> >
> > event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
> > event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> > + svq->iova_tree = iova_tree;
> > return g_steal_pointer(&svq);
> >
> > err_init_hdev_call:
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 435b9c2e9e..56f9f125cd 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > vaddr, section->readonly);
> >
> > llsize = int128_sub(llend, int128_make64(iova));
> > + if (v->shadow_vqs_enabled) {
> > + DMAMap mem_region = {
> > + .translated_addr = (hwaddr)vaddr,
> > + .size = int128_get64(llsize) - 1,
> > + .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > + };
> > +
> > + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
> > + if (unlikely(r != IOVA_OK)) {
> > + error_report("Can't allocate a mapping (%d)", r);
> > + goto fail;
> > + }
> > +
> > + iova = mem_region.iova;
> > + }
> >
> > vhost_vdpa_iotlb_batch_begin_once(v);
> > ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > @@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
> >
> > llsize = int128_sub(llend, int128_make64(iova));
> >
> > + if (v->shadow_vqs_enabled) {
> > + const DMAMap *result;
> > + const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> > + section->offset_within_region +
> > + (iova - section->offset_within_address_space);
> > + DMAMap mem_region = {
> > + .translated_addr = (hwaddr)vaddr,
> > + .size = int128_get64(llsize) - 1,
> > + };
> > +
> > + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
> > + iova = result->iova;
> > + vhost_iova_tree_remove(v->iova_tree, &mem_region);
> > + }
> > vhost_vdpa_iotlb_batch_begin_once(v);
> > ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
> > if (ret) {
> > @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >
> > shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
> > for (unsigned n = 0; n < hdev->nvqs; ++n) {
> > - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
> > + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
> >
> > if (unlikely(!svq)) {
> > error_setg(errp, "Cannot create svq %u", n);
> > @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> > /**
> > * Unmap a SVQ area in the device
> > */
> > -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> > - hwaddr size)
> > +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
> > + const DMAMap *needle)
> > {
> > + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
> > + hwaddr size;
> > int r;
> >
> > - size = ROUND_UP(size, qemu_real_host_page_size);
> > - r = vhost_vdpa_dma_unmap(v, iova, size);
> > + if (unlikely(!result)) {
> > + error_report("Unable to find SVQ address to unmap");
> > + return false;
> > + }
> > +
> > + size = ROUND_UP(result->size, qemu_real_host_page_size);
> > + r = vhost_vdpa_dma_unmap(v, result->iova, size);
> > return r == 0;
> > }
> >
> > static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> > const VhostShadowVirtqueue *svq)
> > {
> > + DMAMap needle;
> > struct vhost_vdpa *v = dev->opaque;
> > struct vhost_vring_addr svq_addr;
> > - size_t device_size = vhost_svq_device_area_size(svq);
> > - size_t driver_size = vhost_svq_driver_area_size(svq);
> > bool ok;
> >
> > vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.desc_user_addr,
> > + };
>
>
> Let's simply initialize the member to zero at the start of this function,
> then we can use needle->translated_addr = XXX here.
>
Sure
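Something like this, then (just a rough sketch of how
vhost_vdpa_svq_unmap_rings would end up, not necessarily the final code):

static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};                  /* zero-initialized once at the top */
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    /* Reuse the same needle, only patching the address to look up */
    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}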
>
> > + ok = vhost_vdpa_svq_unmap_ring(v, &needle);
> > if (unlikely(!ok)) {
> > return false;
> > }
> >
> > - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.used_user_addr,
> > + };
> > + return vhost_vdpa_svq_unmap_ring(v, &needle);
> > +}
> > +
> > +/**
> > + * Map the SVQ area in the device
> > + *
> > + * @v Vhost-vdpa device
> > + * @needle The area to search iova
> > + * @errorp Error pointer
> > + */
> > +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
> > + Error **errp)
> > +{
> > + int r;
> > +
> > + r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
> > + if (unlikely(r != IOVA_OK)) {
> > + error_setg(errp, "Cannot allocate iova (%d)", r);
> > + return false;
> > + }
> > +
> > + r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
> > + (void *)needle->translated_addr,
> > + !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));
>
>
> Let's simply use needle->perm == IOMMU_RO here?
>
The motivation for writing it this way is to be more resilient to
future changes, for example if a new flag is added. But I'm totally
fine with comparing against IOMMU_RO; I see that scenario as unlikely
at the moment.
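For reference, the two forms side by side (a sketch; they only differ
if a perm value other than IOMMU_RO or IOMMU_RW ever reaches this
point):

/*
 * Current form: read-only unless the write bit is set.
 * IOMMU_ACCESS_FLAG(0, 1) is the write bit, so e.g. IOMMU_NONE would
 * also be treated as read-only.
 */
r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
                       (void *)needle->translated_addr,
                       !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));

/* Suggested form: only an exact IOMMU_RO maps as read-only */
r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
                       (void *)needle->translated_addr,
                       needle->perm == IOMMU_RO);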
>
> > + if (unlikely(r != 0)) {
> > + error_setg_errno(errp, -r, "Cannot map region to device");
> > + vhost_iova_tree_remove(v->iova_tree, needle);
> > + }
> > +
> > + return r == 0;
> > }
> >
> > /**
> > - * Map shadow virtqueue rings in device
> > + * Map the shadow virtqueue rings in the device
> > *
> > * @dev The vhost device
> > * @svq The shadow virtqueue
> > @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> > struct vhost_vring_addr *addr,
> > Error **errp)
> > {
> > + DMAMap device_region, driver_region;
> > + struct vhost_vring_addr svq_addr;
> > struct vhost_vdpa *v = dev->opaque;
> > size_t device_size = vhost_svq_device_area_size(svq);
> > size_t driver_size = vhost_svq_driver_area_size(svq);
> > - int r;
> > + size_t avail_offset;
> > + bool ok;
> >
> > ERRP_GUARD();
> > - vhost_svq_get_vring_addr(svq, addr);
> > + vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
> > - (void *)addr->desc_user_addr, true);
> > - if (unlikely(r != 0)) {
> > - error_setg_errno(errp, -r, "Cannot create vq driver region: ");
> > + driver_region = (DMAMap) {
> > + .translated_addr = svq_addr.desc_user_addr,
> > + .size = driver_size - 1,
>
>
> Any reason for the "-1" here? I see several places do things like that,
> it's probably hint of wrong API somehwere.
>
The "problem" is the api mismatch between _end and _last, to include
the last member in the size or not.
IOVA tree needs to use _end so we can allocate the last page in case
of available range ending in (uint64_t)-1 [1]. But If we change
vhost_svq_{device,driver}_area_size to make it inclusive, we need to
use "+1" in calls like qemu_memalign and memset at vhost_svq_start.
Probably in more places too
QEMU's emulated Intel iommu code solves it using the address mask as
the size, something that does not fit 100% with vhost devices since
they can allocate an arbitrary address of arbitrary size when using
vIOMMU. It's not a problem for vhost-vdpa at this moment since we make
sure we expose unaligned and whole pages with vrings, but I feel it
would only be to move the problem somewhere else.
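To illustrate with made-up numbers (just a sketch of the inclusive
convention, not code from this patch):

/*
 * DMAMap.size uses the "last" (inclusive) convention: the mapping
 * covers [iova, iova + size].  That is what keeps the last page of the
 * 64-bit space representable:
 */
DMAMap last_page = {
    .iova = UINT64_MAX - 4095,
    .size = 4095,          /* 4096 bytes; an exclusive end would wrap to 0 */
};

/*
 * Callers that want a byte count have to add the 1 back, which is why
 * making vhost_svq_{device,driver}_area_size inclusive would sprinkle
 * "+1" over qemu_memalign()/memset() in vhost_svq_start and friends.
 */
size_t bytes = last_page.size + 1;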
Thanks!
[1] There are alternatives: to use Int128_t, etc. But I think it's
better not to change that in this patch series.
> Thanks
>
>
> > + .perm = IOMMU_RO,
> > + };
> > + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> > + if (unlikely(!ok)) {
> > + error_prepend(errp, "Cannot create vq driver region: ");
> > return false;
> > }
> > + addr->desc_user_addr = driver_region.iova;
> > + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
> > + addr->avail_user_addr = driver_region.iova + avail_offset;
> >
> > - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
> > - (void *)addr->used_user_addr, false);
> > - if (unlikely(r != 0)) {
> > - error_setg_errno(errp, -r, "Cannot create vq device region: ");
> > + device_region = (DMAMap) {
> > + .translated_addr = svq_addr.used_user_addr,
> > + .size = device_size - 1,
> > + .perm = IOMMU_RW,
> > + };
> > + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
> > + if (unlikely(!ok)) {
> > + error_prepend(errp, "Cannot create vq device region: ");
> > + vhost_vdpa_svq_unmap_ring(v, &driver_region);
> > }
> > + addr->used_user_addr = device_region.iova;
> >
> > - return r == 0;
> > + return ok;
> > }
> >
> > static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>