[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v3 5/6] vfio/quirks: ioeventfd quirk acceleratio
From: |
Auger Eric |
Subject: |
Re: [Qemu-devel] [PATCH v3 5/6] vfio/quirks: ioeventfd quirk acceleration |
Date: |
Fri, 4 May 2018 09:51:18 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.4.0 |
Hi,
On 05/03/2018 11:45 PM, Alex Williamson wrote:
> The NVIDIA BAR0 quirks virtualize the PCI config space mirrors found
> in device MMIO space. Normally PCI config space is considered a slow
> path and further optimization is unnecessary, however NVIDIA uses a
> register here to enable the MSI interrupt to re-trigger. Exiting to
> QEMU for this MSI-ACK handling can therefore rate limit our interrupt
> handling. Fortunately the MSI-ACK write is easily detected since the
> quirk MemoryRegion otherwise has very few accesses, so simply looking
> for consecutive writes with the same data is sufficient, in this case
> 10 consecutive writes with the same data and size is arbitrarily
> chosen. We configure the KVM ioeventfd with data match, so there's
> no risk of triggering for the wrong data or size, but we do risk that
> pathological driver behavior might consume all of QEMU's file
> descriptors, so we cap ourselves to 10 ioeventfds for this purpose.
>
> In support of the above, generic ioeventfd infrastructure is added
> for vfio quirks. This automatically initializes an ioeventfd list
> per quirk, disables and frees ioeventfds on exit, and allows
> ioeventfds marked as dynamic to be dropped on device reset. The
> rationale for this latter feature is that useful ioeventfds may
> depend on specific driver behavior and since we necessarily place a
> cap on our use of ioeventfds, a machine reset is a reasonable point
> at which to assume a new driver and re-profile.
>
> Signed-off-by: Alex Williamson <address@hidden>
> ---
> hw/vfio/pci-quirks.c | 166
> +++++++++++++++++++++++++++++++++++++++++++++++++-
> hw/vfio/pci.c | 2 +
> hw/vfio/pci.h | 14 ++++
> hw/vfio/trace-events | 3 +
> 4 files changed, 183 insertions(+), 2 deletions(-)
>
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index f0947cbf152f..f7886487744e 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -12,6 +12,7 @@
>
> #include "qemu/osdep.h"
> #include "qemu/error-report.h"
> +#include "qemu/main-loop.h"
> #include "qemu/range.h"
> #include "qapi/error.h"
> #include "qapi/visitor.h"
> @@ -202,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk {
> uint32_t offset;
> uint8_t bar;
> MemoryRegion *mem;
> + uint8_t data[];
> } VFIOConfigMirrorQuirk;
>
> static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
> @@ -278,12 +280,95 @@ static const MemoryRegionOps vfio_ati_3c3_quirk = {
> static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
> {
> VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
> + QLIST_INIT(&quirk->ioeventfds);
> quirk->mem = g_new0(MemoryRegion, nr_mem);
> quirk->nr_mem = nr_mem;
>
> return quirk;
> }
>
> +static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> +{
> + QLIST_REMOVE(ioeventfd, next);
> + memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr,
> ioeventfd->size,
> + true, ioeventfd->data, &ioeventfd->e);
> + qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL,
> NULL);
> + event_notifier_cleanup(&ioeventfd->e);
> + trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
> + (uint64_t)ioeventfd->addr, ioeventfd->size,
> + ioeventfd->data);
> + g_free(ioeventfd);
> +}
> +
> +static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
> +{
> + VFIOIOEventFD *ioeventfd, *tmp;
> +
> + QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
> + if (ioeventfd->dynamic) {
> + vfio_ioeventfd_exit(ioeventfd);
> + }
> + }
> +}
> +
> +static void vfio_ioeventfd_handler(void *opaque)
> +{
> + VFIOIOEventFD *ioeventfd = opaque;
> +
> + if (event_notifier_test_and_clear(&ioeventfd->e)) {
> + vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
> + ioeventfd->data, ioeventfd->size);
> + trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
> + (uint64_t)ioeventfd->addr,
> ioeventfd->size,
> + ioeventfd->data);
> + }
> +}
> +
> +static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
> + MemoryRegion *mr, hwaddr addr,
> + unsigned size, uint64_t data,
> + VFIORegion *region,
> + hwaddr region_addr, bool dynamic)
> +{
> + VFIOIOEventFD *ioeventfd;
> +
> + if (vdev->no_kvm_ioeventfd) {
> + return NULL;
> + }
> +
> + ioeventfd = g_malloc0(sizeof(*ioeventfd));
> +
> + if (event_notifier_init(&ioeventfd->e, 0)) {
> + g_free(ioeventfd);
> + return NULL;
> + }
> +
> + /*
> + * MemoryRegion and relative offset, plus additional ioeventfd setup
> + * parameters for configuring and later tearing down KVM ioeventfd.
> + */
> + ioeventfd->mr = mr;
> + ioeventfd->addr = addr;
> + ioeventfd->size = size;
> + ioeventfd->data = data;
> + ioeventfd->dynamic = dynamic;
> + /*
> + * VFIORegion and relative offset for implementing the userspace
> + * handler. data & size fields shared for both uses.
> + */
> + ioeventfd->region = region;
> + ioeventfd->region_addr = region_addr;
> +
> + qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> + vfio_ioeventfd_handler, NULL, ioeventfd);
> + memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr,
> ioeventfd->size,
> + true, ioeventfd->data, &ioeventfd->e);
> + trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
> + size, data);
> +
> + return ioeventfd;
> +}
> +
> static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
> {
> VFIOQuirk *quirk;
> @@ -719,6 +804,18 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice
> *vdev, int nr)
> trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
> }
>
> +typedef struct LastDataSet {
> + VFIOQuirk *quirk;
> + hwaddr addr;
> + uint64_t data;
> + unsigned size;
> + int hits;
> + int added;
> +} LastDataSet;
> +
> +#define MAX_DYN_IOEVENTFD 10
> +#define HITS_FOR_IOEVENTFD 10
> +
> /*
> * Finally, BAR0 itself. We want to redirect any accesses to either
> * 0x1800 or 0x88000 through the PCI config space access functions.
> @@ -729,6 +826,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque,
> hwaddr addr,
> VFIOConfigMirrorQuirk *mirror = opaque;
> VFIOPCIDevice *vdev = mirror->vdev;
> PCIDevice *pdev = &vdev->pdev;
> + LastDataSet *last = (LastDataSet *)&mirror->data;
>
> vfio_generic_quirk_mirror_write(opaque, addr, data, size);
>
> @@ -743,6 +841,49 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque,
> hwaddr addr,
> addr + mirror->offset, data, size);
> trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
> }
> +
> + /*
> + * Automatically add an ioeventfd to handle any repeated write with the
> + * same data and size above the standard PCI config space header. This
> is
> + * primarily expected to accelerate the MSI-ACK behavior, such as noted
> + * above. Current hardware/drivers should trigger an ioeventfd at config
> + * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
> + *
> + * The criteria of 10 successive hits is arbitrary but reliably adds the
> + * MSI-ACK region. Note that as some writes are bypassed via the
> ioeventfd,
> + * the remaining ones have a greater chance of being seen successively.
> + * To avoid the pathological case of burning up all of QEMU's open file
> + * handles, arbitrarily limit this algorithm from adding no more than 10
> + * ioeventfds, print an error if we would have added an 11th, and then
> + * stop counting.
> + */
> + if (!vdev->no_kvm_ioeventfd &&
> + addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
> + if (addr != last->addr || data != last->data || size != last->size) {
> + last->addr = addr;
> + last->data = data;
> + last->size = size;
> + last->hits = 1;
> + } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
> + if (last->added < MAX_DYN_IOEVENTFD) {
> + VFIOIOEventFD *ioeventfd;
> + ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr,
> size,
> + data,
> &vdev->bars[mirror->bar].region,
> + mirror->offset + addr, true);
> + if (ioeventfd) {
> + VFIOQuirk *quirk = last->quirk;
> +
> + QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
> + last->added++;
> + }
> + } else {
> + last->added++;
> + warn_report("NVIDIA ioeventfd queue full for %s, unable to "
> + "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
> + "size %u", vdev->vbasedev.name, addr, data,
> size);
> + }
> + }
> + }
> }
>
> static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
> @@ -751,10 +892,21 @@ static const MemoryRegionOps vfio_nvidia_mirror_quirk =
> {
> .endianness = DEVICE_LITTLE_ENDIAN,
> };
>
> +static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk
> *quirk)
> +{
> + VFIOConfigMirrorQuirk *mirror = quirk->data;
> + LastDataSet *last = (LastDataSet *)&mirror->data;
> +
> + last->addr = last->data = last->size = last->hits = last->added = 0;
> +
> + vfio_drop_dynamic_eventfds(vdev, quirk);
> +}
> +
> static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
> {
> VFIOQuirk *quirk;
> VFIOConfigMirrorQuirk *mirror;
> + LastDataSet *last;
>
> if (vdev->no_geforce_quirks ||
> !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
> @@ -763,11 +915,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice
> *vdev, int nr)
> }
>
> quirk = vfio_quirk_alloc(1);
> - mirror = quirk->data = g_malloc0(sizeof(*mirror));
> + quirk->reset = vfio_nvidia_bar0_quirk_reset;
> + mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
> mirror->mem = quirk->mem;
> mirror->vdev = vdev;
> mirror->offset = 0x88000;
> mirror->bar = nr;
> + last = (LastDataSet *)&mirror->data;
> + last->quirk = quirk;
>
> memory_region_init_io(mirror->mem, OBJECT(vdev),
> &vfio_nvidia_mirror_quirk, mirror,
> @@ -781,11 +936,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice
> *vdev, int nr)
> /* The 0x1800 offset mirror only seems to get used by legacy VGA */
> if (vdev->vga) {
> quirk = vfio_quirk_alloc(1);
> - mirror = quirk->data = g_malloc0(sizeof(*mirror));
> + quirk->reset = vfio_nvidia_bar0_quirk_reset;
> + mirror = quirk->data = g_malloc0(sizeof(*mirror) +
> sizeof(LastDataSet));
> mirror->mem = quirk->mem;
> mirror->vdev = vdev;
> mirror->offset = 0x1800;
> mirror->bar = nr;
> + last = (LastDataSet *)&mirror->data;
> + last->quirk = quirk;
>
> memory_region_init_io(mirror->mem, OBJECT(vdev),
> &vfio_nvidia_mirror_quirk, mirror,
> @@ -1668,6 +1826,10 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
> int i;
>
> QLIST_FOREACH(quirk, &bar->quirks, next) {
> + while (!QLIST_EMPTY(&quirk->ioeventfds)) {
> + vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
> + }
> +
> for (i = 0; i < quirk->nr_mem; i++) {
> memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
> }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 65446fb42845..ba1239551115 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3175,6 +3175,8 @@ static Property vfio_pci_dev_properties[] = {
> DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
> DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
> no_geforce_quirks, false),
> + DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> + false),
> DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id,
> PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id,
> PCI_ANY_ID),
> DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 594a5bd00593..a4ac583fbd6e 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -24,9 +24,22 @@
>
> struct VFIOPCIDevice;
>
> +typedef struct VFIOIOEventFD {
> + QLIST_ENTRY(VFIOIOEventFD) next;
> + MemoryRegion *mr;
> + hwaddr addr;
> + unsigned size;
> + uint64_t data;
> + EventNotifier e;
> + VFIORegion *region;
> + hwaddr region_addr;
> + bool dynamic; /* Added runtime, removed on device reset */
> +} VFIOIOEventFD;
> +
> typedef struct VFIOQuirk {
> QLIST_ENTRY(VFIOQuirk) next;
> void *data;
> + QLIST_HEAD(, VFIOIOEventFD) ioeventfds;
> int nr_mem;
> MemoryRegion *mem;
> void (*reset)(struct VFIOPCIDevice *vdev, struct VFIOQuirk *quirk);
> @@ -149,6 +162,7 @@ typedef struct VFIOPCIDevice {
> bool no_kvm_msi;
> bool no_kvm_msix;
> bool no_geforce_quirks;
> + bool no_kvm_ioeventfd;
> VFIODisplay *dpy;
> } VFIOPCIDevice;
>
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 20109cb7581f..f8f97d1ff90c 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -77,6 +77,9 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s"
> vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s"
> vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
> vfio_quirk_ati_bonaire_reset(const char *name) "%s"
> +vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t
> data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
> +vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size,
> uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
> +vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t
> data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
> vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data,
> uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
> vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
> vfio_pci_igd_opregion_enabled(const char *name) "%s"
>
>
Reviewed-by: Eric Auger <address@hidden>
Thanks
Eric
- [Qemu-devel] [PATCH v3 0/6] vfio/quirks: ioeventfd support, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 1/6] scripts/update-linux-headers: Update handling of __aligned_u64, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 3/6] vfio/quirks: Add common quirk alloc helper, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 2/6] linux-headers: Update to v4.17-rc3, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 4/6] vfio/quirks: Add quirk reset callback, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 5/6] vfio/quirks: ioeventfd quirk acceleration, Alex Williamson, 2018/05/03
- [Qemu-devel] [PATCH v3 6/6] vfio/quirks: Enable ioeventfd quirks to be handled by vfio directly, Alex Williamson, 2018/05/03