qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v5 07/10] hw/vfio/platform: add vfio-platform su


From: Alexander Graf
Subject: Re: [Qemu-devel] [PATCH v5 07/10] hw/vfio/platform: add vfio-platform support
Date: Mon, 11 Aug 2014 11:36:21 +0200
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Thunderbird/31.0


On 09.08.14 16:25, Eric Auger wrote:
Minimal VFIO platform implementation supporting
- register space user mapping,
- IRQ assignment based on eventfds handled on qemu side.

irqfd kernel acceleration comes in a subsequent patch.

Signed-off-by: Kim Phillips <address@hidden>
Signed-off-by: Eric Auger <address@hidden>

---

v4 -> v5:
- vfio-platform.h included first
- cleanup error handling in *populate*, vfio_get_device,
   vfio_enable_intp
- vfio_put_device not called anymore
- add some includes to follow vfio policy

v3 -> v4:
[Eric Auger]
- merge of "vfio: Add initial IRQ support in platform device"
   to get a fully functional patch, although performance is limited.
- removal of unrealize function since I currently understand
   it is only used with device hot-plug feature.

v2 -> v3:
[Eric Auger]
- further factorization between PCI and platform (VFIORegion,
   VFIODevice). same level of functionality.

<= v2:
[Kim Phillips]
- Initial Creation of the device supporting register space mapping
---
  hw/vfio/Makefile.objs           |   1 +
  hw/vfio/platform.c              | 517 ++++++++++++++++++++++++++++++++++++++++
  include/hw/vfio/vfio-platform.h |  77 ++++++
  3 files changed, 595 insertions(+)
  create mode 100644 hw/vfio/platform.c
  create mode 100644 include/hw/vfio/vfio-platform.h

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index e31f30e..c5c76fe 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,5 @@
  ifeq ($(CONFIG_LINUX), y)
  obj-$(CONFIG_SOFTMMU) += common.o
  obj-$(CONFIG_PCI) += pci.o
+obj-$(CONFIG_SOFTMMU) += platform.o
  endif
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
new file mode 100644
index 0000000..f1a1b55
--- /dev/null
+++ b/hw/vfio/platform.c
@@ -0,0 +1,517 @@
+/*
+ * vfio based device assignment support - platform devices
+ *
+ * Copyright Linaro Limited, 2014
+ *
+ * Authors:
+ *  Kim Phillips <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on vfio based PCI device assignment support:
+ *  Copyright Red Hat, Inc. 2012
+ */
+
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+
+#include "hw/vfio/vfio-platform.h"
+#include "qemu/error-report.h"
+#include "qemu/range.h"
+#include "sysemu/sysemu.h"
+#include "exec/memory.h"
+#include "qemu/queue.h"
+#include "hw/sysbus.h"
+
+extern const MemoryRegionOps vfio_region_ops;
+extern const MemoryListener vfio_memory_listener;
+extern QLIST_HEAD(, VFIOGroup) group_list;
+extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces;
+void vfio_put_device(VFIOPlatformDevice *vdev);
+
+/*
+ * Set up the QEMU memory regions backing VFIO region @nr of @vdev.
+ *
+ * It is mandatory to pass a VFIOPlatformDevice since VFIODevice
+ * is not a QOM Object and cannot be passed to memory region functions.
+ *
+ * An I/O memory region using vfio_region_ops is always created first.
+ * vfio_mmap_region() is then asked to add an mmap on top of it; if it
+ * returns non-zero an error is reported and only the I/O region remains.
+ * A region whose size is zero is left untouched.
+ */
+static void vfio_map_region(VFIOPlatformDevice *vdev, int nr)
+{
+    VFIORegion *region = vdev->regions[nr];
+    /*
+     * region->size is copied from the kernel's region info; holding it
+     * in a plain "unsigned" would cut off anything above 32 bits.
+     */
+    uint64_t size = region->size;
+    char name[64];
+
+    if (!size) {
+        return;
+    }
+
+    snprintf(name, sizeof(name), "VFIO %s region %d",
+             vdev->vbasedev.name, nr);
+
+    /* A "slow" read/write mapping underlies all regions */
+    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
+                          region, name, size);
+
+    /* reuse the buffer for the mmap sub-region name, bounded to its size */
+    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
+
+    if (vfio_mmap_region(OBJECT(vdev), region, &region->mem,
+                         &region->mmap_mem, &region->mmap, size, 0, name)) {
+        error_report("%s unsupported. Performance may be slow", name);
+    }
+}
+
+/*
+ * Debug helper: print through DPRINTF the number of regions of @vdev,
+ * then the index, flags, size, fd and fd offset of each region.
+ */
+static void print_regions(VFIOPlatformDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    int idx = 0;
+
+    DPRINTF("Device \"%s\" counts %d region(s):\n",
+             vbasedev->name, vbasedev->num_regions);
+
+    while (idx < vbasedev->num_regions) {
+        /* the casts match the %lx conversions of the format string */
+        DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, "
+                "fd= %d, offset = 0x%lx\n",
+                vdev->regions[idx]->nr,
+                (unsigned long)vdev->regions[idx]->flags,
+                (unsigned long)vdev->regions[idx]->size,
+                vdev->regions[idx]->vbasedev->fd,
+                (unsigned long)vdev->regions[idx]->fd_offset);
+        idx++;
+    }
+}
+
+/*
+ * Query the kernel for every region of @vbasedev and record flags, size,
+ * fd offset and index in the regions array of the enclosing
+ * VFIOPlatformDevice.
+ *
+ * Returns 0 on success.  If VFIO_DEVICE_GET_REGION_INFO fails, every
+ * VFIORegion allocated so far is released, vdev->regions is reset to
+ * NULL and a negative errno value is returned.
+ */
+static int vfio_populate_regions(VFIODevice *vbasedev)
+{
+    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+    int i, ret = 0;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+
+    vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions);
+
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
+        reg_info.index = i;
+        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+        if (ret) {
+            /* latch errno before any other call can overwrite it */
+            ret = -errno;
+            error_report("vfio: Error getting region %d info: %m", i);
+            goto error;
+        }
+
+        vdev->regions[i]->flags = reg_info.flags;
+        vdev->regions[i]->size = reg_info.size;
+        vdev->regions[i]->fd_offset = reg_info.offset;
+        vdev->regions[i]->nr = i;
+        vdev->regions[i]->vbasedev = vbasedev;
+    }
+    print_regions(vdev);
+    return 0;
+
+error:
+    /* slots past the failing index are still NULL (g_malloc0 above) */
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        g_free(vdev->regions[i]);
+    }
+    g_free(vdev->regions);
+    vdev->regions = NULL;
+    return ret;
+}
+
+/* not implemented yet: placeholder that ignores @vdev and returns 0 */
+static int vfio_platform_check_device(VFIODevice *vdev)
+{
+    return 0;
+}
+
+/* placeholder, not implemented yet: @vdev is ignored, false is returned */
+static bool vfio_platform_compute_needs_reset(VFIODevice *vdev)
+{
+    return false;
+}
+
+/* placeholder, not implemented yet: @vdev is ignored, 0 is returned */
+static int vfio_platform_hot_reset_multi(VFIODevice *vdev)
+{
+    return 0;
+}
+
+/*
+ * eoi function is called on the first access to any MMIO region
+ * after an IRQ was triggered. It is assumed this access corresponds
+ * to the IRQ status register reset.
+ * With such a mechanism, a single IRQ can be handled at a time since
+ * there is no way to know which IRQ was completed by the guest.
+ * (we would need additional details about the IRQ status register mask)
+ *
+ * The first VFIOINTp found in the VFIO_IRQ_ACTIVE state is switched back
+ * to VFIO_IRQ_INACTIVE, its virtual IRQ is deasserted and the physical
+ * one is unmasked.  Then the oldest entry of the pending queue, if any,
+ * is dequeued and handed to vfio_intp_interrupt().
+ */
+static void vfio_platform_eoi(VFIODevice *vbasedev)
+{
+    VFIOINTp *intp;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+
+    QLIST_FOREACH(intp, &vdev->intp_list, next) {
+        if (intp->state == VFIO_IRQ_ACTIVE) {
+            DPRINTF("EOI IRQ #%d fd=%d\n",
+                    intp->pin, event_notifier_get_fd(&intp->interrupt));
+            intp->state = VFIO_IRQ_INACTIVE;
+
+            /* deassert the virtual IRQ and unmask physical one */
+            qemu_set_irq(intp->qemuirq, 0);
+            vfio_unmask_irqindex(vbasedev, intp->pin);
+
+            /* a single IRQ can be active at a time */
+            break;
+        }
+    }
+
+    /* in case there are pending IRQs, handle them one at a time */
+    if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
+        intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
+        /*
+         * Unlink the entry before dispatching it: vfio_intp_interrupt()
+         * may append it to this very queue again, which must not happen
+         * while it is still linked in as the head.
+         */
+        QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
+        vfio_intp_interrupt(intp);
+    }
+}
+
+/*
+ * Select the fast or the slow path for every region of @vdev by
+ * enabling (@enabled true) or disabling (@enabled false) the mmap
+ * memory region of each of them.
+ * fast path = MMIO region is mmaped (no KVM TRAP)
+ * slow path = MMIO region is trapped and region callbacks are called;
+ * this is what allows trapping the guest reset of the IRQ status register
+ */
+static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
+{
+    int idx;
+
+    DPRINTF("fast path = %d\n", enabled);
+
+    for (idx = 0; idx < vdev->vbasedev.num_regions; idx++) {
+        /* register space is unmapped to trap EOI */
+        memory_region_set_enabled(&vdev->regions[idx]->mmap_mem, enabled);
+    }
+}
+
+/*
+ * mmap_timer callback; @opaque is the VFIOPlatformDevice.
+ *
+ * Looks for an IRQ in the VFIO_IRQ_ACTIVE state.  If there is one, the
+ * register space must stay trapped (slow path) to catch the IRQ status
+ * register reset, so the timer is simply re-armed mmap_timeout ms from
+ * now.  Otherwise the fast path (mmaped register space) is restored.
+ * By construction a single IRQ is handled at a time, see the EOI
+ * function for additional details.
+ */
+static void vfio_intp_mmap_enable(void *opaque)
+{
+    VFIOPlatformDevice *vdev = opaque;
+    VFIOINTp *intp;
+    bool still_active = false;
+
+    QLIST_FOREACH(intp, &vdev->intp_list, next) {
+        if (intp->state == VFIO_IRQ_ACTIVE) {
+            DPRINTF("IRQ #%d still active, stay in slow path\n",
+                    intp->pin);
+            still_active = true;
+            break;
+        }
+    }
+
+    if (!still_active) {
+        DPRINTF("no active IRQ, restore fast path\n");
+        vfio_mmap_set_enabled(vdev, true);
+        return;
+    }
+
+    /* check again once another timeout period has elapsed */
+    timer_mod(vdev->mmap_timer,
+              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->mmap_timeout);
+}
+
+/*
+ * The fd handler; @opaque is the VFIOINTp whose eventfd was signalled.
+ *
+ * If another IRQ of the same device is currently active, the new one is
+ * marked pending and appended to the pending queue.  Otherwise it
+ * becomes the active IRQ: the regions are switched to the slow path,
+ * the virtual IRQ is raised and, when mmap_timeout is non-zero, the
+ * mmap timer is armed.
+ */
+void vfio_intp_interrupt(void *opaque)
+{
+    int ret;
+    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
+    VFIOPlatformDevice *vdev = intp->vdev;
+    bool one_active_irq = false;
+
+    /*
+     * first check whether an IRQ is already active; if so the new IRQ
+     * cannot be handled until the active one is completed.
+     * by construction the same IRQ as the pending one cannot hit
+     * since the physical IRQ was disabled by the VFIO driver
+     */
+    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
+        if (tmp->state == VFIO_IRQ_ACTIVE) {
+            one_active_irq = true;
+            break;
+        }
+    }
+    if (one_active_irq) {
+        /*
+         * the new IRQ gets a pending status and is pushed in
+         * the pending queue
+         */
+        intp->state = VFIO_IRQ_PENDING;
+        QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
+                             intp, pqnext);
+        /*
+         * Consume the event now.  Left signalled, the eventfd stays
+         * readable, so this handler would be invoked again and would
+         * queue the same VFIOINTp a second time.
+         */
+        event_notifier_test_and_clear(&intp->interrupt);
+        return;
+    }
+
+    /* no active IRQ, the new IRQ can be forwarded to the guest */
+    DPRINTF("Handle IRQ #%d (fd = %d)\n",
+            intp->pin, event_notifier_get_fd(&intp->interrupt));
+
+    /* only traced: a zero result does not prevent the IRQ delivery */
+    ret = event_notifier_test_and_clear(&intp->interrupt);
+    if (!ret) {
+        DPRINTF("Error when clearing fd=%d\n",
+                event_notifier_get_fd(&intp->interrupt));
+    }
+
+    intp->state = VFIO_IRQ_ACTIVE;
+
+    /* sets slow path */
+    vfio_mmap_set_enabled(vdev, false);
+
+    /* trigger the virtual IRQ */
+    qemu_set_irq(intp->qemuirq, 1);
+
+    /* schedule the mmap timer which will restore mmap path after EOI*/
+    if (vdev->mmap_timeout) {
+        timer_mod(vdev->mmap_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                      vdev->mmap_timeout);
+    }
+}
+
+/*
+ * Create the VFIOINTp tracking IRQ @index of @vbasedev, bind an eventfd
+ * to that IRQ in the kernel driver and install vfio_intp_interrupt() as
+ * the handler of this eventfd.
+ *
+ * Returns 0 on success, in which case the new VFIOINTp has been added
+ * to vdev->intp_list.  On failure everything set up here is undone and
+ * a non-zero value is returned (the event_notifier_init() result, or a
+ * negative errno if VFIO_DEVICE_SET_IRQS failed).
+ */
+static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
+{
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret, argsz, fd;
+    int device = vbasedev->fd;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
+    VFIOINTp *intp;
+
+    /* allocate and populate a new VFIOINTp structure put in a queue list */
+    intp = g_malloc0(sizeof(*intp));
+    intp->vdev = vdev;
+    intp->pin = index;
+    intp->state = VFIO_IRQ_INACTIVE;
+
+    ret = event_notifier_init(&intp->interrupt, 0);
+    if (ret) {
+        g_free(intp);
+        error_report("vfio: Error: event_notifier_init failed ");
+        return ret;
+    }
+    /* keep a copy: irq_set, which also stores it, is freed further down */
+    fd = event_notifier_get_fd(&intp->interrupt);
+
+    /* build the irq_set to be passed to the vfio kernel driver */
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = index;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = fd;
+
+    DPRINTF("register fd=%d/irq index=%d to kernel\n", fd, index);
+
+    qemu_set_fd_handler(fd, vfio_intp_interrupt, NULL, intp);
+
+    /*
+     * pass the index/fd binding to the kernel driver so that it
+     * triggers this fd on HW IRQ
+     */
+    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        /* latch errno before g_free() or anything else can change it */
+        ret = -errno;
+    }
+    g_free(irq_set);
+    if (ret) {
+        errno = -ret; /* %m below formats the current errno */
+        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&intp->interrupt);
+        g_free(intp);
+        return ret;
+    }
+
+    /*
+     * sysbus_init_irq() is handed the address of intp->qemuirq, so it is
+     * only called once it is certain that intp will not be freed.
+     */
+    sysbus_init_irq(sbdev, &intp->qemuirq);
+
+    /* store the new intp in qlist */
+    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
+    return 0;
+}
+
+/*
+ * Create the mmap timer and the pending IRQ queue of the device, then
+ * query the kernel for each of the num_irqs IRQs of @vbasedev and set
+ * every one of them up with vfio_enable_intp().
+ *
+ * An IRQ whose info cannot be retrieved is reported and skipped.
+ * Returns 0, or the vfio_enable_intp() error that stopped the setup.
+ */
+static int vfio_populate_interrupts(VFIODevice *vbasedev)
+{
+    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+    int i, ret;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+
+    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                    vfio_intp_mmap_enable, vdev);
+
+    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
+
+    for (i = 0; i < vbasedev->num_irqs; i++) {
+        irq.index = i;
+
+        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
+
+        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+        if (ret) {
+            /*
+             * Kept non-fatal on purpose (e.g. an old kernel may not
+             * support this ioctl): report it and go on with the next
+             * IRQ.  error_report() is used as everywhere else in this
+             * file so that the message is properly terminated.
+             */
+            error_report("vfio: error getting device %s irq info",
+                         vbasedev->name);
+            continue;
+        }
+
+        DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
+                irq.index, irq.count, irq.flags);
+
+        ret = vfio_enable_intp(vbasedev, irq.index);
+        if (ret) {
+            error_report("vfio: Error setting IRQ %d up", i);
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Platform flavour of the VFIODevice callbacks; vfio_platform_realize()
+ * installs this table in vbasedev->ops.
+ */
+static VFIODeviceOps vfio_platform_ops = {
+    /* device discovery */
+    .vfio_check_device = vfio_platform_check_device,
+    .vfio_populate_regions = vfio_populate_regions,
+    .vfio_populate_interrupts = vfio_populate_interrupts,
+    /* interrupt completion */
+    .vfio_eoi = vfio_platform_eoi,
+    /* reset hooks (placeholders for now) */
+    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
+    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
+};
+
+/*
+ * Locate the host platform device named vbasedev->name under
+ * /sys/bus/platform/devices, work out its IOMMU group from the
+ * "iommu_group" symlink, get that group and attach the device to it
+ * through vfio_get_device().
+ *
+ * vbasedev->name must be set prior to the call.
+ * Returns 0 on success and a negative errno value on every failure.
+ */
+static int vfio_base_device_init(VFIODevice *vbasedev)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev_iter;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+
+    /* name must be set prior to the call */
+    if (!vbasedev->name) {
+        return -EINVAL;
+    }
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
+             vbasedev->name);
+
+    if (stat(path, &st) < 0) {
+        /* latch errno: error_report() is free to modify it */
+        ret = -errno;
+        error_report("vfio: error: no such host device: %s", path);
+        return ret;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+    len = readlink(path, iommu_group_path, sizeof(iommu_group_path));
+    if (len <= 0 || (size_t)len >= sizeof(iommu_group_path)) {
+        /* a completely filled buffer leaves no room for the final NUL */
+        ret = len < 0 ? -errno : -ENAMETOOLONG;
+        error_report("vfio: error no iommu_group for device");
+        return ret;
+    }
+
+    /* readlink() does not NUL-terminate its result */
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path);
+
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        /*
+         * A matching failure is not reported through errno, so neither
+         * %m nor -errno (possibly 0, i.e. success) can be used here.
+         */
+        error_report("vfio: error reading %s", path);
+        return -EINVAL;
+    }
+
+    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
+
+    group = vfio_get_group(groupid, &address_space_memory);
+    if (!group) {
+        error_report("vfio: failed to get group %d", groupid);
+        return -ENOENT;
+    }
+
+    snprintf(path, sizeof(path), "%s", vbasedev->name);
+
+    /* refuse to attach the same host device twice */
+    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
+            error_report("vfio: error: device %s is already attached", path);
+            vfio_put_group(group);
+            return -EBUSY;
+        }
+    }
+    ret = vfio_get_device(group, path, vbasedev);
+    if (ret) {
+        error_report("vfio: failed to get device %s", path);
+        vfio_put_group(group);
+    }
+    return ret;
+}
+
+/*
+ * Free every VFIORegion of @vdev, the array holding them and the device
+ * name, then hand the embedded VFIODevice to vfio_put_base_device().
+ */
+void vfio_put_device(VFIOPlatformDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    unsigned int idx = 0;
+
+    while (idx < vbasedev->num_regions) {
+        g_free(vdev->regions[idx++]);
+    }
+    g_free(vdev->regions);
+    g_free(vbasedev->name);
+    vfio_put_base_device(vbasedev);
+}
+
+/*
+ * Realize callback of the vfio platform device.
+ *
+ * Marks the embedded VFIODevice as a platform device, installs the
+ * platform callbacks and initializes it through
+ * vfio_base_device_init(); then each region is mapped and exported as
+ * a sysbus MMIO region.
+ *
+ * @errp is set when vfio_base_device_init() fails, so that the failure
+ * is visible to the caller instead of being silently dropped.
+ */
+static void vfio_platform_realize(DeviceState *dev, Error **errp)
+{
+    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    int i, ret;
+
+    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
+    vbasedev->ops = &vfio_platform_ops;
+
+    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
+
+    ret = vfio_base_device_init(vbasedev);
+    if (ret) {
+        /* the name may legitimately be unset: that is one failure cause */
+        error_setg(errp, "vfio: failed to initialize device %s (error %d)",
+                   vbasedev->name ? vbasedev->name : "(unnamed)", ret);
+        return;
+    }
+
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        vfio_map_region(vdev, i);
+        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
+    }
+}
+
+static const VMStateDescription vfio_platform_vmstate = {
+    .name = TYPE_VFIO_PLATFORM,
+    .unmigratable = 1, /* flags the device as not migratable */
+};
+
+static Property vfio_platform_dev_properties[] = { /* used as dc->props */
+    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
+    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
+    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
+                       mmap_timeout, 1100), /* 0: mmap timer never armed */
+    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initializer: fills the DeviceClass with the description, the
+ * vmstate, the properties and the realize callback of the vfio platform
+ * device and files it under the "misc" device category.
+ */
+static void vfio_platform_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_MISC, dev_class->categories);
+    dev_class->desc = "VFIO-based platform device assignment";
+    dev_class->vmsd = &vfio_platform_vmstate;
+    dev_class->props = vfio_platform_dev_properties;
+    dev_class->realize = vfio_platform_realize;
+}
+
+static const TypeInfo vfio_platform_dev_info = {
+    .name = TYPE_VFIO_PLATFORM,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(VFIOPlatformDevice),
+    .class_init = vfio_platform_class_init,
+    .class_size = sizeof(VFIOPlatformDeviceClass),

This should be an abstract class. People must never instantiate a generic "vfio-platform" device. Only "vfio-xgmac", "vfio-etsec", etc. devices should be exposed to the user.


Alex




reply via email to

[Prev in Thread] Current Thread [Next in Thread]