qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC v2 3/6] vfio: add vfio-platform support


From: Eric Auger
Subject: Re: [Qemu-devel] [RFC v2 3/6] vfio: add vfio-platform support
Date: Tue, 20 May 2014 08:44:26 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.2.0

On 04/18/2014 04:54 PM, Alex Williamson wrote:
> On Wed, 2014-04-09 at 16:33 +0100, Eric Auger wrote:
>> From: Kim Phillips <address@hidden>
>>
>> Functions for which PCI and platform device support share are moved
>> into common.c.  The common vfio_{get,put}_group() get an additional
>> argument, a pointer to a vfio_reset_handler(), for which to pass on to
>> qemu_register_reset, but only if it exists (the platform device code
>> currently passes a NULL as its reset_handler).
>>
>> For the platform device code, we basically use SysBusDevice
>> instead of PCIDevice.  Since realize() returns void, unlike
>> PCIDevice's initfn, error codes are moved into the
>> error message text with %m.
>>
>> Currently only MMIO access is supported at this time.
>>
>> The perceived path for future QEMU development is:
>>
>> - add support for interrupts
>> - verify and test platform dev unmap path
>> - test existing PCI path for any regressions
>> - add support for creating platform devices on the qemu command line
>>   - currently device address specification is hardcoded for test
>>     development on Calxeda Midway's fff51000.ethernet device
>> - reset is not supported and registration of reset functions is
>>   bypassed for platform devices.
>>   - there is no standard means of resetting a platform device,
>>     unsure if it suffices to be handled at device--VFIO binding time
>>
>> Signed-off-by: Kim Phillips <address@hidden>
>>
>> [1] http://www.spinics.net/lists/kvm-arm/msg08195.html
>> ---
>>  hw/vfio/Makefile.objs |   2 +
>>  hw/vfio/common.c      | 486 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/pci.c         | 480 
>> ++-----------------------------------------------
>>  hw/vfio/platform.c    | 381 +++++++++++++++++++++++++++++++++++++++
>>  hw/vfio/vfio-common.h |  55 ++++++
>>  5 files changed, 937 insertions(+), 467 deletions(-)
>>  create mode 100644 hw/vfio/common.c
>>  create mode 100644 hw/vfio/platform.c
>>  create mode 100644 hw/vfio/vfio-common.h
>>
>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
>> index 31c7dab..c5c76fe 100644
>> --- a/hw/vfio/Makefile.objs
>> +++ b/hw/vfio/Makefile.objs
>> @@ -1,3 +1,5 @@
>>  ifeq ($(CONFIG_LINUX), y)
>> +obj-$(CONFIG_SOFTMMU) += common.o
>>  obj-$(CONFIG_PCI) += pci.o
>> +obj-$(CONFIG_SOFTMMU) += platform.o
>>  endif
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> new file mode 100644
>> index 0000000..9d1f723
>> --- /dev/null
>> +++ b/hw/vfio/common.c
>> @@ -0,0 +1,486 @@
>> +/*
>> + * vfio based device assignment support
>> + *
>> + * Copyright Red Hat, Inc. 2012
>> + *
>> + * Authors:
>> + *  Alex Williamson <address@hidden>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on qemu-kvm device-assignment:
>> + *  Adapted for KVM by Qumranet.
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (address@hidden)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (address@hidden)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (address@hidden)
>> + */
>> +
>> +#include <dirent.h>
>> +#include <linux/vfio.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/stat.h>
>> +#include <sys/types.h>
>> +#include <unistd.h>
>> +
>> +#include "config.h"
>> +#include "exec/address-spaces.h"
>> +#include "exec/memory.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +#include "hw/pci/pci.h"
> 
> I expect these pci includes aren't really needed

Hi Alex,

thank you for your review and sorry for this reply's delay. I am going
to maintain & update Kim's patch from now on. I am currently preparing
v3 taking into account your comments and Kim's ones.

The headers you mentioned could be removed. Only the following ones are
needed:

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "sys/mman.h"
#include "exec/address-spaces.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
#include "vfio-common.h"

> 
>> +#include "qemu-common.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/event_notifier.h"
>> +#include "qemu/queue.h"
>> +#include "qemu/range.h"
>> +#include "sysemu/kvm.h"
>> +#include "sysemu/sysemu.h"
>> +
>> +#include "vfio-common.h"
>> +
>> +#define DEBUG_VFIO
>> +#ifdef DEBUG_VFIO
>> +#define DPRINTF(fmt, ...) \
>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>> +#else
>> +#define DPRINTF(fmt, ...) \
>> +    do { } while (0)
>> +#endif
> 
> DEBUG_VFIO should probably be in vfio-common.h
done
> 
>> +
>> +static QLIST_HEAD(, VFIOContainer)
>> +    container_list = QLIST_HEAD_INITIALIZER(container_list);
>> +
>> +QLIST_HEAD(, VFIOGroup)
>> +    group_list = QLIST_HEAD_INITIALIZER(group_list);
>> +
>> +
>> +struct VFIODevice;
> 
> I don't see where this is needed.
removed
> 
>> +
>> +#ifdef CONFIG_KVM
>> +/*
>> + * We have a single VFIO pseudo device per KVM VM.  Once created it lives
>> + * for the life of the VM.  Closing the file descriptor only drops our
>> + * reference to it and the device's reference to kvm.  Therefore once
>> + * initialized, this file descriptor is only released on QEMU exit and
>> + * we'll re-use it should another vfio device be attached before then.
>> + */
>> +static int vfio_kvm_device_fd = -1;
>> +#endif
>> +
>> +/*
>> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>> + */
>> +static int vfio_dma_unmap(VFIOContainer *container,
>> +                          hwaddr iova, ram_addr_t size)
>> +{
>> +    struct vfio_iommu_type1_dma_unmap unmap = {
>> +        .argsz = sizeof(unmap),
>> +        .flags = 0,
>> +        .iova = iova,
>> +        .size = size,
>> +    };
>> +
>> +    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
>> +        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
>> +        return -errno;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> +                        ram_addr_t size, void *vaddr, bool readonly)
>> +{
>> +    struct vfio_iommu_type1_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .flags = VFIO_DMA_MAP_FLAG_READ,
>> +        .vaddr = (__u64)(uintptr_t)vaddr,
>> +        .iova = iova,
>> +        .size = size,
>> +    };
>> +
>> +    if (!readonly) {
>> +        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>> +    }
>> +
>> +    /*
>> +     * Try the mapping, if it fails with EBUSY, unmap the region and try
>> +     * again.  This shouldn't be necessary, but we sometimes see it in
>> +     * the the VGA ROM space.
>> +     */
>> +    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
>> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
>> +         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
>> +        return 0;
>> +    }
>> +
>> +    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
>> +    return -errno;
>> +}
>> +
>> +static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>> +{
>> +    return !memory_region_is_ram(section->mr) ||
>> +           /*
>> +            * Sizing an enabled 64-bit BAR can cause spurious mappings to
>> +            * addresses in the upper part of the 64-bit address space.  
>> These
>> +            * are never accessed by the CPU and beyond the address width of
>> +            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU 
>> width.
>> +            */
>> +           section->offset_within_address_space & (1ULL << 63);
>> +}
>> +
>> +static void vfio_listener_region_add(MemoryListener *listener,
>> +                                     MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            iommu_data.type1.listener);
>> +    hwaddr iova, end;
>> +    void *vaddr;
>> +    int ret;
>> +
>> +    assert(!memory_region_is_iommu(section->mr));
>> +
>> +    if (vfio_listener_skipped_section(section)) {
>> +        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
>> +                section->offset_within_address_space,
>> +                section->offset_within_address_space +
>> +                int128_get64(int128_sub(section->size, int128_one())));
>> +        return;
>> +    }
>> +
>> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +        error_report("%s received unaligned region", __func__);
>> +        return;
>> +    }
>> +
>> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +    end = (section->offset_within_address_space + 
>> int128_get64(section->size)) &
>> +          TARGET_PAGE_MASK;
>> +
>> +    if (iova >= end) {
>> +        return;
>> +    }
>> +
>> +    vaddr = memory_region_get_ram_ptr(section->mr) +
>> +            section->offset_within_region +
>> +            (iova - section->offset_within_address_space);
>> +
>> +    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
>> +            iova, end - 1, vaddr);
>> +
>> +    memory_region_ref(section->mr);
>> +    ret = vfio_dma_map(container, iova, end - iova, vaddr, 
>> section->readonly);
>> +    if (ret) {
>> +        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>> +                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
>> +                     container, iova, end - iova, vaddr, ret);
>> +
>> +        /*
>> +         * On the initfn path, store the first error in the container so we
>> +         * can gracefully fail.  Runtime, there's not much we can do other
>> +         * than throw a hardware error.
>> +         */
>> +        if (!container->iommu_data.type1.initialized) {
>> +            if (!container->iommu_data.type1.error) {
>> +                container->iommu_data.type1.error = ret;
>> +            }
>> +        } else {
>> +            hw_error("vfio: DMA mapping failed, unable to continue");
>> +        }
>> +    }
>> +}
>> +
>> +static void vfio_listener_region_del(MemoryListener *listener,
>> +                                     MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            iommu_data.type1.listener);
>> +    hwaddr iova, end;
>> +    int ret;
>> +
>> +    if (vfio_listener_skipped_section(section)) {
>> +        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
>> +                section->offset_within_address_space,
>> +                section->offset_within_address_space +
>> +                int128_get64(int128_sub(section->size, int128_one())));
>> +        return;
>> +    }
>> +
>> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +        error_report("%s received unaligned region", __func__);
>> +        return;
>> +    }
>> +
>> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +    end = (section->offset_within_address_space + 
>> int128_get64(section->size)) &
>> +          TARGET_PAGE_MASK;
>> +
>> +    if (iova >= end) {
>> +        return;
>> +    }
>> +
>> +    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
>> +            iova, end - 1);
>> +
>> +    ret = vfio_dma_unmap(container, iova, end - iova);
>> +    memory_region_unref(section->mr);
>> +    if (ret) {
>> +        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> +                     "0x%"HWADDR_PRIx") = %d (%m)",
>> +                     container, iova, end - iova, ret);
>> +    }
>> +}
>> +
>> +static MemoryListener vfio_memory_listener = {
>> +    .region_add = vfio_listener_region_add,
>> +    .region_del = vfio_listener_region_del,
>> +};
>> +
>> +static void vfio_listener_release(VFIOContainer *container)
>> +{
>> +    memory_listener_unregister(&container->iommu_data.type1.listener);
>> +}
>> +
>> +static void vfio_kvm_device_add_group(VFIOGroup *group)
>> +{
>> +#ifdef CONFIG_KVM
>> +    struct kvm_device_attr attr = {
>> +        .group = KVM_DEV_VFIO_GROUP,
>> +        .attr = KVM_DEV_VFIO_GROUP_ADD,
>> +        .addr = (uint64_t)(unsigned long)&group->fd,
>> +    };
>> +
>> +    if (!kvm_enabled()) {
>> +        return;
>> +    }
>> +
>> +    if (vfio_kvm_device_fd < 0) {
>> +        struct kvm_create_device cd = {
>> +            .type = KVM_DEV_TYPE_VFIO,
>> +        };
>> +
>> +        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
>> +            DPRINTF("KVM_CREATE_DEVICE: %m\n");
>> +            return;
>> +        }
>> +
>> +        vfio_kvm_device_fd = cd.fd;
>> +    }
>> +
>> +    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> +        error_report("Failed to add group %d to KVM VFIO device: %m",
>> +                     group->groupid);
>> +    }
>> +#endif
>> +}
>> +
>> +static void vfio_kvm_device_del_group(VFIOGroup *group)
>> +{
>> +#ifdef CONFIG_KVM
>> +    struct kvm_device_attr attr = {
>> +        .group = KVM_DEV_VFIO_GROUP,
>> +        .attr = KVM_DEV_VFIO_GROUP_DEL,
>> +        .addr = (uint64_t)(unsigned long)&group->fd,
>> +    };
>> +
>> +    if (vfio_kvm_device_fd < 0) {
>> +        return;
>> +    }
>> +
>> +    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> +        error_report("Failed to remove group %d from KVM VFIO device: %m",
>> +                     group->groupid);
>> +    }
>> +#endif
>> +}
>> +
>> +static int vfio_connect_container(VFIOGroup *group)
>> +{
>> +    VFIOContainer *container;
>> +    int ret, fd;
>> +
>> +    if (group->container) {
>> +        return 0;
>> +    }
>> +
>> +    QLIST_FOREACH(container, &container_list, next) {
>> +        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
>> +            group->container = container;
>> +            QLIST_INSERT_HEAD(&container->group_list, group, 
>> container_next);
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
>> +    if (fd < 0) {
>> +        error_report("vfio: failed to open /dev/vfio/vfio: %m");
>> +        return -errno;
>> +    }
>> +
>> +    ret = ioctl(fd, VFIO_GET_API_VERSION);
>> +    if (ret != VFIO_API_VERSION) {
>> +        error_report("vfio: supported vfio version: %d, "
>> +                     "reported version: %d", VFIO_API_VERSION, ret);
>> +        close(fd);
>> +        return -EINVAL;
>> +    }
>> +
>> +    container = g_malloc0(sizeof(*container));
>> +    container->fd = fd;
>> +
>> +    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %m");
>> +            g_free(container);
>> +            close(fd);
>> +            return -errno;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %m");
>> +            g_free(container);
>> +            close(fd);
>> +            return -errno;
>> +        }
>> +
>> +        container->iommu_data.type1.listener = vfio_memory_listener;
>> +        container->iommu_data.release = vfio_listener_release;
>> +
>> +        memory_listener_register(&container->iommu_data.type1.listener,
>> +                                 &address_space_memory);
>> +
>> +        if (container->iommu_data.type1.error) {
>> +            ret = container->iommu_data.type1.error;
>> +            vfio_listener_release(container);
>> +            g_free(container);
>> +            close(fd);
>> +            error_report("vfio: memory listener initialization failed for 
>> container");
>> +            return ret;
>> +        }
>> +
>> +        container->iommu_data.type1.initialized = true;
>> +
>> +    } else {
>> +        error_report("vfio: No available IOMMU models");
>> +        g_free(container);
>> +        close(fd);
>> +        return -EINVAL;
>> +    }
>> +
>> +    QLIST_INIT(&container->group_list);
>> +    QLIST_INSERT_HEAD(&container_list, container, next);
>> +
>> +    group->container = container;
>> +    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
>> +
>> +    return 0;
>> +}
>> +
>> +static void vfio_disconnect_container(VFIOGroup *group)
>> +{
>> +    VFIOContainer *container = group->container;
>> +
>> +    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
>> +        error_report("vfio: error disconnecting group %d from container",
>> +                     group->groupid);
>> +    }
>> +
>> +    QLIST_REMOVE(group, container_next);
>> +    group->container = NULL;
>> +
>> +    if (QLIST_EMPTY(&container->group_list)) {
>> +        if (container->iommu_data.release) {
>> +            container->iommu_data.release(container);
>> +        }
>> +        QLIST_REMOVE(container, next);
>> +        DPRINTF("vfio_disconnect_container: close container->fd\n");
>> +        close(container->fd);
>> +        g_free(container);
>> +    }
>> +}
>> +
>> +VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler)
>> +{
>> +    VFIOGroup *group;
>> +    char path[32];
>> +    struct vfio_group_status status = { .argsz = sizeof(status) };
>> +
>> +    QLIST_FOREACH(group, &group_list, next) {
>> +        if (group->groupid == groupid) {
>> +            return group;
>> +        }
>> +    }
>> +
>> +    group = g_malloc0(sizeof(*group));
>> +
>> +    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
>> +    group->fd = qemu_open(path, O_RDWR);
>> +    if (group->fd < 0) {
>> +        error_report("vfio: error opening %s: %m", path);
>> +        g_free(group);
>> +        return NULL;
>> +    }
>> +
>> +    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
>> +        error_report("vfio: error getting group status: %m");
>> +        close(group->fd);
>> +        g_free(group);
>> +        return NULL;
>> +    }
>> +
>> +    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
>> +        error_report("vfio: error, group %d is not viable, please ensure "
>> +                     "all devices within the iommu_group are bound to their 
>> "
>> +                     "vfio bus driver.", groupid);
>> +        close(group->fd);
>> +        g_free(group);
>> +        return NULL;
>> +    }
>> +
>> +    group->groupid = groupid;
>> +    QLIST_INIT(&group->device_list);
>> +
>> +    if (vfio_connect_container(group)) {
>> +        error_report("vfio: failed to setup container for group %d", 
>> groupid);
>> +        close(group->fd);
>> +        g_free(group);
>> +        return NULL;
>> +    }
>> +
>> +    if (QLIST_EMPTY(&group_list) && reset_handler) {
>> +        qemu_register_reset(reset_handler, NULL);
>> +    }
>> +
>> +    QLIST_INSERT_HEAD(&group_list, group, next);
>> +
>> +    vfio_kvm_device_add_group(group);
>> +
>> +    return group;
>> +}
>> +
>> +void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler)
>> +{
>> +    if (!QLIST_EMPTY(&group->device_list)) {
>> +        return;
>> +    }
>> +
>> +    vfio_kvm_device_del_group(group);
>> +    vfio_disconnect_container(group);
>> +    QLIST_REMOVE(group, next);
>> +    DPRINTF("vfio_put_group: close group->fd\n");
>> +    close(group->fd);
>> +    g_free(group);
>> +
>> +    if (QLIST_EMPTY(&group_list) && reset_handler) {
>> +        qemu_unregister_reset(reset_handler, NULL);
>> +    }
> 
> The reset_handler stuff needs work.  We can theoretically support both
> PCI and platform devices simultaneously, but this would only allow one
> to register a reset handler and one to remove it (which may not be the
> same one). 

My understanding is there is a global reset handler for all VFIO
devices, registered before the first group is added and unregistered
when the last group is removed from the group list.

This global reset handler, currently named vfio_pci_reset_handler
handles the reset of all (PCI) devices in all groups.

One possibility is to have a derived reset method for PCI class
(currently vfio_pci_hot_reset_multi) and platform class. As such we
could keep the unique global reset handler while handling both PCI wide
reset handling and platform wide reset handling.

The limitation of this is that user-side can't have platform device
specific reset handlers which may be needed in the future. In case this
were needed we would need to register many reset_handlers, possibly per
VFIO platform device. I suggest this would be the topic of a different
patch. What do you think?

> 
>> +}
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 9cf5b84..9e70d68 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1,5 +1,5 @@
>>  /*
>> - * vfio based device assignment support
>> + * vfio based device assignment support - PCI devices
>>   *
>>   * Copyright Red Hat, Inc. 2012
>>   *
>> @@ -40,6 +40,8 @@
>>  #include "sysemu/kvm.h"
>>  #include "sysemu/sysemu.h"
> 
> There were probably some includes that could be cleaned out with the
> split.

Done

>  
>> +#include "vfio-common.h"
>> +
>>  /* #define DEBUG_VFIO */
>>  #ifdef DEBUG_VFIO
>>  #define DPRINTF(fmt, ...) \
> 
> DEBUG_VFIO is duplicated here... move to vfio-common.h
ok done
> 
>> @@ -55,6 +57,8 @@
>>  #define VFIO_ALLOW_KVM_MSI 1
>>  #define VFIO_ALLOW_KVM_MSIX 1
>>  
>> +extern QLIST_HEAD(, VFIOGroup) group_list;
> 
> Define in vfio-common.h?
Done
> 
>> +
>>  struct VFIODevice;
>>  
>>  typedef struct VFIOQuirk {
>> @@ -135,25 +139,6 @@ enum {
>>  
>>  struct VFIOGroup;
>>  
>> -typedef struct VFIOType1 {
>> -    MemoryListener listener;
>> -    int error;
>> -    bool initialized;
>> -} VFIOType1;
>> -
>> -typedef struct VFIOContainer {
>> -    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
>> -    struct {
>> -        /* enable abstraction to support various iommu backends */
>> -        union {
>> -            VFIOType1 type1;
>> -        };
>> -        void (*release)(struct VFIOContainer *);
>> -    } iommu_data;
>> -    QLIST_HEAD(, VFIOGroup) group_list;
>> -    QLIST_ENTRY(VFIOContainer) next;
>> -} VFIOContainer;
>> -
>>  /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map 
>> */
>>  typedef struct VFIOMSIXInfo {
>>      uint8_t table_bar;
>> @@ -200,15 +185,6 @@ typedef struct VFIODevice {
>>      bool rom_read_failed;
>>  } VFIODevice;
>>  
>> -typedef struct VFIOGroup {
>> -    int fd;
>> -    int groupid;
>> -    VFIOContainer *container;
>> -    QLIST_HEAD(, VFIODevice) device_list;
>> -    QLIST_ENTRY(VFIOGroup) next;
>> -    QLIST_ENTRY(VFIOGroup) container_next;
>> -} VFIOGroup;
>> -
>>  typedef struct VFIORomBlacklistEntry {
>>      uint16_t vendor_id;
>>      uint16_t device_id;
>> @@ -234,23 +210,6 @@ static const VFIORomBlacklistEntry romblacklist[] = {
>>  
>>  #define MSIX_CAP_LENGTH 12
>>  
>> -static QLIST_HEAD(, VFIOContainer)
>> -    container_list = QLIST_HEAD_INITIALIZER(container_list);
>> -
>> -static QLIST_HEAD(, VFIOGroup)
>> -    group_list = QLIST_HEAD_INITIALIZER(group_list);
>> -
>> -#ifdef CONFIG_KVM
>> -/*
>> - * We have a single VFIO pseudo device per KVM VM.  Once created it lives
>> - * for the life of the VM.  Closing the file descriptor only drops our
>> - * reference to it and the device's reference to kvm.  Therefore once
>> - * initialized, this file descriptor is only released on QEMU exit and
>> - * we'll re-use it should another vfio device be attached before then.
>> - */
>> -static int vfio_kvm_device_fd = -1;
>> -#endif
>> -
>>  static void vfio_disable_interrupts(VFIODevice *vdev);
>>  static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int 
>> len);
>>  static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>> @@ -2180,183 +2139,6 @@ static void vfio_pci_write_config(PCIDevice *pdev, 
>> uint32_t addr,
>>  }
>>  
>>  /*
>> - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>> - */
>> -static int vfio_dma_unmap(VFIOContainer *container,
>> -                          hwaddr iova, ram_addr_t size)
>> -{
>> -    struct vfio_iommu_type1_dma_unmap unmap = {
>> -        .argsz = sizeof(unmap),
>> -        .flags = 0,
>> -        .iova = iova,
>> -        .size = size,
>> -    };
>> -
>> -    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
>> -        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
>> -        return -errno;
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>> -static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>> -                        ram_addr_t size, void *vaddr, bool readonly)
>> -{
>> -    struct vfio_iommu_type1_dma_map map = {
>> -        .argsz = sizeof(map),
>> -        .flags = VFIO_DMA_MAP_FLAG_READ,
>> -        .vaddr = (__u64)(uintptr_t)vaddr,
>> -        .iova = iova,
>> -        .size = size,
>> -    };
>> -
>> -    if (!readonly) {
>> -        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>> -    }
>> -
>> -    /*
>> -     * Try the mapping, if it fails with EBUSY, unmap the region and try
>> -     * again.  This shouldn't be necessary, but we sometimes see it in
>> -     * the the VGA ROM space.
>> -     */
>> -    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
>> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
>> -         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
>> -        return 0;
>> -    }
>> -
>> -    DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
>> -    return -errno;
>> -}
>> -
>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>> -{
>> -    return !memory_region_is_ram(section->mr) ||
>> -           /*
>> -            * Sizing an enabled 64-bit BAR can cause spurious mappings to
>> -            * addresses in the upper part of the 64-bit address space.  
>> These
>> -            * are never accessed by the CPU and beyond the address width of
>> -            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU 
>> width.
>> -            */
>> -           section->offset_within_address_space & (1ULL << 63);
>> -}
>> -
>> -static void vfio_listener_region_add(MemoryListener *listener,
>> -                                     MemoryRegionSection *section)
>> -{
>> -    VFIOContainer *container = container_of(listener, VFIOContainer,
>> -                                            iommu_data.type1.listener);
>> -    hwaddr iova, end;
>> -    void *vaddr;
>> -    int ret;
>> -
>> -    assert(!memory_region_is_iommu(section->mr));
>> -
>> -    if (vfio_listener_skipped_section(section)) {
>> -        DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
>> -                section->offset_within_address_space,
>> -                section->offset_within_address_space +
>> -                int128_get64(int128_sub(section->size, int128_one())));
>> -        return;
>> -    }
>> -
>> -    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> -                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> -        error_report("%s received unaligned region", __func__);
>> -        return;
>> -    }
>> -
>> -    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> -    end = (section->offset_within_address_space + 
>> int128_get64(section->size)) &
>> -          TARGET_PAGE_MASK;
>> -
>> -    if (iova >= end) {
>> -        return;
>> -    }
>> -
>> -    vaddr = memory_region_get_ram_ptr(section->mr) +
>> -            section->offset_within_region +
>> -            (iova - section->offset_within_address_space);
>> -
>> -    DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
>> -            iova, end - 1, vaddr);
>> -
>> -    memory_region_ref(section->mr);
>> -    ret = vfio_dma_map(container, iova, end - iova, vaddr, 
>> section->readonly);
>> -    if (ret) {
>> -        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
>> -                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
>> -                     container, iova, end - iova, vaddr, ret);
>> -
>> -        /*
>> -         * On the initfn path, store the first error in the container so we
>> -         * can gracefully fail.  Runtime, there's not much we can do other
>> -         * than throw a hardware error.
>> -         */
>> -        if (!container->iommu_data.type1.initialized) {
>> -            if (!container->iommu_data.type1.error) {
>> -                container->iommu_data.type1.error = ret;
>> -            }
>> -        } else {
>> -            hw_error("vfio: DMA mapping failed, unable to continue");
>> -        }
>> -    }
>> -}
>> -
>> -static void vfio_listener_region_del(MemoryListener *listener,
>> -                                     MemoryRegionSection *section)
>> -{
>> -    VFIOContainer *container = container_of(listener, VFIOContainer,
>> -                                            iommu_data.type1.listener);
>> -    hwaddr iova, end;
>> -    int ret;
>> -
>> -    if (vfio_listener_skipped_section(section)) {
>> -        DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
>> -                section->offset_within_address_space,
>> -                section->offset_within_address_space +
>> -                int128_get64(int128_sub(section->size, int128_one())));
>> -        return;
>> -    }
>> -
>> -    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> -                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> -        error_report("%s received unaligned region", __func__);
>> -        return;
>> -    }
>> -
>> -    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> -    end = (section->offset_within_address_space + 
>> int128_get64(section->size)) &
>> -          TARGET_PAGE_MASK;
>> -
>> -    if (iova >= end) {
>> -        return;
>> -    }
>> -
>> -    DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
>> -            iova, end - 1);
>> -
>> -    ret = vfio_dma_unmap(container, iova, end - iova);
>> -    memory_region_unref(section->mr);
>> -    if (ret) {
>> -        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>> -                     "0x%"HWADDR_PRIx") = %d (%m)",
>> -                     container, iova, end - iova, ret);
>> -    }
>> -}
>> -
>> -static MemoryListener vfio_memory_listener = {
>> -    .region_add = vfio_listener_region_add,
>> -    .region_del = vfio_listener_region_del,
>> -};
>> -
>> -static void vfio_listener_release(VFIOContainer *container)
>> -{
>> -    memory_listener_unregister(&container->iommu_data.type1.listener);
>> -}
>> -
>> -/*
>>   * Interrupt setup
>>   */
>>  static void vfio_disable_interrupts(VFIODevice *vdev)
>> @@ -3221,244 +3003,8 @@ static void vfio_pci_reset_handler(void *opaque)
>>      }
>>  }
>>  
>> -static void vfio_kvm_device_add_group(VFIOGroup *group)
>> -{
>> -#ifdef CONFIG_KVM
>> -    struct kvm_device_attr attr = {
>> -        .group = KVM_DEV_VFIO_GROUP,
>> -        .attr = KVM_DEV_VFIO_GROUP_ADD,
>> -        .addr = (uint64_t)(unsigned long)&group->fd,
>> -    };
>> -
>> -    if (!kvm_enabled()) {
>> -        return;
>> -    }
>> -
>> -    if (vfio_kvm_device_fd < 0) {
>> -        struct kvm_create_device cd = {
>> -            .type = KVM_DEV_TYPE_VFIO,
>> -        };
>> -
>> -        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
>> -            DPRINTF("KVM_CREATE_DEVICE: %m\n");
>> -            return;
>> -        }
>> -
>> -        vfio_kvm_device_fd = cd.fd;
>> -    }
>> -
>> -    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> -        error_report("Failed to add group %d to KVM VFIO device: %m",
>> -                     group->groupid);
>> -    }
>> -#endif
>> -}
>> -
>> -static void vfio_kvm_device_del_group(VFIOGroup *group)
>> -{
>> -#ifdef CONFIG_KVM
>> -    struct kvm_device_attr attr = {
>> -        .group = KVM_DEV_VFIO_GROUP,
>> -        .attr = KVM_DEV_VFIO_GROUP_DEL,
>> -        .addr = (uint64_t)(unsigned long)&group->fd,
>> -    };
>> -
>> -    if (vfio_kvm_device_fd < 0) {
>> -        return;
>> -    }
>> -
>> -    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
>> -        error_report("Failed to remove group %d from KVM VFIO device: %m",
>> -                     group->groupid);
>> -    }
>> -#endif
>> -}
>> -
>> -static int vfio_connect_container(VFIOGroup *group)
>> -{
>> -    VFIOContainer *container;
>> -    int ret, fd;
>> -
>> -    if (group->container) {
>> -        return 0;
>> -    }
>> -
>> -    QLIST_FOREACH(container, &container_list, next) {
>> -        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
>> -            group->container = container;
>> -            QLIST_INSERT_HEAD(&container->group_list, group, 
>> container_next);
>> -            return 0;
>> -        }
>> -    }
>> -
>> -    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
>> -    if (fd < 0) {
>> -        error_report("vfio: failed to open /dev/vfio/vfio: %m");
>> -        return -errno;
>> -    }
>> -
>> -    ret = ioctl(fd, VFIO_GET_API_VERSION);
>> -    if (ret != VFIO_API_VERSION) {
>> -        error_report("vfio: supported vfio version: %d, "
>> -                     "reported version: %d", VFIO_API_VERSION, ret);
>> -        close(fd);
>> -        return -EINVAL;
>> -    }
>> -
>> -    container = g_malloc0(sizeof(*container));
>> -    container->fd = fd;
>> -
>> -    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
>> -        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> -        if (ret) {
>> -            error_report("vfio: failed to set group container: %m");
>> -            g_free(container);
>> -            close(fd);
>> -            return -errno;
>> -        }
>> -
>> -        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
>> -        if (ret) {
>> -            error_report("vfio: failed to set iommu for container: %m");
>> -            g_free(container);
>> -            close(fd);
>> -            return -errno;
>> -        }
>> -
>> -        container->iommu_data.type1.listener = vfio_memory_listener;
>> -        container->iommu_data.release = vfio_listener_release;
>> -
>> -        memory_listener_register(&container->iommu_data.type1.listener,
>> -                                 &address_space_memory);
>> -
>> -        if (container->iommu_data.type1.error) {
>> -            ret = container->iommu_data.type1.error;
>> -            vfio_listener_release(container);
>> -            g_free(container);
>> -            close(fd);
>> -            error_report("vfio: memory listener initialization failed for 
>> container");
>> -            return ret;
>> -        }
>> -
>> -        container->iommu_data.type1.initialized = true;
>> -
>> -    } else {
>> -        error_report("vfio: No available IOMMU models");
>> -        g_free(container);
>> -        close(fd);
>> -        return -EINVAL;
>> -    }
>> -
>> -    QLIST_INIT(&container->group_list);
>> -    QLIST_INSERT_HEAD(&container_list, container, next);
>> -
>> -    group->container = container;
>> -    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
>> -
>> -    return 0;
>> -}
>> -
>> -static void vfio_disconnect_container(VFIOGroup *group)
>> -{
>> -    VFIOContainer *container = group->container;
>> -
>> -    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
>> -        error_report("vfio: error disconnecting group %d from container",
>> -                     group->groupid);
>> -    }
>> -
>> -    QLIST_REMOVE(group, container_next);
>> -    group->container = NULL;
>> -
>> -    if (QLIST_EMPTY(&container->group_list)) {
>> -        if (container->iommu_data.release) {
>> -            container->iommu_data.release(container);
>> -        }
>> -        QLIST_REMOVE(container, next);
>> -        DPRINTF("vfio_disconnect_container: close container->fd\n");
>> -        close(container->fd);
>> -        g_free(container);
>> -    }
>> -}
>> -
>> -static VFIOGroup *vfio_get_group(int groupid)
>> -{
>> -    VFIOGroup *group;
>> -    char path[32];
>> -    struct vfio_group_status status = { .argsz = sizeof(status) };
>> -
>> -    QLIST_FOREACH(group, &group_list, next) {
>> -        if (group->groupid == groupid) {
>> -            return group;
>> -        }
>> -    }
>> -
>> -    group = g_malloc0(sizeof(*group));
>> -
>> -    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
>> -    group->fd = qemu_open(path, O_RDWR);
>> -    if (group->fd < 0) {
>> -        error_report("vfio: error opening %s: %m", path);
>> -        g_free(group);
>> -        return NULL;
>> -    }
>> -
>> -    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
>> -        error_report("vfio: error getting group status: %m");
>> -        close(group->fd);
>> -        g_free(group);
>> -        return NULL;
>> -    }
>> -
>> -    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
>> -        error_report("vfio: error, group %d is not viable, please ensure "
>> -                     "all devices within the iommu_group are bound to their 
>> "
>> -                     "vfio bus driver.", groupid);
>> -        close(group->fd);
>> -        g_free(group);
>> -        return NULL;
>> -    }
>> -
>> -    group->groupid = groupid;
>> -    QLIST_INIT(&group->device_list);
>> -
>> -    if (vfio_connect_container(group)) {
>> -        error_report("vfio: failed to setup container for group %d", 
>> groupid);
>> -        close(group->fd);
>> -        g_free(group);
>> -        return NULL;
>> -    }
>> -
>> -    if (QLIST_EMPTY(&group_list)) {
>> -        qemu_register_reset(vfio_pci_reset_handler, NULL);
>> -    }
>> -
>> -    QLIST_INSERT_HEAD(&group_list, group, next);
>> -
>> -    vfio_kvm_device_add_group(group);
>> -
>> -    return group;
>> -}
>> -
>> -static void vfio_put_group(VFIOGroup *group)
>> -{
>> -    if (!QLIST_EMPTY(&group->device_list)) {
>> -        return;
>> -    }
>> -
>> -    vfio_kvm_device_del_group(group);
>> -    vfio_disconnect_container(group);
>> -    QLIST_REMOVE(group, next);
>> -    DPRINTF("vfio_put_group: close group->fd\n");
>> -    close(group->fd);
>> -    g_free(group);
>> -
>> -    if (QLIST_EMPTY(&group_list)) {
>> -        qemu_unregister_reset(vfio_pci_reset_handler, NULL);
>> -    }
>> -}
>> -
>> -static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice 
>> *vdev)
>> +static int vfio_get_device(VFIOGroup *group, const char *name,
>> +                           struct VFIODevice *vdev)
> 
> Why?
ok reverted
> 
>>  {
>>      struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>>      struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
>> @@ -3485,7 +3031,7 @@ static int vfio_get_device(VFIOGroup *group, const 
>> char *name, VFIODevice *vdev)
>>          goto error;
>>      }
>>  
>> -    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
>> +    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
> 
> ???
type correction: irgs -> irqs ;-)
> 
>>              dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>>  
>>      if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
>> @@ -3768,7 +3314,7 @@ static int vfio_initfn(PCIDevice *pdev)
>>      DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
>>              vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
>>  
>> -    group = vfio_get_group(groupid);
>> +    group = vfio_get_group(groupid, vfio_pci_reset_handler);
>>      if (!group) {
>>          error_report("vfio: failed to get group %d", groupid);
>>          return -ENOENT;
>> @@ -3785,7 +3331,7 @@ static int vfio_initfn(PCIDevice *pdev)
>>              pvdev->host.function == vdev->host.function) {
>>  
>>              error_report("vfio: error: device %s is already attached", 
>> path);
>> -            vfio_put_group(group);
>> +            vfio_put_group(group, vfio_pci_reset_handler);
>>              return -EBUSY;
>>          }
>>      }
>> @@ -3793,7 +3339,7 @@ static int vfio_initfn(PCIDevice *pdev)
>>      ret = vfio_get_device(group, path, vdev);
>>      if (ret) {
>>          error_report("vfio: failed to get device %s", path);
>> -        vfio_put_group(group);
>> +        vfio_put_group(group, vfio_pci_reset_handler);
>>          return ret;
>>      }
>>  
>> @@ -3879,7 +3425,7 @@ out_teardown:
>>  out_put:
>>      g_free(vdev->emulated_config_bits);
>>      vfio_put_device(vdev);
>> -    vfio_put_group(group);
>> +    vfio_put_group(group, vfio_pci_reset_handler);
>>      return ret;
>>  }
>>  
>> @@ -3899,7 +3445,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>>      g_free(vdev->emulated_config_bits);
>>      g_free(vdev->rom);
>>      vfio_put_device(vdev);
>> -    vfio_put_group(group);
>> +    vfio_put_group(group, vfio_pci_reset_handler);
>>  }
>>  
>>  static void vfio_pci_reset(DeviceState *dev)
>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
>> new file mode 100644
>> index 0000000..138fb13
>> --- /dev/null
>> +++ b/hw/vfio/platform.c
>> @@ -0,0 +1,381 @@
>> +/*
>> + * vfio based device assignment support - platform devices
>> + *
>> + * Copyright Linaro Limited, 2014
>> + *
>> + * Authors:
>> + *  Kim Phillips <address@hidden>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on vfio based PCI device assignment support:
>> + *  Copyright Red Hat, Inc. 2012
>> + */
>> +
>> +#include <dirent.h>
>> +#include <linux/vfio.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/stat.h>
>> +#include <sys/types.h>
>> +#include <unistd.h>
>> +
>> +#include "config.h"
>> +#include "exec/address-spaces.h"
>> +#include "exec/memory.h"
>> +#include "qemu-common.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/event_notifier.h"
>> +#include "qemu/queue.h"
>> +#include "qemu/range.h"
>> +#include "sysemu/kvm.h"
>> +#include "sysemu/sysemu.h"
>> +#include "hw/qdev-properties.h"
>> +#include "migration/vmstate.h"
>> +#include "hw/hw.h"
>> +#include "hw/sysbus.h"
>> +
>> +#include "vfio-common.h"
>> +
>> +#define DEBUG_VFIO
>> +#ifdef DEBUG_VFIO
>> +#define DPRINTF(fmt, ...) \
>> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
>> +#else
>> +#define DPRINTF(fmt, ...) \
>> +    do { } while (0)
>> +#endif
>> +
>> +/* Extra debugging, trap acceleration paths for more logging */
>> +#define VFIO_ALLOW_MMAP 1
> 
> Maybe there should be a common debug section in vfio-common.h?
> 
done
>> +
>> +#define TYPE_VFIO_PLATFORM "vfio-platform"
>> +
>> +typedef struct VFIORegion {
>> +    off_t fd_offset; /* offset of region within device fd */
>> +    int fd; /* device fd, allows us to pass VFIORegion as opaque data */
>> +    MemoryRegion mem; /* slow, read/write access */
>> +    MemoryRegion mmap_mem; /* direct mapped access */
>> +    void *mmap;
>> +    size_t size;
>> +    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
>> +    uint8_t nr; /* cache the region number for debug */
>> +} VFIORegion;
>> +
>> +typedef struct VFIODevice {
>> +    SysBusDevice sbdev;
>> +    int fd;
>> +    int num_regions;
>> +    VFIORegion *regions;
>> +    QLIST_ENTRY(VFIODevice) next;
>> +    struct VFIOGroup *group;
>> +    char *name;
>> +} VFIODevice;
> 
> I suspect we really need:
> 
> (in vfio-common.h)
> typedef struct VFIODevice {
>       QLIST_ENTRY(VFIODevice) next;
>       struct VFIOGroup *group;
>       enum VFIODeviceType;
>       /* maybe some of the other common stuff */
> } VFIODevice;
> 
> (per pci/platform)
> typedef struct VFIOPlatformDevice {
>       VFIODevice vdev;
>       /* device specific stuff */
> } VFIOPlatformDevice;
> 
> Otherwise I don't see how the device_list on VFIOGroup works.

I am currently experimenting your idea of building a kind a VFIODevice
base class. Before taking care of interrupts this would look like

typedef struct VFIODevice {
    QLIST_ENTRY(VFIODevice) next;
    struct VFIOGroup *group;
    int num_regions;
    VFIORegion **regions;
    int num_irqs;
    char *name;
    int fd;
    int type;
    bool reset_works;
    VFIODeviceOps *ops;
} VFIODevice;

Now I am stuck by the fact I would need this VFIODevice to be a QOM
object. This is needed because I need to pass the VFIODevice * in
memory_region_init_ram_ptr and memory_region_init as a OBJECT(vdev).
This would make possible to factorize quite a lot of code. However I am
currently doubting this is feasible after reading object.h, ie. I would
need double inheritance
VFIOPCIDevice <- PCIDevice
              <- VFIODevice
VFIOPlatformDevice <- SysBusDevice
                   <- VFIODevice

Obviously I may do without VFIODevice being a QOM object but this means
less code factorization. If you have guidance/comments on this, I am
definitively interested.

Also I think we might be able to share a VFIORegion as follows.

typedef struct VFIORegion {
    off_t fd_offset; /* offset of region within device fd */
    int fd; /* device fd, allows us to pass VFIORegion as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap;
    size_t size;
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the region number for debug */
} VFIORegion;

typedef struct VFIOBAR {
    VFIORegion region;
    bool ioport;
    bool mem64;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOBAR;

Overall this makes quite a lot of changes in the pci.c file and
currently I do not know how to test the PCI code changes. This is
another serious issue.

> 
>> +
>> +static int vfio_mmap_region(VFIODevice *vdev, VFIORegion *region,
>> +                         MemoryRegion *mem, MemoryRegion *submem,
>> +                         void **map, size_t size, off_t offset,
>> +                         const char *name)
>> +{
>> +    int ret = 0;
>> +
>> +    if (VFIO_ALLOW_MMAP && size && region->flags & 
>> VFIO_REGION_INFO_FLAG_MMAP) {
>> +        int prot = 0;
>> +        ret = 0;
>> +
>> +        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
>> +            prot |= PROT_READ;
>> +        }
>> +
>> +        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
>> +            prot |= PROT_WRITE;
>> +        }
>> +
>> +        *map = mmap(NULL, size, prot, MAP_SHARED,
>> +                    region->fd, region->fd_offset + offset);
>> +        if (*map == MAP_FAILED) {
>> +            ret = -errno;
>> +            *map = NULL;
>> +            goto error;
>> +        }
>> +
>> +        memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
>> +    }
>> +
>> +    memory_region_add_subregion(mem, offset, submem);
>> +
>> +error:
>> +    return ret;
>> +}
>> +
>> +/*
>> + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
>> + */
>> +static void vfio_region_write(void *opaque, hwaddr addr,
>> +                              uint64_t data, unsigned size)
>> +{
>> +    VFIORegion *region = opaque;
>> +    union {
>> +        uint8_t byte;
>> +        uint16_t word;
>> +        uint32_t dword;
>> +        uint64_t qword;
>> +    } buf;
>> +
>> +    switch (size) {
>> +    case 1:
>> +        buf.byte = data;
>> +        break;
>> +    case 2:
>> +        buf.word = data;
>> +        break;
>> +    case 4:
>> +        buf.dword = data;
>> +        break;
>> +    default:
>> +        hw_error("vfio: unsupported write size, %d bytes\n", size);
>> +        break;
>> +    }
>> +
>> +    if (pwrite(region->fd, &buf, size, region->fd_offset + addr) != size) {
>> +        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
>> +                     __func__, addr, data, size);
>> +    }
>> +
>> +    DPRINTF("%s(region %d+0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
>> +             __func__, region->nr, addr, data, size);
>> +}
>> +
>> +static uint64_t vfio_region_read(void *opaque, hwaddr addr, unsigned size)
>> +{
>> +    VFIORegion *region = opaque;
>> +    union {
>> +        uint8_t byte;
>> +        uint16_t word;
>> +        uint32_t dword;
>> +        uint64_t qword;
>> +    } buf;
>> +    uint64_t data = 0;
>> +
>> +    if (pread(region->fd, &buf, size, region->fd_offset + addr) != size) {
>> +        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
>> +                     __func__, addr, size);
>> +        return (uint64_t)-1;
>> +    }
>> +
>> +    switch (size) {
>> +    case 1:
>> +        data = buf.byte;
>> +        break;
>> +    case 2:
>> +        data = buf.word;
>> +        break;
>> +    case 4:
>> +        data = buf.dword;
>> +        break;
>> +    default:
>> +        hw_error("vfio: unsupported read size, %d bytes\n", size);
>> +        break;
>> +    }
>> +
>> +    DPRINTF("%s(region %d+0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
>> +            __func__, region->nr, addr, size, data);
>> +
>> +    return data;
>> +}
>> +
>> +static const MemoryRegionOps vfio_region_ops = {
>> +    .read = vfio_region_read,
>> +    .write = vfio_region_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +};
>> +
>> +static void vfio_map_region(VFIODevice *vdev, int nr)
>> +{
>> +    VFIORegion *region = &vdev->regions[nr];
>> +    unsigned size = region->size;
>> +    char name[64];
>> +
>> +    snprintf(name, sizeof(name), "VFIO %s region %d", vdev->name, nr);
>> +
>> +    /* A "slow" read/write mapping underlies all regions  */
>> +    memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
>> +                          region, name, size);
>> +
>> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
>> +    if (vfio_mmap_region(vdev, region, &region->mem,
>> +                         &region->mmap_mem, &region->mmap, size, 0, name)) {
>> +        error_report("%s unsupported. Performance may be slow", name);
>> +    }
>> +}
>> +
>> +static int vfio_get_device(VFIOGroup *group, const char *name,
>> +                           struct VFIODevice *vdev)
>> +{
>> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
>> +    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
>> +    int ret, i;
>> +
>> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
>> +    if (ret < 0) {
>> +        error_report("vfio: error getting device %s from group %d: %m",
>> +                     name, group->groupid);
>> +        error_printf("Verify all devices in group %d are bound to the vfio "
>> +                     "platform driver and are not already in use\n",
>> +                     group->groupid);
>> +        return ret;
>> +    }
>> +
>> +    vdev->fd = ret;
>> +    vdev->group = group;
>> +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
>> +
>> +    /* Sanity check device */
>> +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
>> +    if (ret) {
>> +        error_report("vfio: error getting device info: %m");
>> +        goto error;
>> +    }
>> +
>> +    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
>> +            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
>> +
>> +    vdev->regions = g_malloc0(sizeof(VFIORegion) * dev_info.num_regions);
>> +    if (!vdev->regions) {
>> +            error_report("vfio: Error allocating space for %d regions",
>> +                         dev_info.num_regions);
>> +            ret = -ENOMEM;
>> +            goto error;
>> +    }
>> +
>> +    vdev->num_regions = dev_info.num_regions;
>> +
>> +    for (i = 0; i < dev_info.num_regions; i++) {
>> +        reg_info.index = i;
>> +
>> +        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
>> +        if (ret) {
>> +            error_report("vfio: Error getting region %d info: %m", i);
>> +            goto error;
>> +        }
>> +
>> +        DPRINTF("Device %s region %d:\n", name, i);
>> +        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
>> +                (unsigned long)reg_info.size, (unsigned 
>> long)reg_info.offset,
>> +                (unsigned long)reg_info.flags);
>> +
>> +        vdev->regions[i].flags = reg_info.flags;
>> +        vdev->regions[i].size = reg_info.size;
>> +        vdev->regions[i].fd_offset = reg_info.offset;
>> +        vdev->regions[i].fd = vdev->fd;
>> +        vdev->regions[i].nr = i;
>> +    }
> 
> Perhaps more of this could be in common.c if we had a generic VFIODevice
> embedded in a VFIOPlatformDevice or VFIOPCIDevice.  We could fill in
> high-level things like number of regions/irqs, fd, name, etc, leaving
> interpretation and probing of each region/irq to device specific code.
> 

This is indeed current target. See comment above on QOM inheritance
problematic.

>> +
>> +error:
>> +    if (ret) {
>> +        g_free(vdev->regions);
>> +        QLIST_REMOVE(vdev, next);
>> +        vdev->group = NULL;
>> +        close(vdev->fd);
>> +    }
>> +    return ret;
>> +}
>> +
>> +static void vfio_platform_realize(DeviceState *dev, Error **errp)
>> +{
>> +    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
>> +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, sbdev, sbdev);
>> +    VFIOGroup *group;
>> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
>> +    ssize_t len;
>> +    struct stat st;
>> +    int groupid, i, ret;
>> +
>> +    /* TODO: pass device name on command line */
>> +    vdev->name = malloc(PATH_MAX);
>> +    strcpy(vdev->name, "fff51000.ethernet");
>> +
>> +    /* Check that the host device exists */
>> +    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/", 
>> vdev->name);
>> +    if (stat(path, &st) < 0) {
>> +        error_report("vfio: error: no such host device: %s", path);
>> +        return;
>> +    }
>> +
>> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
>> +
>> +    len = readlink(path, iommu_group_path, PATH_MAX);
>> +    if (len <= 0) {
>> +        error_report("vfio: error no iommu_group for device");
>> +        return;
>> +    }
>> +
>> +    iommu_group_path[len] = 0;
>> +    group_name = basename(iommu_group_path);
>> +
>> +    if (sscanf(group_name, "%d", &groupid) != 1) {
>> +        error_report("vfio: error reading %s: %m", path);
>> +        return;
>> +    }
>> +
>> +    DPRINTF("%s(%s) group %d\n", __func__, vdev->name, groupid);
>> +
>> +    group = vfio_get_group(groupid, NULL);
>> +    if (!group) {
>> +        error_report("vfio: failed to get group %d", groupid);
>> +        return;
>> +    }
>> +
>> +    snprintf(path, sizeof(path), "%s", vdev->name);
>> +
>> +    QLIST_FOREACH(pvdev, &group->device_list, next) {
>> +        if (strcmp(pvdev->name, vdev->name) == 0) {
>> +            error_report("vfio: error: device %s is already attached", 
>> path);
>> +            vfio_put_group(group, NULL);
>> +            return;
>> +        }
>> +    }
>> +
>> +    ret = vfio_get_device(group, path, vdev);
>> +    if (ret) {
>> +        error_report("vfio: failed to get device %s", path);
>> +        vfio_put_group(group, NULL);
>> +        return;
>> +    }
>> +
>> +    for (i = 0; i < vdev->num_regions; i++) {
>> +        vfio_map_region(vdev, i);
>> +        sysbus_init_mmio(sbdev, &vdev->regions[i].mem);
>> +    }
>> +}
>> +
>> +static const VMStateDescription vfio_platform_vmstate = {
>> +    .name = TYPE_VFIO_PLATFORM,
>> +    .unmigratable = 1,
>> +};
>> +
>> +static void vfio_platform_dev_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    dc->realize = vfio_platform_realize;
>> +    dc->vmsd = &vfio_platform_vmstate;
>> +    dc->desc = "VFIO-based platform device assignment";
>> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>> +}
>> +
>> +static const TypeInfo vfio_platform_dev_info = {
>> +    .name = TYPE_VFIO_PLATFORM,
>> +    .parent = TYPE_SYS_BUS_DEVICE,
>> +    .instance_size = sizeof(VFIODevice),
>> +    .class_init = vfio_platform_dev_class_init,
>> +};
>> +
>> +static void register_vfio_platform_dev_type(void)
>> +{
>> +    type_register_static(&vfio_platform_dev_info);
>> +}
>> +
>> +type_init(register_vfio_platform_dev_type)
>> diff --git a/hw/vfio/vfio-common.h b/hw/vfio/vfio-common.h
>> new file mode 100644
>> index 0000000..21148ef
>> --- /dev/null
>> +++ b/hw/vfio/vfio-common.h
>> @@ -0,0 +1,55 @@
>> +/*
>> + * common header for vfio based device assignment support
>> + *
>> + * Copyright Red Hat, Inc. 2012
>> + *
>> + * Authors:
>> + *  Alex Williamson <address@hidden>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Based on qemu-kvm device-assignment:
>> + *  Adapted for KVM by Qumranet.
>> + *  Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
>> + *  Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
>> + *  Copyright (C) 2008, Qumranet, Amit Shah (address@hidden)
>> + *  Copyright (C) 2008, Red Hat, Amit Shah (address@hidden)
>> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (address@hidden)
>> + */
>> +
>> +struct VFIODevice;
>> +
>> +struct VFIOGroup;
>> +
>> +typedef struct VFIOType1 {
>> +    MemoryListener listener;
>> +    int error;
>> +    bool initialized;
>> +} VFIOType1;
>> +
>> +typedef struct VFIOContainer {
>> +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
>> +    struct {
>> +        /* enable abstraction to support various iommu backends */
>> +        union {
>> +            VFIOType1 type1;
>> +        };
>> +        void (*release)(struct VFIOContainer *);
>> +    } iommu_data;
>> +    QLIST_HEAD(, VFIOGroup) group_list;
>> +    QLIST_ENTRY(VFIOContainer) next;
>> +} VFIOContainer;
>> +
>> +typedef struct VFIOGroup {
>> +    int fd;
>> +    int groupid;
>> +    VFIOContainer *container;
>> +    QLIST_HEAD(, VFIODevice) device_list;
>> +    QLIST_ENTRY(VFIOGroup) next;
>> +    QLIST_ENTRY(VFIOGroup) container_next;
>> +} VFIOGroup;
>> +
>> +
>> +VFIOGroup *vfio_get_group(int groupid, QEMUResetHandler *reset_handler);
>> +void vfio_put_group(VFIOGroup *group, QEMUResetHandler *reset_handler);
> 
> 
> 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]