[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH RFC v2] IOMMU: Add Support to VFIO devices with
From: Aviv B.D.
Subject: Re: [Qemu-devel] [PATCH RFC v2] IOMMU: Add Support to VFIO devices with vIOMMU present
Date: Sat, 16 Apr 2016 18:40:13 +0300
See my comments below,
Thanks,
Aviv.
On Mon, Apr 11, 2016 at 11:25 PM, Alex Williamson
<address@hidden> wrote:
>
> Some more detailed comments now that I have some faith that the host
> IOMMU domain is working correctly...
>
> On Sat, 9 Apr 2016 21:03:38 +0300
> "Aviv B.D." <address@hidden> wrote:
>
>> From: "Aviv Ben-David" <address@hidden>
>> Date: Tue, 23 Feb 2016 00:24:54 +0200
>> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present
>>
>> * Fix a bug that prevents qemu from starting up when both a vIOMMU and a
>> VFIO device are present.
>> * Advertise the Caching Mode (CM) capability in the IOMMU cap register.
>> * Register every VFIO device with the IOMMU state.
>> * On page cache invalidation in the vIOMMU, check whether the domain belongs
>> to a VFIO device and mirror the guest requests to the host.
>>
>> Changes from previous versions:
>> * remove the assumption that the cache is not cleared
>> * fix a lock-up under high load.
>> * refactor vtd_get_did_dev to return a success/failure code, and the actual
>> domain_id via an argument.
>>
>> Tested only on network cards (also with multiple cards at once).
>>
>> Signed-off-by: Aviv Ben-David <address@hidden>
>> ---
>> hw/i386/intel_iommu.c | 113
>> +++++++++++++++++++++++++++++++++++------
>> hw/i386/intel_iommu_internal.h | 3 ++
>> hw/vfio/common.c | 12 +++--
>> include/exec/memory.h | 8 ++-
>> include/hw/i386/intel_iommu.h | 4 ++
>> include/hw/vfio/vfio-common.h | 1 +
>> 6 files changed, 121 insertions(+), 20 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 347718f..a568181 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
>> VTD_DBGBIT(CSR);
>> #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>> #endif
>>
>> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>> + uint8_t devfn, VTDContextEntry *ce);
>> +
>> static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>> uint64_t wmask, uint64_t w1cmask)
>> {
>> @@ -126,6 +129,22 @@ static uint32_t
>> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>> return new_val;
>> }
>>
>> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
>> uint8_t devfn, uint16_t * domain_id)
>> +{
>> + VTDContextEntry ce;
>> + int ret_fr;
>> +
>> + assert(domain_id);
>> +
>> + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
>> + if (ret_fr){
>> + return -1;
>> + }
>> +
>> + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi);
>> + return 0;
>> +}
>> +
>> static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>> uint64_t clear, uint64_t mask)
>> {
>> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
>> uint32_t level)
>> /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
>> * of the translation, can be used for deciding the size of large page.
>> */
>> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool
>> is_write,
>> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa,
>> IOMMUAccessPermissions is_write,
>
> "is_write" is binary, yes/no, IOMMUAccessPermissions clearly has more
> states. This should change to "flags" or something and should use
> existing IOMMUAccessFlags rather than defining something new. This
> should be done in a separate patch that doesn't introduce new
> functionality otherwise.
OK, I will do it.
>
>> uint64_t *slptep, uint32_t *slpte_level,
>> bool *reads, bool *writes)
>> {
>> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce,
>> uint64_t gpa, bool is_write,
>> }
>>
>> /* FIXME: what is the Atomics request here? */
>> - access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
>> + switch(is_write){
>> + case IOMMU_WRITE:
>> + access_right_check = VTD_SL_W;
>> + break;
>> + case IOMMU_READ:
>> + access_right_check = VTD_SL_R;
>> + break;
>> + case IOMMU_ANY:
>> + access_right_check = VTD_SL_R | VTD_SL_W;
>> + break;
>> + default:
>> + assert(0);
>> + }
>>
>> while (true) {
>> offset = vtd_gpa_level_offset(gpa, level);
>> @@ -711,9 +742,9 @@ static int
>> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>> }
>>
>> if (!vtd_context_entry_present(ce)) {
>> - VTD_DPRINTF(GENERAL,
>> + /*VTD_DPRINTF(GENERAL,
>> "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
>> - "is not present", devfn, bus_num);
>> + "is not present", devfn, bus_num);*/
>
>
> Leftover debug?
Yes :/
I'll remove them...
>
>> return -VTD_FR_CONTEXT_ENTRY_P;
>> } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>> (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
>> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>> * @entry: IOMMUTLBEntry that contain the addr to be translated and result
>> */
>> static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> - uint8_t devfn, hwaddr addr, bool
>> is_write,
>> + uint8_t devfn, hwaddr addr,
>> IOMMUAccessPermissions is_write,
>> IOMMUTLBEntry *entry)
>> {
>> IntelIOMMUState *s = vtd_as->iommu_state;
>> @@ -848,12 +879,14 @@ static void
>> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
>> if (ret_fr) {
>> ret_fr = -ret_fr;
>> - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
>> + if (is_write != IOMMU_ANY){
>
> Is this debugging as well? Seems like this hides the majority of
> faults that might occur.
No, this is actually the purpose of IOMMU_ANY: to suppress error
reporting in the translate path.
The guest kernel may issue an invalidation for a range of consecutive
pages, some of which may not be present.
>
>> + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> + VTD_DPRINTF(FLOG, "fault processing is disabled for DMA
>> "
>> "requests through this context-entry "
>> "(with FPD Set)");
>> - } else {
>> - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> + } else {
>> + vtd_report_dmar_fault(s, source_id, addr, ret_fr,
>> is_write);
>> + }
>> }
>> return;
>> }
>> @@ -870,11 +903,13 @@ static void
>> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>> &reads, &writes);
>> if (ret_fr) {
>> ret_fr = -ret_fr;
>> - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA
>> requests "
>> + if (is_write != IOMMU_ANY){
>
> Here as well, why only fault non-RW entries?
Same as above. Maybe the name IOMMU_ANY is misleading and should be
something like IOMMU_NO_FAIL...
>
>> + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
>> + VTD_DPRINTF(FLOG, "fault processing is disabled for
>> DMA requests "
>> "through this context-entry (with FPD Set)");
>> - } else {
>> - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> + } else {
>> + vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
>> + }
>> }
>> return;
>> }
>> @@ -1016,18 +1051,58 @@ static void
>> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>> &domain_id);
>> }
>>
>> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
>> uint16_t domain_id,
>> + hwaddr addr, uint8_t am)
>> +{
>> + VFIOGuestIOMMU * giommu;
>> +
>> + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
>> + VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>> VTDAddressSpace, iommu);
>> + uint16_t vfio_domain_id;
>> + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
>> vtd_as->devfn, &vfio_domain_id);
>> + int i=0;
>> + if (!ret && domain_id == vfio_domain_id){
>> + IOMMUTLBEntry entry;
>> +
>> + /* do vfio unmap */
>> + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr,
>> am);
>> + entry.target_as = NULL;
>> + entry.iova = addr & VTD_PAGE_MASK_4K;
>> + entry.translated_addr = 0;
>> + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
>> + entry.perm = IOMMU_NONE;
>> + memory_region_notify_iommu(giommu->iommu, entry);
>> +
>> + /* do vfio map */
>> + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
>> + /* call to vtd_iommu_translate */
>> + for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
>> + IOMMUTLBEntry entry =
>> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY);
>> + if (entry.perm != IOMMU_NONE){
>> + memory_region_notify_iommu(giommu->iommu, entry);
>> + }
>> + }
>> + }
>> + }
>> +}
>> +
>> static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t
>> domain_id,
>> hwaddr addr, uint8_t am)
>> {
>> VTDIOTLBPageInvInfo info;
>>
>> assert(am <= VTD_MAMV);
>> +
>> info.domain_id = domain_id;
>> info.addr = addr;
>> info.mask = ~((1 << am) - 1);
>> +
>> g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
>> +
>> + vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
>
> Why is this vfio related and why does it need to know about giommus?
> That's vfio private data. Notifies need to happen regardless of
> whether there's a vfio device attached or not. It seems like this is
> just filling a gap that current VT-d code doesn't notify everywhere it
> needs to, but it shouldn't know about vfio.
Noted, I'll try to change them.
>
>> }
>>
>> +
>> /* Flush IOTLB
>> * Returns the IOTLB Actual Invalidation Granularity.
>> * @val: the content of the IOTLB_REG
>> @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>> }
>>
>> static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
>> - bool is_write)
>> + IOMMUAccessPermissions is_write)
>> {
>> VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
>> IntelIOMMUState *s = vtd_as->iommu_state;
>> @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = {
>> DEFINE_PROP_END_OF_LIST(),
>> };
>>
>> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
>> +{
>> + VTDAddressSpace *vtd_as = container_of(giommu->iommu,
>> VTDAddressSpace, iommu);
>> + IntelIOMMUState *s = vtd_as->iommu_state;
>> +
>> + QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
>> +}
>
> This function shouldn't be needed.
>
>>
>> VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>> {
>> @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s)
>> s->iq_last_desc_type = VTD_INV_DESC_NONE;
>> s->next_frcd_reg = 0;
>> s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
>> - VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
>> + VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS|
>> + VTD_CAP_CM;
>
> This should be a separate patch as well.
Noted.
>
>> s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>>
>> vtd_reset_context_cache(s);
>> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
>> index e5f514c..102e9a5 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -190,6 +190,7 @@
>> #define VTD_CAP_MAMV (VTD_MAMV << 48)
>> #define VTD_CAP_PSI (1ULL << 39)
>> #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35))
>> +#define VTD_CAP_CM (1ULL << 7)
>>
>> /* Supported Adjusted Guest Address Widths */
>> #define VTD_CAP_SAGAW_SHIFT 8
>> @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>> #define VTD_PAGE_SHIFT_1G 30
>> #define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
>>
>> +#define VTD_PAGE_MASK(shift) (~((1ULL << (shift)) - 1))
>> +
>> struct VTDRootEntry {
>> uint64_t val;
>> uint64_t rsvd;
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 607ec70..98c8d67 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -32,6 +32,9 @@
>> #include "sysemu/kvm.h"
>> #include "trace.h"
>>
>> +#include "hw/sysbus.h"
>> +#include "hw/i386/intel_iommu.h"
>> +
>> struct vfio_group_head vfio_group_list =
>> QLIST_HEAD_INITIALIZER(vfio_group_list);
>> struct vfio_as_head vfio_address_spaces =
>> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void
>> *data)
>> out:
>> rcu_read_unlock();
>> }
>> -
>> +#if 0
>> static hwaddr vfio_container_granularity(VFIOContainer *container)
>> {
>> return (hwaddr)1 << ctz64(container->iova_pgsizes);
>> }
>> -
>> +#endif
>> static void vfio_listener_region_add(MemoryListener *listener,
>> MemoryRegionSection *section)
>> {
>> @@ -344,6 +347,7 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>> iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> llend = int128_make64(section->offset_within_address_space);
>> llend = int128_add(llend, section->size);
>> + llend = int128_add(llend, int128_exts64(-1));
>> llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>>
>> if (int128_ge(int128_make64(iova), llend)) {
>> @@ -381,11 +385,13 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>> giommu->n.notify = vfio_iommu_map_notify;
>> QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>>
>> + vtd_register_giommu(giommu);
>> memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
>> +#if 0
>> memory_region_iommu_replay(giommu->iommu, &giommu->n,
>> vfio_container_granularity(container),
>> false);
>> -
>> +#endif
>
> AFAICT, none of the above vfio changes should be required. The
> overflow is already fixed in qemu.git, the giommu registration
> shouldn't be necessary, the replay is probably not used, but shouldn't
> be a problem either. Not that there aren't vfio issues, but I think
> they're internal, like how pages are accounted and map/unmap efficiency.
>
>> return;
>> }
>>
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index 2de7898..0e814ab 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -146,10 +146,14 @@ struct MemoryRegionOps {
>> };
>>
>> typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
>> -
>> +typedef enum IOMMUAccessPermissions{
>> + IOMMU_READ = 0,
>> + IOMMU_WRITE = 1,
>> + IOMMU_ANY = 2
>> +} IOMMUAccessPermissions;
>> struct MemoryRegionIOMMUOps {
>> /* Return a TLB entry that contains a given address. */
>> - IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool
>> is_write);
>> + IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr,
>> IOMMUAccessPermissions is_write);
>> };
>>
>> typedef struct CoalescedMemoryRange CoalescedMemoryRange;
>> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
>> index b024ffa..22f3f83 100644
>> --- a/include/hw/i386/intel_iommu.h
>> +++ b/include/hw/i386/intel_iommu.h
>> @@ -23,6 +23,7 @@
>> #define INTEL_IOMMU_H
>> #include "hw/qdev.h"
>> #include "sysemu/dma.h"
>> +#include "hw/vfio/vfio-common.h"
>>
>> #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>> #define INTEL_IOMMU_DEVICE(obj) \
>> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>> MemoryRegionIOMMUOps iommu_ops;
>> GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by
>> PCIBus* reference */
>> VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects
>> indexed by bus number */
>> +
>> + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>> };
>>
>> /* Find the VTD Address space associated with the given bus pointer,
>> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>> */
>> VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int
>> devfn);
>>
>> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>> #endif
>
> Needing to know anything about vfio is an indication that this
> shouldn't be necessary.
>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f037f3c..9225ba3 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU {
>> MemoryRegion *iommu;
>> Notifier n;
>> QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>> + QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
>> } VFIOGuestIOMMU;
>
> This is clearly a layering violation, vt-d should not be managing a
> list on a vfio data structure, especially one that it shouldn't even
> have access to. Thanks,
As above, I'll try to separate them.
>
> Alex