qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 2/2] intel-iommu: extend VTD emulation to all


From: Yu Zhang
Subject: Re: [Qemu-devel] [PATCH v3 2/2] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.
Date: Wed, 19 Dec 2018 13:57:43 +0800
User-agent: NeoMutt/20180622-66-b94505

On Tue, Dec 18, 2018 at 11:35:34PM -0500, Michael S. Tsirkin wrote:
> On Wed, Dec 19, 2018 at 11:40:06AM +0800, Yu Zhang wrote:
> > On Tue, Dec 18, 2018 at 09:49:02AM -0500, Michael S. Tsirkin wrote:
> > > On Tue, Dec 18, 2018 at 09:45:41PM +0800, Yu Zhang wrote:
> > > > On Tue, Dec 18, 2018 at 07:43:28AM -0500, Michael S. Tsirkin wrote:
> > > > > On Tue, Dec 18, 2018 at 06:01:16PM +0800, Yu Zhang wrote:
> > > > > > On Tue, Dec 18, 2018 at 05:47:14PM +0800, Yu Zhang wrote:
> > > > > > > On Mon, Dec 17, 2018 at 02:29:02PM +0100, Igor Mammedov wrote:
> > > > > > > > On Wed, 12 Dec 2018 21:05:39 +0800
> > > > > > > > Yu Zhang <address@hidden> wrote:
> > > > > > > > 
> > > > > > > > > A 5-level paging capable VM may choose to use 57-bit IOVA 
> > > > > > > > > address width.
> > > > > > > > > E.g. guest applications may prefer to use its VA as IOVA when 
> > > > > > > > > performing
> > > > > > > > > VFIO map/unmap operations, to avoid the burden of managing 
> > > > > > > > > the IOVA space.
> > > > > > > > > 
> > > > > > > > > This patch extends the current vIOMMU logic to cover the 
> > > > > > > > > extended address
> > > > > > > > > width. When creating a VM with 5-level paging feature, one 
> > > > > > > > > can choose to
> > > > > > > > > create a virtual VTD with 5-level paging capability, with 
> > > > > > > > > configurations
> > > > > > > > > like "-device intel-iommu,x-aw-bits=57".
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Yu Zhang <address@hidden>
> > > > > > > > > Reviewed-by: Peter Xu <address@hidden>
> > > > > > > > > ---
> > > > > > > > > Cc: "Michael S. Tsirkin" <address@hidden>
> > > > > > > > > Cc: Marcel Apfelbaum <address@hidden>
> > > > > > > > > Cc: Paolo Bonzini <address@hidden>
> > > > > > > > > Cc: Richard Henderson <address@hidden>
> > > > > > > > > Cc: Eduardo Habkost <address@hidden>
> > > > > > > > > Cc: Peter Xu <address@hidden>
> > > > > > > > > ---
> > > > > > > > >  hw/i386/intel_iommu.c          | 53 
> > > > > > > > > ++++++++++++++++++++++++++++++++----------
> > > > > > > > >  hw/i386/intel_iommu_internal.h | 10 ++++++--
> > > > > > > > >  include/hw/i386/intel_iommu.h  |  1 +
> > > > > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > > > > > index 0e88c63..871110c 100644
> > > > > > > > > --- a/hw/i386/intel_iommu.c
> > > > > > > > > +++ b/hw/i386/intel_iommu.c
> > > > > > > > > @@ -664,16 +664,16 @@ static inline bool 
> > > > > > > > > vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
> > > > > > > > >  
> > > > > > > > >  /*
> > > > > > > > >   * Rsvd field masks for spte:
> > > > > > > > > - *     Index [1] to [4] 4k pages
> > > > > > > > > - *     Index [5] to [8] large pages
> > > > > > > > > + *     Index [1] to [5] 4k pages
> > > > > > > > > + *     Index [6] to [10] large pages
> > > > > > > > >   */
> > > > > > > > > -static uint64_t vtd_paging_entry_rsvd_field[9];
> > > > > > > > > +static uint64_t vtd_paging_entry_rsvd_field[11];
> > > > > > > > >  
> > > > > > > > >  static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t 
> > > > > > > > > level)
> > > > > > > > >  {
> > > > > > > > >      if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
> > > > > > > > >          /* Maybe large page */
> > > > > > > > > -        return slpte & vtd_paging_entry_rsvd_field[level + 
> > > > > > > > > 4];
> > > > > > > > > +        return slpte & vtd_paging_entry_rsvd_field[level + 
> > > > > > > > > 5];
> > > > > > > > >      } else {
> > > > > > > > >          return slpte & vtd_paging_entry_rsvd_field[level];
> > > > > > > > >      }
> > > > > > > > > @@ -3127,6 +3127,8 @@ static void vtd_init(IntelIOMMUState *s)
> > > > > > > > >               VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
> > > > > > > > >      if (s->aw_bits == VTD_AW_48BIT) {
> > > > > > > > >          s->cap |= VTD_CAP_SAGAW_48bit;
> > > > > > > > > +    } else if (s->aw_bits == VTD_AW_57BIT) {
> > > > > > > > > +        s->cap |= VTD_CAP_SAGAW_57bit | VTD_CAP_SAGAW_48bit;
> > > > > > > > >      }
> > > > > > > > >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> > > > > > > > >      s->haw_bits = cpu->phys_bits;
> > > > > > > > > @@ -3139,10 +3141,12 @@ static void vtd_init(IntelIOMMUState 
> > > > > > > > > *s)
> > > > > > > > >      vtd_paging_entry_rsvd_field[2] = 
> > > > > > > > > VTD_SPTE_PAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > > > > >      vtd_paging_entry_rsvd_field[3] = 
> > > > > > > > > VTD_SPTE_PAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > > > > >      vtd_paging_entry_rsvd_field[4] = 
> > > > > > > > > VTD_SPTE_PAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > > > > -    vtd_paging_entry_rsvd_field[5] = 
> > > > > > > > > VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
> > > > > > > > > -    vtd_paging_entry_rsvd_field[6] = 
> > > > > > > > > VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > > > > > -    vtd_paging_entry_rsvd_field[7] = 
> > > > > > > > > VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > > > > > -    vtd_paging_entry_rsvd_field[8] = 
> > > > > > > > > VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[5] = 
> > > > > > > > > VTD_SPTE_PAGE_L5_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[6] = 
> > > > > > > > > VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[7] = 
> > > > > > > > > VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[8] = 
> > > > > > > > > VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[9] = 
> > > > > > > > > VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > > > > +    vtd_paging_entry_rsvd_field[10] = 
> > > > > > > > > VTD_SPTE_LPAGE_L5_RSVD_MASK(s->haw_bits);
> > > > > > > > >  
> > > > > > > > >      if (x86_iommu->intr_supported) {
> > > > > > > > >          s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
> > > > > > > > > @@ -3241,6 +3245,23 @@ static AddressSpace 
> > > > > > > > > *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> > > > > > > > >      return &vtd_as->as;
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > > +static bool host_has_la57(void)
> > > > > > > > > +{
> > > > > > > > > +    uint32_t ecx, unused;
> > > > > > > > > +
> > > > > > > > > +    host_cpuid(7, 0, &unused, &unused, &ecx, &unused);
> > > > > > > > > +    return ecx & CPUID_7_0_ECX_LA57;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static bool guest_has_la57(void)
> > > > > > > > > +{
> > > > > > > > > +    CPUState *cs = first_cpu;
> > > > > > > > > +    X86CPU *cpu = X86_CPU(cs);
> > > > > > > > > +    CPUX86State *env = &cpu->env;
> > > > > > > > > +
> > > > > > > > > +    return env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_LA57;
> > > > > > > > > +}
> > > > > > > > another direct access to CPU fields,
> > > > > > > > I'd suggest to set this value when iommu is created
> > > > > > > > i.e. add 'la57' property and set from iommu owner.
> > > > > > > > 
> > > > > > > 
> > > > > > > Sorry, do you mean "-device intel-iommu,la57"? I think we do not 
> > > > > > > need
> > > > > > > that, because a 5-level capable vIOMMU can be created with 
> > > > > > > properties
> > > > > > > like "-device intel-iommu,x-aw-bits=57". 
> > > > > > > 
> > > > > > > The guest CPU fields are checked to make sure the VM has LA57 CPU 
> > > > > > > feature,
> > > > > > > because I believe there shall be no 5-level IOMMU on platforms 
> > > > > > > without LA57
> > > > > > > CPUs. 
> > > > > 
> > > > > I don't necessarily see why these need to be connected.
> > > > > If yes pls add code to explain.
> > > > 
> > > > Sorry, do you mean the VM shall be able to see a 5-level IOMMU even it 
> > > > does not
> > > > have LA57 feature? I do not see any direct connection when asked to 
> > > > enable a 5-level
> > > > vIOMMU at first, but I was told(and checked) that DPDK in the VM may 
> > > > choose a VA
> > > > value as an IOVA.
> > > 
> > > Right but then that doesn't work on all hosts either.
> > 
> > Oh, the host already has 5-level IOMMU now. So I think DPDK in native shall 
> > work with that.
> > 
> > > 
> > > > And if guest has LA57, we should create a 5-level vIOMMU to the VM.
> > > > But if the VM even does not have LA57, any specific reason we should 
> > > > give it a 5-level
> > > > vIOMMU?
> > > 
> > > So the example you give is VTD address width < CPU aw. That is known
> > > to be problematic for dpdk but not for other software and maybe dpdk
> > > will learns how to cope. Given such hosts exist it might be
> > > useful to support this at least for debugging.
> > > 
> > > Are there reasons to worry about VTD > CPU?
> > 
> > Well, I am not that worried(no usage case is one concern). I am OK to drop 
> > the guest check. :)
> > 
> > > 
> > > 
> > > > > 
> > > > > 
> > > > > > > > >  static bool vtd_decide_config(IntelIOMMUState *s, Error 
> > > > > > > > > **errp)
> > > > > > > > >  {
> > > > > > > > >      X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
> > > > > > > > > @@ -3267,11 +3288,19 @@ static bool 
> > > > > > > > > vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > > > > > >          }
> > > > > > > > >      }
> > > > > > > > >  
> > > > > > > > > -    /* Currently only address widths supported are 39 and 48 
> > > > > > > > > bits */
> > > > > > > > > +    /* Currently address widths supported are 39, 48, and 57 
> > > > > > > > > bits */
> > > > > > > > >      if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > > > > > > -        (s->aw_bits != VTD_AW_48BIT)) {
> > > > > > > > > -        error_setg(errp, "Supported values for x-aw-bits 
> > > > > > > > > are: %d, %d",
> > > > > > > > > -                   VTD_AW_39BIT, VTD_AW_48BIT);
> > > > > > > > > +        (s->aw_bits != VTD_AW_48BIT) &&
> > > > > > > > > +        (s->aw_bits != VTD_AW_57BIT)) {
> > > > > > > > > +        error_setg(errp, "Supported values for x-aw-bits 
> > > > > > > > > are: %d, %d, %d",
> > > > > > > > > +                   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > > > > > > +        return false;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > > > > > > +        !(host_has_la57() && guest_has_la57())) {
> > > > > > > > Does iommu supposed to work in TCG mode?
> > > > > > > > If yes then why it should care about host_has_la57()?
> > > > > > > > 
> > > > > > > 
> > > > > > > Hmm... I did not take TCG mode into consideration. And 
> > > > > > > host_has_la57() is
> > > > > > > used to guarantee the host have la57 feature so that iommu 
> > > > > > > shadowing works
> > > > > > > for device assignment.
> > > > > > > 
> > > > > > > I guess iommu shall work in TCG mode(though I am not quite sure 
> > > > > > > about this).
> > > > > > > But I do not have any usage case of a 5-level vIOMMU in TCG in 
> > > > > > > mind. So maybe
> > > > > > > we can:
> > > > > > > 1> check the 'ms->accel' in vtd_decide_config() and do not care 
> > > > > > > about host
> > > > > > > capability if it is TCG.
> > > > > > 
> > > > > > For choice 1, kvm_enabled() might be used instead of ms->accel. 
> > > > > > Thanks Peter
> > > > > > for the remind. :)
> > > > > 
> > > > > 
> > > > > This needs a big comment with an explanation though.
> > > > > And probably a TODO to make it work under TCG ...
> > > > > 
> > > > 
> > > > Thanks, Michael. For choice 1, I believe it should work for TCG(will 
> > > > need test
> > > > though), and the condition would be sth. like:
> > > > 
> > > >     if ((s->aw_bits == VTD_AW_57BIT) &&
> > > >         kvm_enabled() &&
> > > >         !host_has_la57())  {
> > > > 
> > > > As you can see, though I remove the check of guest_has_la57(), I still 
> > > > kept the
> > > > check against host when KVM is enabled. I'm still ready to be convinced 
> > > > for any
> > > > requirement why we do not need the guest check. :) 
> > > 
> > > 
> > > okay but then (repeating myself, sorry) pls add a comment that explains
> > > what happens if you do not add this limitation.
> > 
> > How about below comments?
> >     /*
> >      * For KVM guests, the host capability of LA57 shall be available,
> 
> So why is host CPU LA57 necessary for shadowing? Could you explain pls?

Oh, let me try to explain the background here. :)

Currently, vIOMMU in qemu does not have logic to check against the hardware
IOMMU capability. E.g. when we create an IOMMU with 48 bit DMA address width,
qemu does not check if any physical IOMMU has such support. And the shadow
IOMMU logic will have problem if host IOMMU only supports 39 bit IOVA. And
we will have the same problem when it comes to 57 bit IOVA.

My previous discussion with Peter Xu reached an agreement that for now, we
just use the host cpu capability as a reference when trying to create a 5-level
vIOMMU, because 57 bit IOMMU hardware will not come until ICX platform(which
includes LA57). 

And the final correct solution should be to enumerate the capabilities of
hardware IOMMUs used by the assigned device, and reject if any dismatch is
found.

Maybe I should add a TODO in above comments, give the background explaination.

> 
> > so
> >      * that iommu shadowing works for device assignment scenario. But for
> >      * TCG mode, we do not need such restriction.
> >      */
> > 
> > BTW, I just tested the TCG mode, it works(with restriction of host 
> > capability removed).
> > 
> > > 
> > > 
> > > > > > > 2> Or, we can choose to keep as it is, and add the check when 
> > > > > > > 5-level paging
> > > > > > > vIOMMU does have usage in TCG?
> > > > > > > 
> > > > > > > But as to the check of guest capability, I still believe it is 
> > > > > > > necessary. As
> > > > > > > said, a VM without LA57 feature shall not see a VT-d with 5-level 
> > > > > > > IOMMU.
> > > > > > > 
> > > > > > > > > +        error_setg(errp, "Do not support 57-bit DMA address, 
> > > > > > > > > unless both "
> > > > > > > > > +                         "host and guest are capable of 
> > > > > > > > > 5-level paging");
> > > > > > > > >          return false;
> > > > > > > > >      }
> > > > > > > > >  
> > > > > > > > > diff --git a/hw/i386/intel_iommu_internal.h 
> > > > > > > > > b/hw/i386/intel_iommu_internal.h
> > > > > > > > > index d084099..2b29b6f 100644
> > > > > > > > > --- a/hw/i386/intel_iommu_internal.h
> > > > > > > > > +++ b/hw/i386/intel_iommu_internal.h
> > > > > > > > > @@ -114,8 +114,8 @@
> > > > > > > > >                                       
> > > > > > > > > VTD_INTERRUPT_ADDR_FIRST + 1)
> > > > > > > > >  
> > > > > > > > >  /* The shift of source_id in the key of IOTLB hash table */
> > > > > > > > > -#define VTD_IOTLB_SID_SHIFT         36
> > > > > > > > > -#define VTD_IOTLB_LVL_SHIFT         52
> > > > > > > > > +#define VTD_IOTLB_SID_SHIFT         45
> > > > > > > > > +#define VTD_IOTLB_LVL_SHIFT         61
> > > > > > > > >  #define VTD_IOTLB_MAX_SIZE          1024    /* Max size of 
> > > > > > > > > the hash table */
> > > > > > > > >  
> > > > > > > > >  /* IOTLB_REG */
> > > > > > > > > @@ -212,6 +212,8 @@
> > > > > > > > >  #define VTD_CAP_SAGAW_39bit         (0x2ULL << 
> > > > > > > > > VTD_CAP_SAGAW_SHIFT)
> > > > > > > > >   /* 48-bit AGAW, 4-level page-table */
> > > > > > > > >  #define VTD_CAP_SAGAW_48bit         (0x4ULL << 
> > > > > > > > > VTD_CAP_SAGAW_SHIFT)
> > > > > > > > > + /* 57-bit AGAW, 5-level page-table */
> > > > > > > > > +#define VTD_CAP_SAGAW_57bit         (0x8ULL << 
> > > > > > > > > VTD_CAP_SAGAW_SHIFT)
> > > > > > > > >  
> > > > > > > > >  /* IQT_REG */
> > > > > > > > >  #define VTD_IQT_QT(val)             (((val) >> 4) & 
> > > > > > > > > 0x7fffULL)
> > > > > > > > > @@ -379,6 +381,8 @@ typedef union VTDInvDesc VTDInvDesc;
> > > > > > > > >          (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > >  #define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
> > > > > > > > >          (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > > +#define VTD_SPTE_PAGE_L5_RSVD_MASK(aw) \
> > > > > > > > > +        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > >  #define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
> > > > > > > > >          (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > >  #define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
> > > > > > > > > @@ -387,6 +391,8 @@ typedef union VTDInvDesc VTDInvDesc;
> > > > > > > > >          (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | 
> > > > > > > > > VTD_SL_IGN_COM))
> > > > > > > > >  #define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
> > > > > > > > >          (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > > +#define VTD_SPTE_LPAGE_L5_RSVD_MASK(aw) \
> > > > > > > > > +        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > > > >  
> > > > > > > > >  /* Information about page-selective IOTLB invalidate */
> > > > > > > > >  struct VTDIOTLBPageInvInfo {
> > > > > > > > > diff --git a/include/hw/i386/intel_iommu.h 
> > > > > > > > > b/include/hw/i386/intel_iommu.h
> > > > > > > > > index 820451c..7474c4f 100644
> > > > > > > > > --- a/include/hw/i386/intel_iommu.h
> > > > > > > > > +++ b/include/hw/i386/intel_iommu.h
> > > > > > > > > @@ -49,6 +49,7 @@
> > > > > > > > >  #define DMAR_REG_SIZE               0x230
> > > > > > > > >  #define VTD_AW_39BIT                39
> > > > > > > > >  #define VTD_AW_48BIT                48
> > > > > > > > > +#define VTD_AW_57BIT                57
> > > > > > > > >  #define VTD_ADDRESS_WIDTH           VTD_AW_39BIT
> > > > > > > > >  #define VTD_HAW_MASK(aw)            ((1ULL << (aw)) - 1)
> > > > > > > > >  
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > B.R.
> > > > > > > Yu
> > > > > > > 
> > > > 
> > > > B.R.
> > > > Yu
> > > 
> > 
> > B.R.
> > Yu

B.R.
Yu



reply via email to

[Prev in Thread] Current Thread [Next in Thread]