[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH RFC 04/10] intel_iommu: Second Stage Access Dirty bit support
From: |
Jason Wang |
Subject: |
Re: [PATCH RFC 04/10] intel_iommu: Second Stage Access Dirty bit support |
Date: |
Fri, 29 Apr 2022 10:26:21 +0800 |
On Fri, Apr 29, 2022 at 5:14 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> IOMMU advertises Access/Dirty bits if the extended capability
> DMAR register reports it (ECAP, mnemonic ECAP.SSADS albeit it used
> to be known as SLADS before). The first stage table, though, has no bit for
> advertising Access/Dirty, unless referenced via a scalable-mode PASID Entry.
> Relevant Intel IOMMU SDM ref for first stage table "3.6.2 Accessed, Extended
> Accessed, and Dirty Flags" and second stage table "3.7.2 Accessed and Dirty
> Flags".
>
> To enable it we depend on scalable-mode for the second-stage table,
> so we limit use of dirty-bit to scalable-mode To use SSADS, we set a bit in
> the scalable-mode PASID Table entry, by setting bit 9 (SSADE). When
> we do so we require flushing the context/pasid-table caches and IOTLB much
> like AMD. Relevant SDM refs:
>
> "3.7.2 Accessed and Dirty Flags"
> "6.5.3.3 Guidance to Software for Invalidations,
> Table 23. Guidance to Software for Invalidations"
>
> To read out what's dirtied, same thing as past implementations is done.
> Dirty bit support is located in the same location (bit 9). The IOTLB
> caches some attributes when SSADE is enabled and dirty-ness information,
> so we also need to flush IOTLB to make sure IOMMU attempts to set the
> dirty bit again. Relevant manuals over the hardware translation is
> chapter 6 with some special mention to:
>
> "6.2.3.1 Scalable-Mode PASID-Table Entry Programming Considerations"
> "6.2.4 IOTLB"
>
> The first 12bits of the PTE are already cached via the SLPTE pointer,
> similar to how it is added in amd-iommu. Use also the previously
> added PASID entry cache in order to fetch whether Dirty bit was
> enabled or not in the SM second stage table.
>
> This is useful for covering and prototyping IOMMU support for access/dirty
> bits.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
> hw/i386/intel_iommu.c | 85 ++++++++++++++++++++++++++++++----
> hw/i386/intel_iommu_internal.h | 4 ++
> hw/i386/trace-events | 2 +
> 3 files changed, 82 insertions(+), 9 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 752940fa4c0e..e946f793a968 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -651,6 +651,21 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr,
> uint32_t index)
> return slpte;
> }
>
> +/* Get the content of a spte located in @base_addr[@index] */
> +static uint64_t vtd_set_slpte(dma_addr_t base_addr, uint32_t index,
> + uint64_t slpte)
> +{
> +
> + if (dma_memory_write(&address_space_memory,
> + base_addr + index * sizeof(slpte), &slpte,
> + sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) {
> + slpte = (uint64_t)-1;
> + return slpte;
> + }
> +
> + return vtd_get_slpte(base_addr, index);
> +}
> +
> /* Given an iova and the level of paging structure, return the offset
> * of current level.
> */
> @@ -720,6 +735,11 @@ static inline bool vtd_pe_present(VTDPASIDEntry *pe)
> return pe->val[0] & VTD_PASID_ENTRY_P;
> }
>
> +static inline bool vtd_pe_slad_enabled(VTDPASIDEntry *pe)
> +{
> + return pe->val[0] & VTD_SM_PASID_ENTRY_SLADE;
> +}
> +
> static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
> uint32_t pasid,
> dma_addr_t addr,
> @@ -1026,6 +1046,33 @@ static VTDBus
> *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> return NULL;
> }
>
> +static inline bool vtd_ssad_update(IntelIOMMUState *s, uint16_t pe_flags,
> + uint64_t *slpte, bool is_write,
> + bool reads, bool writes)
> +{
> + bool dirty, access = reads;
> +
> + if (!(pe_flags & VTD_SM_PASID_ENTRY_SLADE)) {
> + return false;
> + }
> +
> + dirty = access = false;
> +
> + if (is_write && writes && !(*slpte & VTD_SL_D)) {
> + *slpte |= VTD_SL_D;
> + trace_vtd_dirty_update(*slpte);
> + dirty = true;
> + }
> +
> + if (((!is_write && reads) || dirty) && !(*slpte & VTD_SL_A)) {
> + *slpte |= VTD_SL_A;
> + trace_vtd_access_update(*slpte);
> + access = true;
> + }
> +
> + return dirty || access;
> +}
> +
> /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
> * of the translation, can be used for deciding the size of large page.
> */
> @@ -1039,6 +1086,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s,
> VTDContextEntry *ce,
> uint32_t offset;
> uint64_t slpte;
> uint64_t access_right_check;
> + uint16_t pe_flags;
>
> if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
> error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
> @@ -1054,14 +1102,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s,
> VTDContextEntry *ce,
> slpte = vtd_get_slpte(addr, offset);
>
> if (slpte == (uint64_t)-1) {
> - error_report_once("%s: detected read error on DMAR slpte "
> - "(iova=0x%" PRIx64 ")", __func__, iova);
> - if (level == vtd_get_iova_level(s, ce)) {
> - /* Invalid programming of context-entry */
> - return -VTD_FR_CONTEXT_ENTRY_INV;
> - } else {
> - return -VTD_FR_PAGING_ENTRY_INV;
> - }
> + goto inv_slpte;
> }
> *reads = (*reads) && (slpte & VTD_SL_R);
> *writes = (*writes) && (slpte & VTD_SL_W);
> @@ -1081,6 +1122,14 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s,
> VTDContextEntry *ce,
> }
>
> if (vtd_is_last_slpte(slpte, level)) {
> + pe_flags = vtd_sm_pasid_entry_flags(s, ce);
> + if (vtd_ssad_update(s, pe_flags, &slpte, is_write,
> + *reads, *writes)) {
> + slpte = vtd_set_slpte(addr, offset, slpte);
> + if (slpte == (uint64_t)-1) {
> + goto inv_slpte;
> + }
> + }
> *slptep = slpte;
> *slpte_level = level;
> return 0;
> @@ -1088,6 +1137,16 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s,
> VTDContextEntry *ce,
> addr = vtd_get_slpte_addr(slpte, aw_bits);
> level--;
> }
> +
> +inv_slpte:
> + error_report_once("%s: detected read error on DMAR slpte "
> + "(iova=0x%" PRIx64 ")", __func__, iova);
> + if (level == vtd_get_iova_level(s, ce)) {
> + /* Invalid programming of context-entry */
> + return -VTD_FR_CONTEXT_ENTRY_INV;
> + } else {
> + return -VTD_FR_PAGING_ENTRY_INV;
> + }
> }
>
> typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);
> @@ -1742,6 +1801,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace
> *vtd_as, PCIBus *bus,
> slpte = iotlb_entry->slpte;
> access_flags = iotlb_entry->access_flags;
> page_mask = iotlb_entry->mask;
> + if (vtd_ssad_update(s, iotlb_entry->sm_pe_flags, &slpte, is_write,
> + access_flags & IOMMU_RO, access_flags &
> IOMMU_WO)) {
> + uint32_t offset;
> +
> + offset = vtd_iova_level_offset(addr, vtd_get_iova_level(s,
> &ce));
> + vtd_set_slpte(addr, offset, slpte);
> + }
> goto out;
> }
>
> @@ -3693,7 +3759,8 @@ static void vtd_init(IntelIOMMUState *s)
>
> /* TODO: read cap/ecap from host to decide which cap to be exposed. */
> if (s->scalable_mode) {
> - s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS |
> + VTD_ECAP_SLADS;
> }
We probably need a dedicated command line parameter and make it compat
for pre 7.1 machines.
Otherwise we may break migration.
Thanks
>
> if (s->snoop_control) {
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 1ff13b40f9bb..c00f6e7b4a72 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -192,6 +192,7 @@
> #define VTD_ECAP_MHMV (15ULL << 20)
> #define VTD_ECAP_SRS (1ULL << 31)
> #define VTD_ECAP_SMTS (1ULL << 43)
> +#define VTD_ECAP_SLADS (1ULL << 45)
> #define VTD_ECAP_SLTS (1ULL << 46)
>
> /* CAP_REG */
> @@ -492,6 +493,7 @@ typedef struct VTDRootEntry VTDRootEntry;
>
> #define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width
> */
> #define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK)
> +#define VTD_SM_PASID_ENTRY_SLADE (1ULL << 9)
>
> /* Second Level Page Translation Pointer*/
> #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
> @@ -515,5 +517,7 @@ typedef struct VTDRootEntry VTDRootEntry;
> #define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) &
> VTD_HAW_MASK(aw))
> #define VTD_SL_IGN_COM 0xbff0000000000000ULL
> #define VTD_SL_TM (1ULL << 62)
> +#define VTD_SL_A (1ULL << 8)
> +#define VTD_SL_D (1ULL << 9)
>
> #endif
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index eb5f075873cd..e4122ee8a999 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -66,6 +66,8 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d
> high 0x%"PRIx64" low
> vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
> vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid
> 0x%"PRIx16" index %d vec %d (should be: %d)"
> vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid
> 0x%"PRIx16" index %d trigger %d (should be: %d)"
> +vtd_access_update(uint64_t slpte) "slpte 0x%"PRIx64
> +vtd_dirty_update(uint64_t slpte) "slpte 0x%"PRIx64
>
> # amd_iommu.c
> amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at
> addr 0x%"PRIx64" + offset 0x%"PRIx32
> --
> 2.17.2
>
- [PATCH RFC 00/10] hw/vfio, x86/iommu: IOMMUFD Dirty Tracking, Joao Martins, 2022/04/28
- [PATCH RFC 02/10] amd-iommu: Access/Dirty bit support, Joao Martins, 2022/04/28
- [PATCH RFC 03/10] intel-iommu: Cache PASID entry flags, Joao Martins, 2022/04/28
- [PATCH RFC 05/10] linux-headers: import iommufd.h hwpt extensions, Joao Martins, 2022/04/28
- [PATCH RFC 07/10] vfio/iommufd: Add HWPT_GET_DIRTY_IOVA support, Joao Martins, 2022/04/28
- [PATCH RFC 08/10] vfio/iommufd: Add IOAS_UNMAP_DIRTY support, Joao Martins, 2022/04/28
- [PATCH RFC 06/10] vfio/iommufd: Add HWPT_SET_DIRTY support, Joao Martins, 2022/04/28
- [PATCH RFC 09/10] migration/dirtyrate: Expand dirty_bitmap to be tracked separately for devices, Joao Martins, 2022/04/28
- [PATCH RFC 10/10] hw/vfio: Add nr of dirty pages to tracepoints, Joao Martins, 2022/04/28