From: Peter Xu
Subject: [PATCH RFC 17/21] migration: Rework ram discard logic for hugetlb double-map
Date: Tue, 17 Jan 2023 17:09:10 -0500

Hugetlb double-map will make the ram discard logic different.

The whole idea will still be the same: we need to do a bitmap sync between
src/dst before we switch to postcopy.

When discarding a range, we only erase the pgtables that used to map it for
the guest, leveraging the semantics of MADV_DONTNEED on Linux.  This
guarantees that when the guest accesses the range again we'll receive a
MINOR fault message rather than a MISSING fault message.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/exec/cpu-common.h |  1 +
 migration/ram.c           | 16 +++++++++++++++-
 migration/trace-events    |  1 +
 softmmu/physmem.c         | 31 +++++++++++++++++++++++++++++++
 4 files changed, 48 insertions(+), 1 deletion(-)
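
For illustration only (not part of the patch): below is a minimal
standalone Linux sketch of the zap trick described in the commit message,
using a memfd/shmem region instead of hugetlb for brevity.  The "zap-demo"
name and the bare-bones error handling are made up for the example;
registering shmem for minor faults assumes UFFD_FEATURE_MINOR_SHMEM
(Linux 5.13+), while the real patch relies on hugetlb minor faults
(UFFD_FEATURE_MINOR_HUGETLBFS, 5.7+).

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    size_t len = 2 * 1024 * 1024;

    /* Shared file-backed memory; the real patch uses a hugetlb file. */
    int fd = memfd_create("zap-demo", 0);
    if (fd < 0 || ftruncate(fd, len) != 0) {
        return 1;
    }
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (buf == MAP_FAILED) {
        return 1;
    }

    /* Touch every page first, so the page cache is fully populated. */
    memset(buf, 1, len);

    /* Register the range for MINOR (not MISSING) fault tracking. */
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_MINOR_SHMEM,
    };
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)buf, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MINOR,
    };
    if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) != 0 ||
        ioctl(uffd, UFFDIO_REGISTER, &reg) != 0) {
        return 1;
    }

    /*
     * The "zap": MADV_DONTNEED on a shared mapping drops only the
     * pgtables; the page cache keeps the data.  The next access to the
     * range is reported on uffd with UFFD_PAGEFAULT_FLAG_MINOR instead
     * of being treated as a missing page.
     */
    madvise(buf, len, MADV_DONTNEED);

    printf("zapped %zu bytes; a read would now raise a minor fault\n", len);
    return 0;
}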

diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 4c394ccdfc..09378c6ada 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -155,6 +155,7 @@ typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
 
 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length);
+int ram_block_zap_range(RAMBlock *rb, uint64_t start, size_t length);
 
 #endif
 
diff --git a/migration/ram.c b/migration/ram.c
index 4d786f4b97..4da56d925c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2770,6 +2770,12 @@ static void postcopy_each_ram_send_discard(MigrationState *ms)
          * host-page size chunks, mark any partially dirty host-page size
          * chunks as all dirty.  In this case the host-page is the host-page
          * for the particular RAMBlock, i.e. it might be a huge page.
+         *
+         * Note: we need to do the huge page truncation when double-map is
+         * enabled too, _only_ because we use MADV_DONTNEED to drop the
+         * pgtables on dest QEMU, and it (at least so far...) does not
+         * support dropping only part of a hugetlb pgtable.  If it ever
+         * does, we can skip this "chunk" operation as a further optimization.
          */
         postcopy_chunk_hostpages_pass(ms, block);
 
@@ -2913,7 +2919,15 @@ int ram_discard_range(const char *rbname, uint64_t start, size_t length)
                      length >> qemu_target_page_bits());
     }
 
-    return ram_block_discard_range(rb, start, length);
+    if (postcopy_use_minor_fault(rb)) {
+        /*
+         * We need to keep the page cache in place, so that every future
+         * access to the old pages will trigger a MINOR fault.
+         */
+        return ram_block_zap_range(rb, start, length);
+    } else {
+        return ram_block_discard_range(rb, start, length);
+    }
 }
 
 /*
diff --git a/migration/trace-events b/migration/trace-events
index 57003edcbd..6b418a0e9e 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -92,6 +92,7 @@ migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
 migration_throttle(void) ""
 ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: %" PRIx64 " %zx"
+postcopy_discard_range(const char *rbname, uint64_t start, void *host, size_t len) "%s: start=%" PRIx64 " haddr=%p len=%zx"
 ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: addr: 0x%" PRIx64 " flags: 0x%x host: %p"
 ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%" PRIx64 " flags=0x%x"
 ram_postcopy_send_discard_bitmap(void) ""
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 536c204811..12c0bc9aee 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -3567,6 +3567,37 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
     return ret;
 }
 
+/*
+ * Zap the page tables for the specified range.  Only applicable to
+ * file-backed memory.  We rely on Linux's MADV_DONTNEED semantics to
+ * zap the pgtables; it may or may not work on other OSes.  Until we
+ * know that, fail on them.
+ */
+int ram_block_zap_range(RAMBlock *rb, uint64_t start, size_t length)
+{
+#ifdef CONFIG_LINUX
+    uint8_t *host_addr = rb->host + start;
+    int ret;
+
+    if (rb->fd == -1) {
+        /* The zap magic only works with file-backed */
+        return -EINVAL;
+    }
+
+    ret = madvise(host_addr, length, MADV_DONTNEED);
+    if (ret) {
+        ret = -errno;
+        error_report("%s: Failed to zap ramblock start=0x%"PRIx64
+                     " addr=0x%"PRIx64" length=0x%zx", __func__,
+                     start, (uint64_t)host_addr, length);
+    }
+
+    return ret;
+#else
+    return -EINVAL;
+#endif
+}
+
 /*
  * Unmap pages of memory from start to start+length such that
  * they a) read as 0, b) Trigger whatever fault mechanism
-- 
2.37.3



