qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC 00/28] bitmap handling optimization


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [RFC 00/28] bitmap handling optimization
Date: Wed, 09 Oct 2013 15:29:57 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130923 Thunderbird/17.0.9

Il 09/10/2013 13:28, Juan Quintela ha scritto:
> Hi
> 
> This series split the dirty bitmap (8 bits per page, only three used)
> into 3 individual bitmaps.  Once the conversion is done, operations
> are handled by bitmap operations, not bit by bit.
> 
> - *_DIRTY_FLAG flags are gone, now we use memory.h DIRTY_MEMORY_*
>    everywhere.
> 
> - We set/reset each flag individually
>   (set_dirty_flags(0xff&~CODE_DIRTY_FLAG)) are gone.
> 
> - Rename several functions to clarify/make consistent things.
> 
> - I know it dont't pass checkpatch for long lines, propper submission
>   should pass it. We have to have long lines, short variable names, or
>   ugly line splitting :p
> 
> - DIRTY_MEMORY_NUM: how can one include exec/memory.h into cpu-all.h?
>   #include it don't work, as a workaround, I have copied its value, but
>   any better idea?  I can always create "exec/migration-flags.h", though.

I think both files are too "central" and you get some sort of circular 
dependency.

The solution could be to move RAM definitions from cpu-all.h to
memory-internal.h.  For example:

diff --git a/arch_init.c b/arch_init.c
index 7545d96..8752e27 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -47,7 +47,7 @@
 #include "qemu/config-file.h"
 #include "qmp-commands.h"
 #include "trace.h"
-#include "exec/cpu-all.h"
+#include "exec/memory-internal.h"
 #include "hw/acpi/acpi.h"
 
 #ifdef DEBUG_ARCH_INIT
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 019dc20..4cfde66 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -435,57 +435,11 @@ void cpu_watchpoint_remove_all(CPUArchState *env, int 
mask);
 
 #if !defined(CONFIG_USER_ONLY)
 
-/* memory API */
-
 extern ram_addr_t ram_size;
-
-/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
-#define RAM_PREALLOC_MASK   (1 << 0)
-
-typedef struct RAMBlock {
-    struct MemoryRegion *mr;
-    uint8_t *host;
-    ram_addr_t offset;
-    ram_addr_t length;
-    uint32_t flags;
-    char idstr[256];
-    /* Reads can take either the iothread or the ramlist lock.
-     * Writes must take both locks.
-     */
-    QTAILQ_ENTRY(RAMBlock) next;
-    int fd;
-} RAMBlock;
-
-#define DIRTY_MEMORY_NUM       3
-
-typedef struct RAMList {
-    QemuMutex mutex;
-    /* Protected by the iothread lock.  */
-    unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
-    RAMBlock *mru_block;
-    /* Protected by the ramlist lock.  */
-    QTAILQ_HEAD(, RAMBlock) blocks;
-    uint32_t version;
-} RAMList;
-extern RAMList ram_list;
-
 extern const char *mem_path;
 extern int mem_prealloc;
 
-/* Flags stored in the low bits of the TLB virtual address.  These are
-   defined so that fast path ram access is all zeros.  */
-/* Zero if TLB entry is valid.  */
-#define TLB_INVALID_MASK   (1 << 3)
-/* Set if TLB entry references a clean RAM page.  The iotlb entry will
-   contain the page physical address.  */
-#define TLB_NOTDIRTY    (1 << 4)
-/* Set if TLB entry is an IO callback.  */
-#define TLB_MMIO        (1 << 5)
-
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
-ram_addr_t last_ram_offset(void);
-void qemu_mutex_lock_ramlist(void);
-void qemu_mutex_unlock_ramlist(void);
 #endif /* !CONFIG_USER_ONLY */
 
 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index e21cb60..719fa27 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -20,6 +20,17 @@
 #define CPUTLB_H
 
 #if !defined(CONFIG_USER_ONLY)
+
+/* Flags stored in the low bits of the TLB virtual address.  These are
+   defined so that fast path ram access is all zeros.  */
+/* Zero if TLB entry is valid.  */
+#define TLB_INVALID_MASK   (1 << 3)
+/* Set if TLB entry references a clean RAM page.  The iotlb entry will
+   contain the page physical address.  */
+#define TLB_NOTDIRTY    (1 << 4)
+/* Set if TLB entry is an IO callback.  */
+#define TLB_MMIO        (1 << 5)
+
 /* cputlb.c */
 void tlb_protect_code(ram_addr_t ram_addr);
 void tlb_unprotect_code_phys(CPUArchState *env, ram_addr_t ram_addr,
diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
index d46570e..aaf76c2 100644
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@@ -22,6 +22,35 @@
 #ifndef CONFIG_USER_ONLY
 #include "hw/xen/xen.h"
 
+/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
+#define RAM_PREALLOC_MASK   (1 << 0)
+
+typedef struct RAMBlock {
+    struct MemoryRegion *mr;
+    uint8_t *host;
+    ram_addr_t offset;
+    ram_addr_t length;
+    uint32_t flags;
+    char idstr[256];
+    /* Reads can take either the iothread or the ramlist lock.
+     * Writes must take both locks.
+     */
+    QTAILQ_ENTRY(RAMBlock) next;
+    int fd;
+} RAMBlock;
+
+#define DIRTY_MEMORY_NUM       3
+
+typedef struct RAMList {
+    QemuMutex mutex;
+    /* Protected by the iothread lock.  */
+    unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
+    RAMBlock *mru_block;
+    /* Protected by the ramlist lock.  */
+    QTAILQ_HEAD(, RAMBlock) blocks;
+    uint32_t version;
+} RAMList;
+extern RAMList ram_list;
 
 typedef struct AddressSpaceDispatch AddressSpaceDispatch;
 
@@ -105,6 +134,10 @@ static inline void 
cpu_physical_memory_clear_dirty_range(ram_addr_t start,
 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
                                      unsigned client);
 
+ram_addr_t last_ram_offset(void);
+void qemu_mutex_lock_ramlist(void);
+void qemu_mutex_unlock_ramlist(void);
+
 #endif
 
 #endif
diff --git a/include/exec/softmmu_template.h b/include/exec/softmmu_template.h
index 5bbc56a..cc27058 100644
--- a/include/exec/softmmu_template.h
+++ b/include/exec/softmmu_template.h
@@ -23,6 +23,7 @@
  */
 #include "qemu/timer.h"
 #include "exec/memory.h"
+#include "exec/cputlb.h"
 
 #define DATA_SIZE (1 << SHIFT)
 

Bonus points for splitting RAM save out of arch_init.c. :)

> - create a lock for the bitmaps and fold migration bitmap into this
>   one.  This would avoid a copy and make things easier?

I think this is less important and not that easy.  For example,
how fine-grained should the locking be without bogging down TCG?

You can speed up migration_bitmap_sync by a factor of 64 or more
if you copy word-by-word instead of memory_region_test_and_clear_dirty
and migration_bitmap_set_dirty.

Once you do that and also merge KVM and vhost bitmaps one word at a
time, it's likely that dirty bitmaps get almost out of the
profile.

> - As this code uses/abuses bitmaps, we need to change the type of the
>   index from int to long.  With an int index, we can only access a
>   maximum of 8TB guest (yes, this is not urgent, we have a couple of
>   years to do it).

Yes.

> - merging KVM <-> QEMU bitmap as a bitmap and not bit-by-bit.

Right.  All of vhost_dev_sync_region, kvm_get_dirty_pages_log_range,
xen_sync_dirty_bitmap are really working a word at a time.
So it should be easy to optimize the log_sync implementations to
work with a word-at-a-time API instead of memory_region_set_dirty.

> - spliting the KVM bitmap synchronization into chunks, i.e. not
>   synchronize all memory, just enough to continue with migration.

That can also help.  However, it's not easy to do it without
making ram_save_pending's computations too optimistic.

So I'd just focus on speeding up migration_bitmap_sync first.  It's
easy and should "almost" do the entirety of the work.

Paolo



reply via email to

[Prev in Thread] Current Thread [Next in Thread]