[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC 00/28] bitmap handling optimization
From: |
Paolo Bonzini |
Subject: |
Re: [Qemu-devel] [RFC 00/28] bitmap handling optimization |
Date: |
Wed, 09 Oct 2013 15:29:57 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130923 Thunderbird/17.0.9 |
Il 09/10/2013 13:28, Juan Quintela ha scritto:
> Hi
>
> This series split the dirty bitmap (8 bits per page, only three used)
> into 3 individual bitmaps. Once the conversion is done, operations
> are handled by bitmap operations, not bit by bit.
>
> - *_DIRTY_FLAG flags are gone, now we use memory.h DIRTY_MEMORY_*
> everywhere.
>
> - We set/reset each flag individually
> (set_dirty_flags(0xff&~CODE_DIRTY_FLAG)) are gone.
>
> - Rename several functions to clarify/make consistent things.
>
> - I know it dont't pass checkpatch for long lines, propper submission
> should pass it. We have to have long lines, short variable names, or
> ugly line splitting :p
>
> - DIRTY_MEMORY_NUM: how can one include exec/memory.h into cpu-all.h?
> #include it don't work, as a workaround, I have copied its value, but
> any better idea? I can always create "exec/migration-flags.h", though.
I think both files are too "central" and you get some sort of circular
dependency.
The solution could be to move RAM definitions from cpu-all.h to
memory-internal.h. For example:
diff --git a/arch_init.c b/arch_init.c
index 7545d96..8752e27 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -47,7 +47,7 @@
#include "qemu/config-file.h"
#include "qmp-commands.h"
#include "trace.h"
-#include "exec/cpu-all.h"
+#include "exec/memory-internal.h"
#include "hw/acpi/acpi.h"
#ifdef DEBUG_ARCH_INIT
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 019dc20..4cfde66 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -435,57 +435,11 @@ void cpu_watchpoint_remove_all(CPUArchState *env, int
mask);
#if !defined(CONFIG_USER_ONLY)
-/* memory API */
-
extern ram_addr_t ram_size;
-
-/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
-#define RAM_PREALLOC_MASK (1 << 0)
-
-typedef struct RAMBlock {
- struct MemoryRegion *mr;
- uint8_t *host;
- ram_addr_t offset;
- ram_addr_t length;
- uint32_t flags;
- char idstr[256];
- /* Reads can take either the iothread or the ramlist lock.
- * Writes must take both locks.
- */
- QTAILQ_ENTRY(RAMBlock) next;
- int fd;
-} RAMBlock;
-
-#define DIRTY_MEMORY_NUM 3
-
-typedef struct RAMList {
- QemuMutex mutex;
- /* Protected by the iothread lock. */
- unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
- RAMBlock *mru_block;
- /* Protected by the ramlist lock. */
- QTAILQ_HEAD(, RAMBlock) blocks;
- uint32_t version;
-} RAMList;
-extern RAMList ram_list;
-
extern const char *mem_path;
extern int mem_prealloc;
-/* Flags stored in the low bits of the TLB virtual address. These are
- defined so that fast path ram access is all zeros. */
-/* Zero if TLB entry is valid. */
-#define TLB_INVALID_MASK (1 << 3)
-/* Set if TLB entry references a clean RAM page. The iotlb entry will
- contain the page physical address. */
-#define TLB_NOTDIRTY (1 << 4)
-/* Set if TLB entry is an IO callback. */
-#define TLB_MMIO (1 << 5)
-
void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
-ram_addr_t last_ram_offset(void);
-void qemu_mutex_lock_ramlist(void);
-void qemu_mutex_unlock_ramlist(void);
#endif /* !CONFIG_USER_ONLY */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index e21cb60..719fa27 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -20,6 +20,17 @@
#define CPUTLB_H
#if !defined(CONFIG_USER_ONLY)
+
+/* Flags stored in the low bits of the TLB virtual address. These are
+ defined so that fast path ram access is all zeros. */
+/* Zero if TLB entry is valid. */
+#define TLB_INVALID_MASK (1 << 3)
+/* Set if TLB entry references a clean RAM page. The iotlb entry will
+ contain the page physical address. */
+#define TLB_NOTDIRTY (1 << 4)
+/* Set if TLB entry is an IO callback. */
+#define TLB_MMIO (1 << 5)
+
/* cputlb.c */
void tlb_protect_code(ram_addr_t ram_addr);
void tlb_unprotect_code_phys(CPUArchState *env, ram_addr_t ram_addr,
diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
index d46570e..aaf76c2 100644
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@@ -22,6 +22,35 @@
#ifndef CONFIG_USER_ONLY
#include "hw/xen/xen.h"
+/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
+#define RAM_PREALLOC_MASK (1 << 0)
+
+typedef struct RAMBlock {
+ struct MemoryRegion *mr;
+ uint8_t *host;
+ ram_addr_t offset;
+ ram_addr_t length;
+ uint32_t flags;
+ char idstr[256];
+ /* Reads can take either the iothread or the ramlist lock.
+ * Writes must take both locks.
+ */
+ QTAILQ_ENTRY(RAMBlock) next;
+ int fd;
+} RAMBlock;
+
+#define DIRTY_MEMORY_NUM 3
+
+typedef struct RAMList {
+ QemuMutex mutex;
+ /* Protected by the iothread lock. */
+ unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
+ RAMBlock *mru_block;
+ /* Protected by the ramlist lock. */
+ QTAILQ_HEAD(, RAMBlock) blocks;
+ uint32_t version;
+} RAMList;
+extern RAMList ram_list;
typedef struct AddressSpaceDispatch AddressSpaceDispatch;
@@ -105,6 +134,10 @@ static inline void
cpu_physical_memory_clear_dirty_range(ram_addr_t start,
void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
unsigned client);
+ram_addr_t last_ram_offset(void);
+void qemu_mutex_lock_ramlist(void);
+void qemu_mutex_unlock_ramlist(void);
+
#endif
#endif
diff --git a/include/exec/softmmu_template.h b/include/exec/softmmu_template.h
index 5bbc56a..cc27058 100644
--- a/include/exec/softmmu_template.h
+++ b/include/exec/softmmu_template.h
@@ -23,6 +23,7 @@
*/
#include "qemu/timer.h"
#include "exec/memory.h"
+#include "exec/cputlb.h"
#define DATA_SIZE (1 << SHIFT)
Bonus points for splitting RAM save out of arch_init.c. :)
> - create a lock for the bitmaps and fold migration bitmap into this
> one. This would avoid a copy and make things easier?
I think this is less important and not that easy. For example,
how fine-grained should the locking be without bogging down TCG?
You can speed up migration_bitmap_sync by a factor of 64 or more
if you copy word-by-word instead of memory_region_test_and_clear_dirty
and migration_bitmap_set_dirty.
Once you do that and also merge KVM and vhost bitmaps one word at a
time, it's likely that dirty bitmaps get almost out of the
profile.
> - As this code uses/abuses bitmaps, we need to change the type of the
> index from int to long. With an int index, we can only access a
> maximum of 8TB guest (yes, this is not urgent, we have a couple of
> years to do it).
Yes.
> - merging KVM <-> QEMU bitmap as a bitmap and not bit-by-bit.
Right. All of vhost_dev_sync_region, kvm_get_dirty_pages_log_range,
xen_sync_dirty_bitmap are really working a word at a time.
So it should be easy to optimize the log_sync implementations to
work with a word-at-a-time API instead of memory_region_set_dirty.
> - spliting the KVM bitmap synchronization into chunks, i.e. not
> synchronize all memory, just enough to continue with migration.
That can also help. However, it's not easy to do it without
making ram_save_pending's computations too optimistic.
So I'd just focus on speeding up migration_bitmap_sync first. It's
easy and should "almost" do the entirety of the work.
Paolo
- [Qemu-devel] [PATCH 20/28] memory: unfold cpu_physical_memory_clear_dirty_flag() in its only user, (continued)
- [Qemu-devel] [PATCH 20/28] memory: unfold cpu_physical_memory_clear_dirty_flag() in its only user, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 21/28] memory: unfold cpu_physical_memory_set_dirty() in its only user, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 23/28] memory: make cpu_physical_memory_get_dirty() the main function, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 22/28] memory: unfold cpu_physical_memory_set_dirty_flag(), Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 25/28] memory: s/mask/clear/ cpu_physical_memory_mask_dirty_range, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 24/28] memory: cpu_physical_memory_get_dirty() is used as returning a bool, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 26/28] memory: use find_next_bit() to find dirty bits, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 27/28] memory: cpu_physical_memory_set_dirty_range() now uses bitmap operations, Juan Quintela, 2013/10/09
- [Qemu-devel] [PATCH 28/28] memory: cpu_physical_memory_clear_dirty_range() now uses bitmap operations, Juan Quintela, 2013/10/09
- Re: [Qemu-devel] [RFC 00/28] bitmap handling optimization,
Paolo Bonzini <=