Re: [Qemu-devel] [WIP:COLO: 1] Flush colo ram in parallel


From: Hailiang Zhang
Subject: Re: [Qemu-devel] [WIP:COLO: 1] Flush colo ram in parallel
Date: Mon, 29 Feb 2016 20:41:14 +0800
User-agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Thunderbird/38.5.1

On 2016/2/29 18:19, Dr. David Alan Gilbert (git) wrote:
From: "Dr. David Alan Gilbert" <address@hidden>

Flush the colo ram cache in parallel; use the same number
of threads as CPU cores.

On a VM with 4 cores, and 4GB RAM, I've seen a reduction from
~20ms to ~16ms using this, which is helpful but not as much
as I hoped;   I guess one problem might be that all the changes
could be concentrated in one area of RAM?  Perhaps another approach

Agreed. Dirty pages at contiguous addresses are common in real-world
usage. The idea of dividing the ramblock by address and flushing the
parts in parallel could also be used in colo_init_ram_cache(), where we
need to back up the SVM's RAM into the cache.
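
Something like this minimal sketch is what I have in mind (plain pthreads
and made-up names, just to illustrate the split-by-address idea, not QEMU
code): divide the region into per-thread chunks, aligned down so that
neighbouring threads never share a page or a dirty-bitmap word, and let
each thread memcpy its chunk, with the last thread taking the rounding
remainder.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    const uint8_t *src;   /* stands in for the guest RAM (block->host) */
    uint8_t *dst;         /* stands in for the colo cache */
    size_t start, len;    /* this thread's chunk */
} CopyJob;

static void *copy_worker(void *opaque)
{
    CopyJob *job = opaque;

    memcpy(job->dst + job->start, job->src + job->start, job->len);
    return NULL;
}

/* Split [0, size) into nthreads chunks, aligned down to 'align' bytes
 * (a power of two) so neighbouring threads never share a bitmap word. */
static void parallel_backup(const uint8_t *src, uint8_t *dst, size_t size,
                            int nthreads, size_t align)
{
    pthread_t *tids = calloc(nthreads, sizeof(*tids));
    CopyJob *jobs = calloc(nthreads, sizeof(*jobs));
    size_t chunk = (size / nthreads) & ~(align - 1);
    int i;

    for (i = 0; i < nthreads; i++) {
        jobs[i].src = src;
        jobs[i].dst = dst;
        jobs[i].start = (size_t)i * chunk;
        /* the last thread picks up the rounding remainder */
        jobs[i].len = (i == nthreads - 1) ? size - jobs[i].start : chunk;
        pthread_create(&tids[i], NULL, copy_worker, &jobs[i]);
    }
    for (i = 0; i < nthreads; i++) {
        pthread_join(tids[i], NULL);
    }
    free(jobs);
    free(tids);
}

int main(void)
{
    size_t size = 64u << 20;            /* 64MB dummy "guest RAM" */
    uint8_t *ram = malloc(size), *cache = malloc(size);

    memset(ram, 0xab, size);
    parallel_backup(ram, cache, size, 4, 4096);
    free(ram);
    free(cache);
    return 0;
}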

> would be to have one thread searching the bitmap and other threads
> doing the copy with some type of work queue.


I guess that in most cases finding a dirty bit in the bitmap is much
faster than copying a page, so I think this method could be more
effective and more reasonable, but let's implement and test it first ;)
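
For reference, a rough self-contained sketch of that scanner + work-queue
split (again plain pthreads and invented names, not QEMU code): one thread
walks the dirty bitmap and pushes page indices into a small bounded queue,
while a few worker threads pop indices and do the page copies.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE   4096
#define QUEUE_DEPTH 256
#define DONE        ((size_t)-1)   /* sentinel: one pushed per worker */

typedef struct {
    size_t slots[QUEUE_DEPTH];
    unsigned head, tail, count;
    pthread_mutex_t lock;
    pthread_cond_t not_empty, not_full;
} WorkQueue;

static WorkQueue wq = {
    .lock = PTHREAD_MUTEX_INITIALIZER,
    .not_empty = PTHREAD_COND_INITIALIZER,
    .not_full = PTHREAD_COND_INITIALIZER,
};

/* stand-ins for block->host and block->colo_cache */
static uint8_t *host_ram, *cache_ram;

static void queue_push(size_t page)
{
    pthread_mutex_lock(&wq.lock);
    while (wq.count == QUEUE_DEPTH) {
        pthread_cond_wait(&wq.not_full, &wq.lock);
    }
    wq.slots[wq.tail] = page;
    wq.tail = (wq.tail + 1) % QUEUE_DEPTH;
    wq.count++;
    pthread_cond_signal(&wq.not_empty);
    pthread_mutex_unlock(&wq.lock);
}

static size_t queue_pop(void)
{
    size_t page;

    pthread_mutex_lock(&wq.lock);
    while (wq.count == 0) {
        pthread_cond_wait(&wq.not_empty, &wq.lock);
    }
    page = wq.slots[wq.head];
    wq.head = (wq.head + 1) % QUEUE_DEPTH;
    wq.count--;
    pthread_cond_signal(&wq.not_full);
    pthread_mutex_unlock(&wq.lock);
    return page;
}

/* Worker: copy pages handed to it until it sees the DONE sentinel. */
static void *copy_worker(void *opaque)
{
    size_t page;

    (void)opaque;
    while ((page = queue_pop()) != DONE) {
        memcpy(host_ram + page * PAGE_SIZE,
               cache_ram + page * PAGE_SIZE, PAGE_SIZE);
    }
    return NULL;
}

int main(void)
{
    enum { NPAGES = 1024, NWORKERS = 4, BITS = 8 * sizeof(unsigned long) };
    unsigned long bitmap[NPAGES / BITS] = { 0 };
    pthread_t workers[NWORKERS];
    size_t page;
    int i;

    host_ram = calloc(NPAGES, PAGE_SIZE);
    cache_ram = calloc(NPAGES, PAGE_SIZE);
    bitmap[0] = 0x5;                   /* pretend pages 0 and 2 are dirty */

    for (i = 0; i < NWORKERS; i++) {
        pthread_create(&workers[i], NULL, copy_worker, NULL);
    }
    /* Scanner: the (cheap) bitmap walk stays single threaded. */
    for (page = 0; page < NPAGES; page++) {
        if (bitmap[page / BITS] & (1ul << (page % BITS))) {
            queue_push(page);
        }
    }
    for (i = 0; i < NWORKERS; i++) {
        queue_push(DONE);              /* tell each worker to stop */
    }
    for (i = 0; i < NWORKERS; i++) {
        pthread_join(workers[i], NULL);
    }
    free(host_ram);
    free(cache_ram);
    return 0;
}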

Thanks,
Hailiang

[Note: Not for merging at the moment, just for discussion with the COLO guys]

Signed-off-by: Dr. David Alan Gilbert <address@hidden>
---
  migration/ram.c | 215 +++++++++++++++++++++++++++++++++++++++++++++-----------
  1 file changed, 174 insertions(+), 41 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 188c3a1..6458863 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -42,6 +42,7 @@
  #include "qemu/rcu_queue.h"
  #include "migration/colo.h"
  #include "crypto/hash.h"
+#include "sysemu/sysemu.h"

  #ifdef DEBUG_MIGRATION_RAM
  #define DPRINTF(fmt, ...) \
@@ -616,18 +617,18 @@ static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
      ret = test_and_clear_bit(nr, bitmap);

      if (ret) {
-        migration_dirty_pages--;
+        atomic_dec(&migration_dirty_pages);
      }
      return ret;
  }

  static inline
  ram_addr_t ramlist_bitmap_find_and_reset_dirty(RAMBlock *rb,
-                                               ram_addr_t start)
+                                               ram_addr_t start,
+                                               uint64_t rb_size)
  {
      unsigned long base = rb->offset >> TARGET_PAGE_BITS;
      unsigned long nr = base + (start >> TARGET_PAGE_BITS);
-    uint64_t rb_size = rb->used_length;
      unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
      unsigned long next;

@@ -2721,6 +2722,141 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
      return ret;
  }

+/* For parallel flushing of the colo ram cache; it has 'n' threads,
+ * currently 'n' == number of CPUs, but it might be smarter to do something
+ * NUMA-aware.  Each thread waits for 'triggers' and signals 'completes'.
+ */
+typedef struct ram_cache_threads {
+    QemuThread *threads;
+    QemuEvent  *triggers;
+    QemuEvent  *completes;
+    bool        quit;
+} ColoRamCacheThreads;
+
+static ColoRamCacheThreads colo_flush_threads;
+
+/*
+ * Helper for colo_flush_thread.
+ * Given the current block and the bounds of ram (as byte offsets)
+ * that we're dealing with, find our next usable block,
+ * and set offset_out/used_out to the limits we're interested in.
+ * Result is NULL when we run out of blocks.
+ */
+static RAMBlock *flush_find_next_block(RAMBlock *block,
+                                       ram_addr_t lower_bound,
+                                       ram_addr_t upper_bound,
+                                       ram_addr_t *offset_out,
+                                       ram_addr_t *used_out)
+{
+    do {
+        if (!block) {
+            block = QLIST_FIRST_RCU(&ram_list.blocks);
+        } else {
+            block = QLIST_NEXT_RCU(block, next);
+        }
+        if (block &&
+            (block->offset + block->used_length) >= lower_bound &&
+            (block->offset < upper_bound)) {
+            /* OK, good, the block is at least partially within our bounds */
+            if (block->offset <= lower_bound) {
+                *offset_out = lower_bound - block->offset;
+            } else {
+                *offset_out = 0;
+            }
+            if ((block->offset + block->used_length) >= upper_bound) {
+                *used_out = upper_bound - block->offset;
+            } else {
+                *used_out = block->used_length;
+            }
+            break;
+        }
+    } while (block);
+
+    return block;
+}
+
+/* Flush thread for COLO ram cache, synchronises a proportion of the
+ * ram cache.
+ */
+static void *colo_flush_thread(void *opaque)
+{
+    int i = (int)(intptr_t)opaque;
+    int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    ram_addr_t lower, upper;
+    /* work out our range, lower..upper-1
+     * want to avoid SMP issues on the dirty bitmap, so make sure
+     * our limits never land on the same 64bit word
+     */
+    ram_addr_t chunk_size = (ram_cache_pages / smp_cpus) & ~63ul;
+
+    lower = i * chunk_size;
+
+    if (i != (smp_cpus - 1)) {
+        upper = (i + 1) * chunk_size;
+    } else {
+        /* Last thread deals with the extra few pages due to rounding */
+        upper = ram_cache_pages;
+    }
+    lower <<= TARGET_PAGE_BITS;
+    upper <<= TARGET_PAGE_BITS;
+
+    while (true) {
+        RAMBlock *block = NULL;
+        void *dst_host;
+        void *src_host;
+        ram_addr_t offset = ~0, host_off = 0, cache_off = 0, used_length = 0;
+        uint64_t host_dirty = 0, both_dirty = 0;
+
+        /* Wait for work */
+        qemu_event_wait(&colo_flush_threads.triggers[i]);
+        qemu_event_reset(&colo_flush_threads.triggers[i]);
+        if (colo_flush_threads.quit) {
+            break;
+        }
+
+        rcu_read_lock();
+        /* note offset is initialised to ~0 so 1st time through we drop
+         * through to finding a block
+         */
+        do {
+            ram_addr_t ram_addr_abs;
+            if (cache_off == offset) { /* walk ramblock->colo_cache */
+                cache_off = migration_bitmap_find_dirty(block,
+                                                        offset, &ram_addr_abs);
+                if (cache_off < used_length) {
+                    migration_bitmap_clear_dirty(ram_addr_abs);
+                }
+            }
+            if (host_off == offset) { /* walk ramblock->host */
+                host_off = ramlist_bitmap_find_and_reset_dirty(block, offset,
+                               used_length);
+            }
+            if (!block || (host_off >= used_length &&
+                cache_off >= used_length)) {
+                block = flush_find_next_block(block, lower, upper,
+                                              &offset, &used_length);
+                cache_off = host_off = offset;
+            } else {
+                if (host_off <= cache_off) {
+                    offset = host_off;
+                    host_dirty++;
+                    both_dirty += (host_off == cache_off);
+                } else {
+                    offset = cache_off;
+                }
+                dst_host = block->host + offset;
+                src_host = block->colo_cache + offset;
+                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
+            }
+        } while (block);
+        rcu_read_unlock();
+        trace_colo_flush_ram_cache_end(host_dirty, both_dirty);
+
+        qemu_event_set(&colo_flush_threads.completes[i]);
+    }
+
+    return NULL;
+}
  /*
   * colo cache: this is for secondary VM, we cache the whole
   * memory of the secondary VM, it will be called after first migration.
@@ -2729,6 +2865,7 @@ int colo_init_ram_cache(void)
  {
      RAMBlock *block;
      int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    int i;

      rcu_read_lock();
      QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
@@ -2753,6 +2890,20 @@ int colo_init_ram_cache(void)
      migration_dirty_pages = 0;
      memory_global_dirty_log_start();

+    colo_flush_threads.threads = g_new0(QemuThread, smp_cpus);
+    colo_flush_threads.triggers = g_new0(QemuEvent, smp_cpus);
+    colo_flush_threads.completes = g_new0(QemuEvent, smp_cpus);
+    colo_flush_threads.quit = false;
+    for (i = 0; i < smp_cpus; i++) {
+        char name[32];
+        qemu_event_init(&colo_flush_threads.triggers[i], false);
+        qemu_event_init(&colo_flush_threads.completes[i], false);
+        sprintf(name, "colofl: %d", i);
+        qemu_thread_create(&colo_flush_threads.threads[i], name,
+                           colo_flush_thread,
+                           (void *)(intptr_t)i,
+                           QEMU_THREAD_JOINABLE);
+    }
      return 0;

  out_locked:
@@ -2771,9 +2922,19 @@ void colo_release_ram_cache(void)
  {
      RAMBlock *block;
      struct BitmapRcu *bitmap = migration_bitmap_rcu;
+    int i;

      ram_cache_enable = false;

+    colo_flush_threads.quit = true;
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_set(&colo_flush_threads.triggers[i]);
+        qemu_thread_join(&colo_flush_threads.threads[i]);
+    }
+    g_free(colo_flush_threads.threads);
+    g_free(colo_flush_threads.triggers);
+    g_free(colo_flush_threads.completes);
+
      atomic_rcu_set(&migration_bitmap_rcu, NULL);
      if (bitmap) {
          call_rcu(bitmap, migration_bitmap_free, rcu);
@@ -2795,47 +2956,19 @@ void colo_release_ram_cache(void)
   */
  void colo_flush_ram_cache(void)
  {
-    RAMBlock *block = NULL;
-    void *dst_host;
-    void *src_host;
-    ram_addr_t offset = 0, host_off = 0, cache_off = 0;
-    uint64_t host_dirty = 0, both_dirty = 0;
-
+    int i;
      trace_colo_flush_ram_cache_begin(migration_dirty_pages);
      address_space_sync_dirty_bitmap(&address_space_memory);
-    rcu_read_lock();
-    block = QLIST_FIRST_RCU(&ram_list.blocks);
-    while (block) {
-        ram_addr_t ram_addr_abs;
-        if (cache_off == offset) { /* walk ramblock->colo_cache */
-            cache_off = migration_bitmap_find_dirty(block,
-                                                    offset, &ram_addr_abs);
-            if (cache_off < block->used_length) {
-                migration_bitmap_clear_dirty(ram_addr_abs);
-            }
-        }
-        if (host_off == offset) { /* walk ramblock->host */
-            host_off = ramlist_bitmap_find_and_reset_dirty(block, offset);
-        }
-        if (host_off >= block->used_length &&
-            cache_off >= block->used_length) {
-            cache_off = host_off = offset = 0;
-            block = QLIST_NEXT_RCU(block, next);
-        } else {
-            if (host_off <= cache_off) {
-                offset = host_off;
-                host_dirty++;
-                both_dirty += (host_off == cache_off);
-            } else {
-                offset = cache_off;
-            }
-            dst_host = block->host + offset;
-            src_host = block->colo_cache + offset;
-            memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
-        }
+
+    /* Kick all the flush threads to start work */
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_reset(&colo_flush_threads.completes[i]);
+        qemu_event_set(&colo_flush_threads.triggers[i]);
+    }
+    /* ...and wait for them to finish */
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_wait(&colo_flush_threads.completes[i]);
      }
-    rcu_read_unlock();
-    trace_colo_flush_ram_cache_end(host_dirty, both_dirty);
      assert(migration_dirty_pages == 0);
  }





