Re: [Qemu-devel] [PULL 02/32] memory: RCU ram_list.dirty_memory[] for safe RAM hotplug


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [PULL 02/32] memory: RCU ram_list.dirty_memory[] for safe RAM hotplug
Date: Wed, 10 Feb 2016 18:01:55 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.5.0


On 10/02/2016 17:56, Leon Alrae wrote:
> Hi,
> 
> I just noticed a significant performance hit with this change. Booting a
> small system (I tried on system mips only) used to take around 20
> seconds; with this change it now takes about 3 minutes.

You're lucky that it booted at all. :)  Unfortunately I noticed a slow
boot yesterday (6 minutes for Fedora 21 on TCG), but I blamed it on the
fact that I was running on top of TCG _and_ NFS.  In retrospect that was
not too smart.

Today I tried FreeDOS and it hangs completely; unfortunately my self-NAK
and Peter's push crossed.

I've posted a fix and Stefan has already reviewed it.  If you can test
it, that would be great.  The Fedora 21 image mentioned above takes about
50 seconds to boot with the patch, which is in line with the 9x slowdown
you reported.

http://article.gmane.org/gmane.comp.emulators.qemu/393105/raw
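
To recap what the quoted patch changes: every dirty bitmap access now
splits the page number into a block index and an offset within a
fixed-size bitmap block, and readers only need an RCU read-side critical
section.  Roughly like this (a simplified sketch reusing the names from
the patch, not the exact code; page_is_dirty_sketch is just an
illustrative helper):

/* Illustrative only: look up one page's dirty bit for one client.
 * Assumes the usual QEMU headers (exec/ram_addr.h, qemu/rcu.h,
 * qemu/bitops.h) are included. */
static bool page_is_dirty_sketch(ram_addr_t addr, unsigned client)
{
    unsigned long page = addr >> TARGET_PAGE_BITS;
    unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;    /* which block */
    unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; /* bit within it */
    DirtyMemoryBlocks *blocks;
    bool dirty;

    rcu_read_lock();
    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
    dirty = test_bit(offset, blocks->blocks[idx]);
    rcu_read_unlock();

    return dirty;
}

Only the array of block pointers is reallocated on hotplug; the blocks
themselves stay in place, which is why a plain rcu_read_lock() is enough
on the reader side.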

Paolo

> Leon
> 
> On 09/02/16 12:13, Paolo Bonzini wrote:
>> From: Stefan Hajnoczi <address@hidden>
>>
>> Although accesses to ram_list.dirty_memory[] use atomics so multiple
>> threads can safely dirty the bitmap, the data structure is not fully
>> thread-safe yet.
>>
>> This patch handles the RAM hotplug case where ram_list.dirty_memory[] is
>> grown.  ram_list.dirty_memory[] is change from a regular bitmap to an
>> RCU array of pointers to fixed-size bitmap blocks.  Threads can continue
>> accessing bitmap blocks while the array is being extended.  See the
>> comments in the code for an in-depth explanation of struct
>> DirtyMemoryBlocks.
>>
>> I have tested that live migration with virtio-blk dataplane works.
>>
>> Signed-off-by: Stefan Hajnoczi <address@hidden>
>> Message-Id: <address@hidden>
>> Signed-off-by: Paolo Bonzini <address@hidden>
>> ---
>>  exec.c                  |  75 +++++++++++++++----
>>  include/exec/ram_addr.h | 189 ++++++++++++++++++++++++++++++++++++++++++------
>>  migration/ram.c         |   4 -
>>  3 files changed, 225 insertions(+), 43 deletions(-)
>>
>> diff --git a/exec.c b/exec.c
>> index ab37360..7d67c11 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -980,8 +980,9 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
>>                                                ram_addr_t length,
>>                                                unsigned client)
>>  {
>> +    DirtyMemoryBlocks *blocks;
>>      unsigned long end, page;
>> -    bool dirty;
>> +    bool dirty = false;
>>  
>>      if (length == 0) {
>>          return false;
>> @@ -989,8 +990,22 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
>>  
>>      end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
>>      page = start >> TARGET_PAGE_BITS;
>> -    dirty = bitmap_test_and_clear_atomic(ram_list.dirty_memory[client],
>> -                                         page, end - page);
>> +
>> +    rcu_read_lock();
>> +
>> +    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
>> +
>> +    while (page < end) {
>> +        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
>> +
>> +        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
>> +                                              offset, num);
>> +        page += num;
>> +    }
>> +
>> +    rcu_read_unlock();
>>  
>>      if (dirty && tcg_enabled()) {
>>          tlb_reset_dirty_range_all(start, length);
>> @@ -1504,6 +1519,47 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
>>      return 0;
>>  }
>>  
>> +/* Called with ram_list.mutex held */
>> +static void dirty_memory_extend(ram_addr_t old_ram_size,
>> +                                ram_addr_t new_ram_size)
>> +{
>> +    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
>> +                                             DIRTY_MEMORY_BLOCK_SIZE);
>> +    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
>> +                                             DIRTY_MEMORY_BLOCK_SIZE);
>> +    int i;
>> +
>> +    /* Only need to extend if block count increased */
>> +    if (new_num_blocks <= old_num_blocks) {
>> +        return;
>> +    }
>> +
>> +    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
>> +        DirtyMemoryBlocks *old_blocks;
>> +        DirtyMemoryBlocks *new_blocks;
>> +        int j;
>> +
>> +        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
>> +        new_blocks = g_malloc(sizeof(*new_blocks) +
>> +                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
>> +
>> +        if (old_num_blocks) {
>> +            memcpy(new_blocks->blocks, old_blocks->blocks,
>> +                   old_num_blocks * sizeof(old_blocks->blocks[0]));
>> +        }
>> +
>> +        for (j = old_num_blocks; j < new_num_blocks; j++) {
>> +            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
>> +        }
>> +
>> +        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
>> +
>> +        if (old_blocks) {
>> +            g_free_rcu(old_blocks, rcu);
>> +        }
>> +    }
>> +}
>> +
>>  static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>>  {
>>      RAMBlock *block;
>> @@ -1543,6 +1599,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>>                (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
>>      if (new_ram_size > old_ram_size) {
>>          migration_bitmap_extend(old_ram_size, new_ram_size);
>> +        dirty_memory_extend(old_ram_size, new_ram_size);
>>      }
>>      /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
>>       * QLIST (which has an RCU-friendly variant) does not have insertion at
>> @@ -1568,18 +1625,6 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
>>      ram_list.version++;
>>      qemu_mutex_unlock_ramlist();
>>  
>> -    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
>> -
>> -    if (new_ram_size > old_ram_size) {
>> -        int i;
>> -
>> -        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
>> -        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
>> -            ram_list.dirty_memory[i] =
>> -                bitmap_zero_extend(ram_list.dirty_memory[i],
>> -                                   old_ram_size, new_ram_size);
>> -       }
>> -    }
>>      cpu_physical_memory_set_dirty_range(new_block->offset,
>>                                          new_block->used_length,
>>                                          DIRTY_CLIENTS_ALL);
>> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
>> index f2e872d..b1413a1 100644
>> --- a/include/exec/ram_addr.h
>> +++ b/include/exec/ram_addr.h
>> @@ -49,13 +49,43 @@ static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
>>      return (char *)block->host + offset;
>>  }
>>  
>> +/* The dirty memory bitmap is split into fixed-size blocks to allow growth
>> + * under RCU.  The bitmap for a block can be accessed as follows:
>> + *
>> + *   rcu_read_lock();
>> + *
>> + *   DirtyMemoryBlocks *blocks =
>> + *       atomic_rcu_read(&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]);
>> + *
>> + *   ram_addr_t idx = (addr >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
>> + *   unsigned long *block = blocks.blocks[idx];
>> + *   ...access block bitmap...
>> + *
>> + *   rcu_read_unlock();
>> + *
>> + * Remember to check for the end of the block when accessing a range of
>> + * addresses.  Move on to the next block if you reach the end.
>> + *
>> + * Organization into blocks allows dirty memory to grow (but not shrink) under
>> + * RCU.  When adding new RAMBlocks requires the dirty memory to grow, a new
>> + * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
>> + * the same.  Other threads can safely access existing blocks while dirty
>> + * memory is being grown.  When no threads are using the old DirtyMemoryBlocks
>> + * anymore it is freed by RCU (but the underlying blocks stay because they are
>> + * pointed to from the new DirtyMemoryBlocks).
>> + */
>> +#define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
>> +typedef struct {
>> +    struct rcu_head rcu;
>> +    unsigned long *blocks[];
>> +} DirtyMemoryBlocks;
>> +
>>  typedef struct RAMList {
>>      QemuMutex mutex;
>> -    /* Protected by the iothread lock.  */
>> -    unsigned long *dirty_memory[DIRTY_MEMORY_NUM];
>>      RAMBlock *mru_block;
>>      /* RCU-enabled, writes protected by the ramlist lock. */
>>      QLIST_HEAD(, RAMBlock) blocks;
>> +    DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
>>      uint32_t version;
>>  } RAMList;
>>  extern RAMList ram_list;
>> @@ -89,30 +119,70 @@ static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
>>                                                   ram_addr_t length,
>>                                                   unsigned client)
>>  {
>> -    unsigned long end, page, next;
>> +    DirtyMemoryBlocks *blocks;
>> +    unsigned long end, page;
>> +    bool dirty = false;
>>  
>>      assert(client < DIRTY_MEMORY_NUM);
>>  
>>      end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
>>      page = start >> TARGET_PAGE_BITS;
>> -    next = find_next_bit(ram_list.dirty_memory[client], end, page);
>>  
>> -    return next < end;
>> +    rcu_read_lock();
>> +
>> +    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
>> +
>> +    while (page < end) {
>> +        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
>> +
>> +        if (find_next_bit(blocks->blocks[idx], offset, num) < num) {
>> +            dirty = true;
>> +            break;
>> +        }
>> +
>> +        page += num;
>> +    }
>> +
>> +    rcu_read_unlock();
>> +
>> +    return dirty;
>>  }
>>  
>>  static inline bool cpu_physical_memory_all_dirty(ram_addr_t start,
>>                                                   ram_addr_t length,
>>                                                   unsigned client)
>>  {
>> -    unsigned long end, page, next;
>> +    DirtyMemoryBlocks *blocks;
>> +    unsigned long end, page;
>> +    bool dirty = true;
>>  
>>      assert(client < DIRTY_MEMORY_NUM);
>>  
>>      end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
>>      page = start >> TARGET_PAGE_BITS;
>> -    next = find_next_zero_bit(ram_list.dirty_memory[client], end, page);
>>  
>> -    return next >= end;
>> +    rcu_read_lock();
>> +
>> +    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
>> +
>> +    while (page < end) {
>> +        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
>> +
>> +        if (find_next_zero_bit(blocks->blocks[idx], offset, num) < num) {
>> +            dirty = false;
>> +            break;
>> +        }
>> +
>> +        page += num;
>> +    }
>> +
>> +    rcu_read_unlock();
>> +
>> +    return dirty;
>>  }
>>  
>>  static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
>> @@ -154,16 +224,31 @@ static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
>>  static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr,
>>                                                        unsigned client)
>>  {
>> +    unsigned long page, idx, offset;
>> +    DirtyMemoryBlocks *blocks;
>> +
>>      assert(client < DIRTY_MEMORY_NUM);
>> -    set_bit_atomic(addr >> TARGET_PAGE_BITS, ram_list.dirty_memory[client]);
>> +
>> +    page = addr >> TARGET_PAGE_BITS;
>> +    idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>> +    offset = page % DIRTY_MEMORY_BLOCK_SIZE;
>> +
>> +    rcu_read_lock();
>> +
>> +    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
>> +
>> +    set_bit_atomic(offset, blocks->blocks[idx]);
>> +
>> +    rcu_read_unlock();
>>  }
>>  
>>  static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
>>                                                         ram_addr_t length,
>>                                                         uint8_t mask)
>>  {
>> +    DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
>>      unsigned long end, page;
>> -    unsigned long **d = ram_list.dirty_memory;
>> +    int i;
>>  
>>      if (!mask && !xen_enabled()) {
>>          return;
>> @@ -171,15 +256,36 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
>>  
>>      end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
>>      page = start >> TARGET_PAGE_BITS;
>> -    if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
>> -        bitmap_set_atomic(d[DIRTY_MEMORY_MIGRATION], page, end - page);
>> -    }
>> -    if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
>> -        bitmap_set_atomic(d[DIRTY_MEMORY_VGA], page, end - page);
>> +
>> +    rcu_read_lock();
>> +
>> +    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
>> +        blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i]);
>>      }
>> -    if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
>> -        bitmap_set_atomic(d[DIRTY_MEMORY_CODE], page, end - page);
>> +
>> +    while (page < end) {
>> +        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
>> +
>> +        if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
>> +            bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
>> +                              offset, num);
>> +        }
>> +        if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
>> +            bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
>> +                              offset, num);
>> +        }
>> +        if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
>> +            bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
>> +                              offset, num);
>> +        }
>> +
>> +        page += num;
>>      }
>> +
>> +    rcu_read_unlock();
>> +
>>      xen_modified_memory(start, length);
>>  }
>>  
>> @@ -199,21 +305,41 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
>>      /* start address is aligned at the start of a word? */
>>      if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
>>          (hpratio == 1)) {
>> +        unsigned long **blocks[DIRTY_MEMORY_NUM];
>> +        unsigned long idx;
>> +        unsigned long offset;
>>          long k;
>>          long nr = BITS_TO_LONGS(pages);
>>  
>> +        idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
>> +        offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
>> +                          DIRTY_MEMORY_BLOCK_SIZE);
>> +
>> +        rcu_read_lock();
>> +
>> +        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
>> +            blocks[i] = atomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
>> +        }
>> +
>>          for (k = 0; k < nr; k++) {
>>              if (bitmap[k]) {
>>                  unsigned long temp = leul_to_cpu(bitmap[k]);
>> -                unsigned long **d = ram_list.dirty_memory;
>>  
>> -                atomic_or(&d[DIRTY_MEMORY_MIGRATION][page + k], temp);
>> -                atomic_or(&d[DIRTY_MEMORY_VGA][page + k], temp);
>> +                atomic_or(&blocks[DIRTY_MEMORY_MIGRATION][idx][offset], temp);
>> +                atomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
>>                  if (tcg_enabled()) {
>> -                    atomic_or(&d[DIRTY_MEMORY_CODE][page + k], temp);
>> +                    atomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], temp);
>>                  }
>>              }
>> +
>> +            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
>> +                offset = 0;
>> +                idx++;
>> +            }
>>          }
>> +
>> +        rcu_read_unlock();
>> +
>>          xen_modified_memory(start, pages << TARGET_PAGE_BITS);
>>      } else {
>>          uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
>> @@ -265,18 +391,33 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(unsigned long *dest,
>>      if (((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) {
>>          int k;
>>          int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
>> -        unsigned long *src = ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION];
>> +        unsigned long * const *src;
>> +        unsigned long idx = (page * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
>> +        unsigned long offset = BIT_WORD((page * BITS_PER_LONG) %
>> +                                        DIRTY_MEMORY_BLOCK_SIZE);
>> +
>> +        rcu_read_lock();
>> +
>> +        src = atomic_rcu_read(
>> +                &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;
>>  
>>          for (k = page; k < page + nr; k++) {
>> -            if (src[k]) {
>> -                unsigned long bits = atomic_xchg(&src[k], 0);
>> +            if (src[idx][offset]) {
>> +                unsigned long bits = atomic_xchg(&src[idx][offset], 0);
>>                  unsigned long new_dirty;
>>                  new_dirty = ~dest[k];
>>                  dest[k] |= bits;
>>                  new_dirty &= bits;
>>                  num_dirty += ctpopl(new_dirty);
>>              }
>> +
>> +            if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
>> +                offset = 0;
>> +                idx++;
>> +            }
>>          }
>> +
>> +        rcu_read_unlock();
>>      } else {
>>          for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
>>              if (cpu_physical_memory_test_and_clear_dirty(
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 3cdfea4..96c749f 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -609,7 +609,6 @@ static void migration_bitmap_sync_init(void)
>>      iterations_prev = 0;
>>  }
>>  
>> -/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
>>  static void migration_bitmap_sync(void)
>>  {
>>      RAMBlock *block;
>> @@ -1921,8 +1920,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>          acct_clear();
>>      }
>>  
>> -    /* iothread lock needed for ram_list.dirty_memory[] */
>> -    qemu_mutex_lock_iothread();
>>      qemu_mutex_lock_ramlist();
>>      rcu_read_lock();
>>      bytes_transferred = 0;
>> @@ -1947,7 +1944,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>      memory_global_dirty_log_start();
>>      migration_bitmap_sync();
>>      qemu_mutex_unlock_ramlist();
>> -    qemu_mutex_unlock_iothread();
>>  
>>      qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
>>  
>>
> 


