qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 7/6] block/io: improve loadvm performance


From: Vladimir Sementsov-Ogievskiy
Subject: Re: [PATCH 7/6] block/io: improve loadvm performance
Date: Mon, 22 Jun 2020 21:13:16 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.9.0

22.06.2020 20:17, Denis V. Lunev wrote:
On 6/22/20 8:02 PM, Vladimir Sementsov-Ogievskiy wrote:
22.06.2020 11:33, Denis V. Lunev wrote:
This patch creates an intermediate buffer for reading from the block driver
state and performs read-ahead into this buffer. Snapshot code performs
reads sequentially, and thus we know which offsets will be required
and when they will no longer be needed.

Results are fantastic. Switch-to-snapshot times for a 2GB Fedora 31 VM
over NVMe storage are the following:
                  original     fixed
cached:          1.84s       1.16s
non-cached:     12.74s       1.27s

The difference over HDD would be more significant :)

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Fam Zheng <fam@euphon.net>
CC: Juan Quintela <quintela@redhat.com>
CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
---

   block/io.c                | 225 +++++++++++++++++++++++++++++++++++++-
   include/block/block_int.h |   3 +
   2 files changed, 225 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index 71a696deb7..bb06f750d8 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2739,6 +2739,180 @@ static int
bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
       }
   }
   +
+typedef struct BdrvLoadVMChunk {
+    void *buf;
+    uint64_t offset;
+    ssize_t bytes;
+
+    QLIST_ENTRY(BdrvLoadVMChunk) list;
+} BdrvLoadVMChunk;
+
+typedef struct BdrvLoadVMState {
+    AioTaskPool *pool;
+
+    int64_t offset;
+    int64_t last_loaded;
+
+    int chunk_count;
+    QLIST_HEAD(, BdrvLoadVMChunk) chunks;
+    QLIST_HEAD(, BdrvLoadVMChunk) loading;
+    CoMutex lock;
+    CoQueue waiters;
+} BdrvLoadVMState;
+
+typedef struct BdrvLoadVMStateTask {
+    AioTask task;
+
+    BlockDriverState *bs;
+    BdrvLoadVMChunk *chunk;
+} BdrvLoadVMStateTask;
+
+static BdrvLoadVMChunk *bdrv_co_find_loadvmstate_chunk(int64_t pos,
+
BdrvLoadVMChunk *c)
+{
+    for (; c != NULL; c = QLIST_NEXT(c, list)) {
+        if (c->offset <= pos && c->offset + c->bytes > pos) {
+            return c;
+        }
+    }
+
+    return NULL;
+}
+
+static void bdrv_free_loadvm_chunk(BdrvLoadVMChunk *c)
+{
+    qemu_vfree(c->buf);
+    g_free(c);
+}
+
+static coroutine_fn int bdrv_co_vmstate_load_task_entry(AioTask *task)
+{
+    int err = 0;
+    BdrvLoadVMStateTask *t = container_of(task, BdrvLoadVMStateTask,
task);
+    BdrvLoadVMChunk *c = t->chunk;
+    BdrvLoadVMState *state = t->bs->loadvm_state;
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, c->buf, c->bytes);
+
+    bdrv_inc_in_flight(t->bs);
+    err = t->bs->drv->bdrv_load_vmstate(t->bs, &qiov, c->offset);
+    bdrv_dec_in_flight(t->bs);
+
+    qemu_co_mutex_lock(&state->lock);
+    QLIST_REMOVE(c, list);
+    if (err == 0) {
+        QLIST_INSERT_HEAD(&state->chunks, c, list);
+    } else {
+        bdrv_free_loadvm_chunk(c);
+    }
+    qemu_co_mutex_unlock(&state->lock);
+    qemu_co_queue_restart_all(&state->waiters);
+
+    return err;
+}
+
+static void bdrv_co_start_loadvmstate(BlockDriverState *bs,
+                                      BdrvLoadVMState *state)
+{
+    int i;
+    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
+
+    qemu_co_mutex_assert_locked(&state->lock);
+    for (i = state->chunk_count; i < BDRV_VMSTATE_WORKERS_MAX; i++) {
+        BdrvLoadVMStateTask *t = g_new(BdrvLoadVMStateTask, 1);
+
+        *t = (BdrvLoadVMStateTask) {
+            .task.func = bdrv_co_vmstate_load_task_entry,
+            .bs = bs,
+            .chunk = g_new(BdrvLoadVMChunk, 1),
+        };
+
+        *t->chunk = (BdrvLoadVMChunk) {
+            .buf = qemu_blockalign(bs, buf_size),
+            .offset = state->last_loaded,
+            .bytes = buf_size,
+        };
+        /* FIXME: tail of stream */
+
+        QLIST_INSERT_HEAD(&state->loading, t->chunk, list);
+        state->chunk_count++;
+        state->last_loaded += buf_size;
+
+        qemu_co_mutex_unlock(&state->lock);
+        aio_task_pool_start_task(state->pool, &t->task);

Hmm, so, we unlock the mutex here. Am I missing something, or, if there
are
two load-state requests running in parallel (yes, this shouldn't happen
with the current migration code, but if we don't care at all, the mutex is not
needed),

may we have two such loops in parallel, which will create more than
BDRV_VMSTATE_WORKERS_MAX chunks, as each has its own local "i" variable?

I could stall waiting for a free slot in the pool (technically)
while pool completion needs the lock. This should
NOT happen with the current architecture, but it would be better
to do things correctly, i.e. drop the lock.


+        qemu_co_mutex_lock(&state->lock);
+    }
+}
+
+static int bdrv_co_do_load_vmstate(BlockDriverState *bs,
QEMUIOVector *qiov,
+                                   int64_t pos)
+{
+    BdrvLoadVMState *state = bs->loadvm_state;
+    BdrvLoadVMChunk *c;
+    size_t off;
+
+    if (state == NULL) {
+        if (pos != 0) {
+            /* Normally this branch is not reachable from migration */
+            return bs->drv->bdrv_load_vmstate(bs, qiov, pos);
+        }
+
+        state = g_new(BdrvLoadVMState, 1);
+        *state = (BdrvLoadVMState) {
+            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
+            .chunks = QLIST_HEAD_INITIALIZER(state->chunks),
+            .loading = QLIST_HEAD_INITIALIZER(state->loading),
+        };
+        qemu_co_mutex_init(&state->lock);
+        qemu_co_queue_init(&state->waiters);
+
+        bs->loadvm_state = state;
+    }
+
+    if (state->offset != pos) {
+        /* Normally this branch is not reachable from migration */
+        return bs->drv->bdrv_load_vmstate(bs, qiov, pos);
+    }
+
+    off = 0;
+    qemu_co_mutex_lock(&state->lock);
+    bdrv_co_start_loadvmstate(bs, state);
+
+    while (off < qiov->size && aio_task_pool_status(state->pool) ==
0) {
+        c = bdrv_co_find_loadvmstate_chunk(pos,
QLIST_FIRST(&state->chunks));
+        if (c != NULL) {
+            ssize_t chunk_off = pos - c->offset;
+            ssize_t to_copy = MIN(qiov->size - off, c->bytes -
chunk_off);
+
+            qemu_iovec_from_buf(qiov, off, c->buf + chunk_off,
to_copy);
+
+            off += to_copy;
+            pos += to_copy;
+
+            if (pos == c->offset + c->bytes) {
+                state->chunk_count--;
+                /* End of buffer, discard it from the list */
+                QLIST_REMOVE(c, list);
+                bdrv_free_loadvm_chunk(c);
+            }
+
+            state->offset += to_copy;
+            continue;
+        }
+
+        c = bdrv_co_find_loadvmstate_chunk(pos,
QLIST_FIRST(&state->loading));
+        if (c != NULL) {
+            qemu_co_queue_wait(&state->waiters, &state->lock);
+            continue;
+        }
+
+        bdrv_co_start_loadvmstate(bs, state);

Hmm, so, you assume that if we are in the loop, we'll definitely find
our chunk at some point.
It should work with a normal migration stream, sequentially reading
chunk by chunk. But if not,
it may hang:

Consider two reads at the same position, running in parallel: they both
pass the "state->offset != pos"
check and wait on the mutex. Then the first one will enter the critical
section, find and remove
the corresponding chunk. Then the second will enter the critical section and
loop forever, because
the corresponding chunk will never be generated again.

So it seems we should check (state->offset == pos) on each iteration,
or something like that.

We can. But the reader is synchronous and cannot
issue 2 requests at once. Not a problem.


I think it's OK to rely on the current one-synchronous-reader architecture, but some
related comments/assertions won't hurt.


--
Best regards,
Vladimir



reply via email to

[Prev in Thread] Current Thread [Next in Thread]