qemu-block

Re: [PATCH 2/2] qcow2: improve savevm performance


From: Denis V. Lunev
Subject: Re: [PATCH 2/2] qcow2: improve savevm performance
Date: Thu, 11 Jun 2020 11:25:35 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.8.0

On 6/11/20 11:04 AM, Vladimir Sementsov-Ogievskiy wrote:
> 10.06.2020 22:00, Denis V. Lunev wrote:
>> This patch does 2 standard basic things:
>> - it creates an intermediate buffer for all writes from the QEMU migration
>>    code to the QCOW2 image,
>> - this buffer is sent to disk asynchronously, allowing several writes to
>>    run in parallel.
>>
>> In general, migration code is fantastically inefficient (by observation):
>> buffers are not aligned and are sent in arbitrary pieces, often less than
>> 100 bytes at a chunk, which results in read-modify-write operations when
>> non-cached I/O is used. It should also be noted that all writes go to
>> unallocated image blocks, which additionally suffer from partial writes
>> to such new clusters.
>>
>> Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
>>                  original     fixed
>> cached:          1.79s       1.27s
>> non-cached:      3.29s       0.81s
>>
>> The difference over HDD would be more significant :)
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>
> If I follow correctly, you make qcow2_save_vmstate implicitly asynchronous:
> it may return immediately after creating a task, and the task executes in
> parallel.
>
> I think the block layer is unprepared for such behavior; it relies on the
> fact that .bdrv_save_vmstate is synchronous.
>
> For example, look at bdrv_co_rw_vmstate(). It calls drv->bdrv_save_vmstate
> inside a pair of bdrv_inc_in_flight()/bdrv_dec_in_flight(). It means that
> with this patch we may break a drained section.
>
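For reference, bdrv_co_rw_vmstate() in block/io.c of that period looks
roughly like the following (a paraphrased sketch, so details may differ
from the exact source):

    static int coroutine_fn
    bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                       bool is_read)
    {
        BlockDriver *drv = bs->drv;
        int ret = -ENOTSUP;

        bdrv_inc_in_flight(bs);

        if (!drv) {
            ret = -ENOMEDIUM;
        } else if (drv->bdrv_load_vmstate) {
            /* The generic layer assumes the driver call is complete when it
             * returns, i.e. that .bdrv_save_vmstate is synchronous. */
            ret = is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
                          : drv->bdrv_save_vmstate(bs, qiov, pos);
        } else if (bs->file) {
            ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
        }

        bdrv_dec_in_flight(bs);
        return ret;
    }

With an implicitly asynchronous .bdrv_save_vmstate, bdrv_dec_in_flight()
can run while a background task is still writing, which is the concern
raised above.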
Drained sections are not involved in the equation - the guest
is already stopped at that point.

If we are talking about the in_flight count, it should not be a problem
even if the guest is running. We could just put the inc/dec pair into
qcow2_co_vmstate_task_entry().
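A minimal sketch of that variant, assuming the task entry from this patch
is simply bracketed by the in-flight accounting (illustrative only, not a
tested change):

    static coroutine_fn int qcow2_co_vmstate_task_entry(AioTask *task)
    {
        int err = 0;
        Qcow2VMStateTask *t = container_of(task, Qcow2VMStateTask, task);

        /* Account the request for the whole lifetime of the background
         * write, so that a drain waits for it. */
        bdrv_inc_in_flight(t->bs);

        if (t->bytes != 0) {
            QEMUIOVector local_qiov;
            qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes);
            err = t->bs->drv->bdrv_co_pwritev_part(t->bs, t->offset, t->bytes,
                                                   &local_qiov, 0, 0);
        }

        qemu_vfree(t->buf);
        bdrv_dec_in_flight(t->bs);
        return err;
    }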

> Next, it's a kind of cache for the vmstate-write operation. It seems to me
> that it's not directly related to qcow2. So, we can implement it in the
> generic block layer, where we can handle in_flight requests. Can we keep
> the .bdrv_save_vmstate handlers of format drivers as is, keep them
> synchronous, but instead change the generic interface to be (optionally?)
> cached?

This has been discussed already in the previous thread. .bdrv_save_vmstate
is implemented in QCOW2 and sheepdog only, thus other formats are
mostly not interested.

>
>> ---
>>   block/qcow2.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>   block/qcow2.h |   4 ++
>>   2 files changed, 113 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 0cd2e6757e..e6232f32e2 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -4797,11 +4797,43 @@ static int qcow2_make_empty(BlockDriverState *bs)
>>       return ret;
>>   }
>>   +
>> +typedef struct Qcow2VMStateTask {
>> +    AioTask task;
>> +
>> +    BlockDriverState *bs;
>> +    int64_t offset;
>> +    void *buf;
>> +    size_t bytes;
>> +} Qcow2VMStateTask;
>> +
>> +typedef struct Qcow2SaveVMState {
>> +    AioTaskPool *pool;
>> +    Qcow2VMStateTask *t;
>> +} Qcow2SaveVMState;
>> +
>>   static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
>>   {
>>       BDRVQcow2State *s = bs->opaque;
>> +    Qcow2SaveVMState *state = s->savevm_state;
>>       int ret;
>>   +    if (state != NULL) {
>> +        aio_task_pool_start_task(state->pool, &state->t->task);
>> +
>> +        aio_task_pool_wait_all(state->pool);
>> +        ret = aio_task_pool_status(state->pool);
>> +
>> +        aio_task_pool_free(state->pool);
>> +        g_free(state);
>> +
>> +        s->savevm_state = NULL;
>> +
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>>       qemu_co_mutex_lock(&s->lock);
>>       ret = qcow2_write_caches(bs);
>>       qemu_co_mutex_unlock(&s->lock);
>> @@ -5098,14 +5130,89 @@ static int qcow2_has_zero_init(BlockDriverState *bs)
>>       }
>>   }
>>   +
>> +static coroutine_fn int qcow2_co_vmstate_task_entry(AioTask *task)
>> +{
>> +    int err = 0;
>> +    Qcow2VMStateTask *t = container_of(task, Qcow2VMStateTask, task);
>> +
>> +    if (t->bytes != 0) {
>> +        QEMUIOVector local_qiov;
>> +        qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes);
>> +        err = t->bs->drv->bdrv_co_pwritev_part(t->bs, t->offset, t->bytes,
>> +                                               &local_qiov, 0, 0);
>> +    }
>> +
>> +    qemu_vfree(t->buf);
>> +    return err;
>> +}
>> +
>> +static Qcow2VMStateTask *qcow2_vmstate_task_create(BlockDriverState *bs,
>> +                                                    int64_t pos, size_t size)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    Qcow2VMStateTask *t = g_new(Qcow2VMStateTask, 1);
>> +
>> +    *t = (Qcow2VMStateTask) {
>> +        .task.func = qcow2_co_vmstate_task_entry,
>> +        .buf = qemu_blockalign(bs, size),
>> +        .offset = qcow2_vm_state_offset(s) + pos,
>> +        .bs = bs,
>> +    };
>> +
>> +    return t;
>> +}
>> +
>>   static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>>                                 int64_t pos)
>>   {
>>       BDRVQcow2State *s = bs->opaque;
>> +    Qcow2SaveVMState *state = s->savevm_state;
>> +    Qcow2VMStateTask *t;
>> +    size_t buf_size = MAX(s->cluster_size, 1 * MiB);
>> +    size_t to_copy;
>> +    size_t off;
>>         BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
>> -    return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos,
>> -                                         qiov->size, qiov, 0, 0);
>> +
>> +    if (state == NULL) {
>> +        state = g_new(Qcow2SaveVMState, 1);
>> +        *state = (Qcow2SaveVMState) {
>> +            .pool = aio_task_pool_new(QCOW2_MAX_WORKERS),
>> +            .t = qcow2_vmstate_task_create(bs, pos, buf_size),
>> +        };
>> +
>> +        s->savevm_state = state;
>> +    }
>> +
>> +    if (aio_task_pool_status(state->pool) != 0) {
>> +        return aio_task_pool_status(state->pool);
>> +    }
>> +
>> +    t = state->t;
>> +    if (t->offset + t->bytes != qcow2_vm_state_offset(s) + pos) {
>> +        /* Normally this branch is not reachable from migration */
>> +        return bs->drv->bdrv_co_pwritev_part(bs,
>> +                qcow2_vm_state_offset(s) + pos, qiov->size, qiov, 0, 0);
>> +    }
>> +
>> +    off = 0;
>> +    while (1) {
>> +        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
>> +        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
>> +        t->bytes += to_copy;
>> +        if (t->bytes < buf_size) {
>> +            return 0;
>> +        }
>> +
>> +        aio_task_pool_start_task(state->pool, &t->task);
>> +
>> +        pos += to_copy;
>> +        off += to_copy;
>> +        state->t = t = qcow2_vmstate_task_create(bs, pos, buf_size);
>> +    }
>> +
>> +    return 0;
>>   }
>>   static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index 7ce2c23bdb..146cfed739 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -291,6 +291,8 @@ typedef struct Qcow2BitmapHeaderExt {
>>     #define QCOW2_MAX_THREADS 4
>>   +typedef struct Qcow2SaveVMState Qcow2SaveVMState;
>> +
>>   typedef struct BDRVQcow2State {
>>       int cluster_bits;
>>       int cluster_size;
>> @@ -384,6 +386,8 @@ typedef struct BDRVQcow2State {
>>        * is to convert the image with the desired compression type set.
>>        */
>>       Qcow2CompressionType compression_type;
>> +
>> +    Qcow2SaveVMState *savevm_state;
>>   } BDRVQcow2State;
>>     typedef struct Qcow2COWRegion {
>>
>
>



