[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [Qemu-block] [PATCH 3/3] block: mirror - zero unallocat
From: |
Max Reitz |
Subject: |
Re: [Qemu-devel] [Qemu-block] [PATCH 3/3] block: mirror - zero unallocated target sectors when zero init not present |
Date: |
Mon, 28 Sep 2015 19:32:00 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.2.0 |
On 28.09.2015 05:29, Jeff Cody wrote:
> During mirror, if the target device does not have support zero
> initialization, a mirror may result in a corrupt image.
>
> For instance, on mirror to a host device with format = raw, whatever
> random data is on the target device will still be there for unallocated
> sectors.
>
> This is because during the mirror, we set the dirty bitmap to copy only
> sectors allocated above 'base'. In the case of target devices where we
> cannot assume unallocated sectors will be read as zeroes, we need to
> explicitely zero out this data.
>
> In order to avoid zeroing out all sectors of the target device prior to
> mirroring, we do zeroing as part of the block job. A second dirty
> bitmap cache is created, to track sectors that are unallocated above
> 'base'. These sectors are then checked for status of BDRV_BLOCK_ZERO
> on the target - if they are not, then zeroes are explicitly written.
>
> This only occurs under two conditions:
>
> 1. 'mode' != "existing"
> 2. bdrv_has_zero_init(target) == NULL
>
> We perform the mirroring through mirror_iteration() as before, except
> in two passes. If the above two conditions are met, the first pass
> is using the bitmap tracking unallocated sectors, to write the needed
> zeroes. Then, the second pass is performed, to mirror the actual data
> as before.
>
> If the above two conditions are not met, then the first pass is skipped,
> and only the second pass (the one with the actual data) is performed.
>
> Signed-off-by: Jeff Cody <address@hidden>
> ---
> block/mirror.c | 109
> ++++++++++++++++++++++++++++++++++------------
> blockdev.c | 2 +-
> include/block/block_int.h | 3 +-
> qapi/block-core.json | 6 ++-
> 4 files changed, 87 insertions(+), 33 deletions(-)
>
> diff --git a/block/mirror.c b/block/mirror.c
> index 405e5c4..b599176 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -50,7 +50,9 @@ typedef struct MirrorBlockJob {
> int64_t bdev_length;
> unsigned long *cow_bitmap;
> BdrvDirtyBitmap *dirty_bitmap;
> - HBitmapIter hbi;
> + HBitmapIter zero_hbi;
> + HBitmapIter allocated_hbi;
> + HBitmapIter *hbi;
> uint8_t *buf;
> QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
> int buf_free_count;
> @@ -60,6 +62,8 @@ typedef struct MirrorBlockJob {
> int sectors_in_flight;
> int ret;
> bool unmap;
> + bool zero_unallocated;
> + bool zero_cycle;
> bool waiting_for_io;
> } MirrorBlockJob;
>
> @@ -166,10 +170,10 @@ static uint64_t coroutine_fn
> mirror_iteration(MirrorBlockJob *s)
> int pnum;
> int64_t ret;
>
> - s->sector_num = hbitmap_iter_next(&s->hbi);
> + s->sector_num = hbitmap_iter_next(s->hbi);
> if (s->sector_num < 0) {
> - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> - s->sector_num = hbitmap_iter_next(&s->hbi);
> + bdrv_dirty_iter_init(s->dirty_bitmap, s->hbi);
> + s->sector_num = hbitmap_iter_next(s->hbi);
> trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
> assert(s->sector_num >= 0);
> }
> @@ -287,7 +291,7 @@ static uint64_t coroutine_fn
> mirror_iteration(MirrorBlockJob *s)
> */
> if (next_sector > hbitmap_next_sector
> && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
> - hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
> + hbitmap_next_sector = hbitmap_iter_next(s->hbi);
> }
>
> next_sector += sectors_per_chunk;
> @@ -300,25 +304,34 @@ static uint64_t coroutine_fn
> mirror_iteration(MirrorBlockJob *s)
> s->sectors_in_flight += nb_sectors;
> trace_mirror_one_iteration(s, sector_num, nb_sectors);
>
> - ret = bdrv_get_block_status_above(source, NULL, sector_num,
> - nb_sectors, &pnum);
> - if (ret < 0 || pnum < nb_sectors ||
> - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> - mirror_read_complete, op);
> - } else if (ret & BDRV_BLOCK_ZERO) {
> - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> - s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> - mirror_write_complete, op);
> + if (s->zero_cycle) {
> + ret = bdrv_get_block_status(s->target, sector_num, nb_sectors,
> &pnum);
> + if (!(ret & BDRV_BLOCK_ZERO)) {
> + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> + s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> + mirror_write_complete, op);
> + }
> } else {
> - assert(!(ret & BDRV_BLOCK_DATA));
> - bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> - mirror_write_complete, op);
> + ret = bdrv_get_block_status_above(source, NULL, sector_num,
> + nb_sectors, &pnum);
> + if (ret < 0 || pnum < nb_sectors ||
> + (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> + mirror_read_complete, op);
> + } else if (ret & BDRV_BLOCK_ZERO) {
> + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> + s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> + mirror_write_complete, op);
> + } else {
> + assert(!(ret & BDRV_BLOCK_DATA));
> + bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> + mirror_write_complete, op);
> + }
> }
> return delay_ns;
> }
>
> -static int mirror_do_iteration(MirrorBlockJob *s, uint64_t last_pause_ns)
> +static int mirror_do_iteration(MirrorBlockJob *s, uint64_t *last_pause_ns)
> {
> int ret;
>
> @@ -347,7 +360,7 @@ static int mirror_do_iteration(MirrorBlockJob *s,
> uint64_t last_pause_ns)
> * We do so every SLICE_TIME nanoseconds, or when there is an error,
> * or when the source is clean, whichever comes first.
> */
> - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns <
> SLICE_TIME
> + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - *last_pause_ns <
> SLICE_TIME
> && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
> if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
> (cnt == 0 && s->in_flight > 0)) {
> @@ -371,6 +384,14 @@ static int mirror_do_iteration(MirrorBlockJob *s,
> uint64_t last_pause_ns)
> goto immediate_exit;
> }
> } else {
> +
> + if (s->zero_cycle) {
> + /* this is not the end of the streaming cycle,
> + * if we are just filling in zeroes for unallocated
> + * sectors prior to streaming the real data */
> + goto immediate_exit;
> + }
> +
> /* We're out of the streaming phase. From now on, if the job
> * is cancelled we will actually complete all pending I/O and
> * report completion. This way, block-job-cancel will leave
> @@ -419,7 +440,7 @@ static int mirror_do_iteration(MirrorBlockJob *s,
> uint64_t last_pause_ns)
> s->common.cancelled = false;
> break;
> }
> - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> + *last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> }
>
> immediate_exit:
> @@ -511,6 +532,15 @@ static void coroutine_fn mirror_run(void *opaque)
> checking for a NULL string */
> int ret = 0;
> int n;
> + BdrvDirtyBitmap *zero_dirty_bitmap;
> + BdrvDirtyBitmap *allocated_dirty_bitmap = s->dirty_bitmap;
> +
> + zero_dirty_bitmap = bdrv_create_dirty_bitmap(s->target,
> + s->granularity, NULL, true,
> + NULL);
> + if (zero_dirty_bitmap == NULL) {
> + goto immediate_exit;
> + }
I think I'd like the error to be reported to the user; but in any case,
you have to set ret to a negative value.
>
> if (block_job_is_cancelled(&s->common)) {
> goto immediate_exit;
> @@ -588,14 +618,33 @@ static void coroutine_fn mirror_run(void *opaque)
> assert(n > 0);
> if (ret == 1) {
> bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
> + } else if (s->zero_unallocated) {
> + bdrv_set_dirty_bitmap(zero_dirty_bitmap, sector_num, n);
> }
> sector_num += n;
> }
> }
>
> - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> + bdrv_dirty_iter_init(s->dirty_bitmap, &s->allocated_hbi);
>
> - ret = mirror_do_iteration(s, last_pause_ns);
> + if (s->zero_unallocated) {
> + bdrv_dirty_iter_init(zero_dirty_bitmap, &s->zero_hbi);
> + s->dirty_bitmap = zero_dirty_bitmap;
> + s->hbi = &s->zero_hbi;
> +
> + s->zero_cycle = true;
> + ret = mirror_do_iteration(s, &last_pause_ns);
> + if (ret < 0) {
> + goto immediate_exit;
> + }
> +
> + mirror_drain(s);
> + s->zero_cycle = false;
> + }
> +
> + s->dirty_bitmap = allocated_dirty_bitmap;
> + s->hbi = &s->allocated_hbi;
> + ret = mirror_do_iteration(s, &last_pause_ns);
>
> immediate_exit:
> if (s->in_flight > 0) {
> @@ -611,7 +660,8 @@ immediate_exit:
> qemu_vfree(s->buf);
> g_free(s->cow_bitmap);
> g_free(s->in_flight_bitmap);
> - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
> + bdrv_release_dirty_bitmap(bs, allocated_dirty_bitmap);
> + bdrv_release_dirty_bitmap(NULL, zero_dirty_bitmap);
> bdrv_iostatus_disable(s->target);
>
> data = g_malloc(sizeof(*data));
> @@ -702,7 +752,7 @@ static void mirror_start_job(BlockDriverState *bs,
> BlockDriverState *target,
> int64_t buf_size,
> BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp,
> const BlockJobDriver *driver,
> @@ -737,6 +787,7 @@ static void mirror_start_job(BlockDriverState *bs,
> BlockDriverState *target,
> return;
> }
>
> + s->zero_unallocated = !existing && !bdrv_has_zero_init(target);
I think this should be set only if we're doing a full mirror operation.
For instance, I could do a none, top or incremental mirror to a new
qcow2 file, which would give it a backing file, obviously. You're lucky
in that qcow2 claims to always have zero initialization, when this is in
fact not true (someone's ought to fix that...): With a backing file, an
overlay file just cannot have zero initialization, it's impossible
(well, unless the backing file is completely zero).
So if qcow2 were to answer correctly, i.e. "No, with a backing file I do
not have zero init", then this would overwrite all sectors which are
supposed to be unallocated because they are present in the backing file.
> s->replaces = g_strdup(replaces);
> s->on_source_error = on_source_error;
> s->on_target_error = on_target_error;
> @@ -767,7 +818,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> int64_t speed, uint32_t granularity, int64_t buf_size,
> MirrorSyncMode mode, BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp)
> {
> @@ -782,8 +833,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
> mirror_start_job(bs, target, replaces,
> speed, granularity, buf_size,
> - on_source_error, on_target_error, unmap, cb, opaque,
> errp,
> - &mirror_job_driver, is_none_mode, base);
> + on_source_error, on_target_error, unmap, existing, cb,
> + opaque, errp, &mirror_job_driver, is_none_mode, base);
> }
>
> void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
> @@ -830,7 +881,7 @@ void commit_active_start(BlockDriverState *bs,
> BlockDriverState *base,
>
> bdrv_ref(base);
> mirror_start_job(bs, base, NULL, speed, 0, 0,
> - on_error, on_error, false, cb, opaque, &local_err,
> + on_error, on_error, false, false, cb, opaque,
> &local_err,
This should probably be true; the commit target is already existing,
after all. Also, without it being true, iotest 097 fails.
> &commit_active_job_driver, false, base);
> if (local_err) {
> error_propagate(errp, local_err);
> diff --git a/blockdev.c b/blockdev.c
> index cb9f78d..c06ac60 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2816,7 +2816,7 @@ void qmp_drive_mirror(const char *device, const char
> *target,
> has_replaces ? replaces : NULL,
> speed, granularity, buf_size, sync,
> on_source_error, on_target_error,
> - unmap,
> + unmap, mode == NEW_IMAGE_MODE_EXISTING,
> block_job_cb, bs, &local_err);
> if (local_err != NULL) {
> bdrv_unref(target_bs);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 14ad4c3..21a8988 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -614,6 +614,7 @@ void commit_active_start(BlockDriverState *bs,
> BlockDriverState *base,
> * @on_source_error: The action to take upon error reading from the source.
> * @on_target_error: The action to take upon error writing to the target.
> * @unmap: Whether to unmap target where source sectors only contain zeroes.
> + * @existing: Whether target image is an existing image prior to the QMP cmd.
> * @cb: Completion function for the job.
> * @opaque: Opaque pointer value passed to @cb.
> * @errp: Error object.
> @@ -628,7 +629,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> int64_t speed, uint32_t granularity, int64_t buf_size,
> MirrorSyncMode mode, BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp);
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index bb2189e..033afb4 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -952,8 +952,10 @@
> # broken Quorum files. (Since 2.1)
> #
> # @mode: #optional whether and how QEMU should create a new image, default is
> -# 'absolute-paths'.
> -#
This empty line should stay.
> +# 'absolute-paths'. If mode != 'existing', and the target does not
> +# have zero init (sparseness), then the target image will have
> sectors
> +# zeroed out that correspond to sectors in an unallocated state in
> the
> +# source image.
As I said above, this should only happen if @sync == 'full'.
Max
> # @speed: #optional the maximum speed, in bytes per second
> #
> # @sync: what parts of the disk image should be copied to the destination
>
signature.asc
Description: OpenPGP digital signature
- Re: [Qemu-devel] [Qemu-block] [PATCH 2/3] block: mirror - split out part of mirror_run(), (continued)