qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [Qemu-block] [PATCH 3/3] block: mirror - zero unallocat


From: Max Reitz
Subject: Re: [Qemu-devel] [Qemu-block] [PATCH 3/3] block: mirror - zero unallocated target sectors when zero init not present
Date: Mon, 28 Sep 2015 19:32:00 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.2.0

On 28.09.2015 05:29, Jeff Cody wrote:
> During mirror, if the target device does not have support zero
> initialization, a mirror may result in a corrupt image.
> 
> For instance, on mirror to a host device with format = raw, whatever
> random data is on the target device will still be there for unallocated
> sectors.
> 
> This is because during the mirror, we set the dirty bitmap to copy only
> sectors allocated above 'base'.  In the case of target devices where we
> cannot assume unallocated sectors will be read as zeroes, we need to
> explicitely zero out this data.
> 
> In order to avoid zeroing out all sectors of the target device prior to
> mirroring, we do zeroing as part of the block job.  A second dirty
> bitmap cache is created, to track sectors that are unallocated above
> 'base'.  These sectors are then checked for status of BDRV_BLOCK_ZERO
> on the target - if they are not, then zeroes are explicitly written.
> 
> This only occurs under two conditions:
> 
>     1. 'mode' != "existing"
>     2. bdrv_has_zero_init(target) == NULL
> 
> We perform the mirroring through mirror_iteration() as before, except
> in two passes.  If the above two conditions are met, the first pass
> is using the bitmap tracking unallocated sectors, to write the needed
> zeroes.  Then, the second pass is performed, to mirror the actual data
> as before.
> 
> If the above two conditions are not met, then the first pass is skipped,
> and only the second pass (the one with the actual data) is performed.
> 
> Signed-off-by: Jeff Cody <address@hidden>
> ---
>  block/mirror.c            | 109 
> ++++++++++++++++++++++++++++++++++------------
>  blockdev.c                |   2 +-
>  include/block/block_int.h |   3 +-
>  qapi/block-core.json      |   6 ++-
>  4 files changed, 87 insertions(+), 33 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 405e5c4..b599176 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -50,7 +50,9 @@ typedef struct MirrorBlockJob {
>      int64_t bdev_length;
>      unsigned long *cow_bitmap;
>      BdrvDirtyBitmap *dirty_bitmap;
> -    HBitmapIter hbi;
> +    HBitmapIter zero_hbi;
> +    HBitmapIter allocated_hbi;
> +    HBitmapIter *hbi;
>      uint8_t *buf;
>      QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
>      int buf_free_count;
> @@ -60,6 +62,8 @@ typedef struct MirrorBlockJob {
>      int sectors_in_flight;
>      int ret;
>      bool unmap;
> +    bool zero_unallocated;
> +    bool zero_cycle;
>      bool waiting_for_io;
>  } MirrorBlockJob;
>  
> @@ -166,10 +170,10 @@ static uint64_t coroutine_fn 
> mirror_iteration(MirrorBlockJob *s)
>      int pnum;
>      int64_t ret;
>  
> -    s->sector_num = hbitmap_iter_next(&s->hbi);
> +    s->sector_num = hbitmap_iter_next(s->hbi);
>      if (s->sector_num < 0) {
> -        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> -        s->sector_num = hbitmap_iter_next(&s->hbi);
> +        bdrv_dirty_iter_init(s->dirty_bitmap, s->hbi);
> +        s->sector_num = hbitmap_iter_next(s->hbi);
>          trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
>          assert(s->sector_num >= 0);
>      }
> @@ -287,7 +291,7 @@ static uint64_t coroutine_fn 
> mirror_iteration(MirrorBlockJob *s)
>           */
>          if (next_sector > hbitmap_next_sector
>              && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
> -            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
> +            hbitmap_next_sector = hbitmap_iter_next(s->hbi);
>          }
>  
>          next_sector += sectors_per_chunk;
> @@ -300,25 +304,34 @@ static uint64_t coroutine_fn 
> mirror_iteration(MirrorBlockJob *s)
>      s->sectors_in_flight += nb_sectors;
>      trace_mirror_one_iteration(s, sector_num, nb_sectors);
>  
> -    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> -                                      nb_sectors, &pnum);
> -    if (ret < 0 || pnum < nb_sectors ||
> -            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> -        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> -                       mirror_read_complete, op);
> -    } else if (ret & BDRV_BLOCK_ZERO) {
> -        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> -                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> -                              mirror_write_complete, op);
> +    if (s->zero_cycle) {
> +        ret = bdrv_get_block_status(s->target, sector_num, nb_sectors, 
> &pnum);
> +        if (!(ret & BDRV_BLOCK_ZERO)) {
> +            bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> +                                  s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> +                                  mirror_write_complete, op);
> +        }
>      } else {
> -        assert(!(ret & BDRV_BLOCK_DATA));
> -        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> -                         mirror_write_complete, op);
> +        ret = bdrv_get_block_status_above(source, NULL, sector_num,
> +                                          nb_sectors, &pnum);
> +        if (ret < 0 || pnum < nb_sectors ||
> +                (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> +            bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> +                           mirror_read_complete, op);
> +        } else if (ret & BDRV_BLOCK_ZERO) {
> +            bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> +                                  s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> +                                  mirror_write_complete, op);
> +        } else {
> +            assert(!(ret & BDRV_BLOCK_DATA));
> +            bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> +                             mirror_write_complete, op);
> +        }
>      }
>      return delay_ns;
>  }
>  
> -static int mirror_do_iteration(MirrorBlockJob *s, uint64_t last_pause_ns)
> +static int mirror_do_iteration(MirrorBlockJob *s, uint64_t *last_pause_ns)
>  {
>      int ret;
>  
> @@ -347,7 +360,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, 
> uint64_t last_pause_ns)
>           * We do so every SLICE_TIME nanoseconds, or when there is an error,
>           * or when the source is clean, whichever comes first.
>           */
> -        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < 
> SLICE_TIME
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - *last_pause_ns < 
> SLICE_TIME
>              && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
>              if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
>                  (cnt == 0 && s->in_flight > 0)) {
> @@ -371,6 +384,14 @@ static int mirror_do_iteration(MirrorBlockJob *s, 
> uint64_t last_pause_ns)
>                      goto immediate_exit;
>                  }
>              } else {
> +
> +                if (s->zero_cycle) {
> +                    /* this is not the end of the streaming cycle,
> +                     * if we are just filling in zeroes for unallocated
> +                     * sectors prior to streaming the real data */
> +                    goto immediate_exit;
> +                }
> +
>                  /* We're out of the streaming phase.  From now on, if the job
>                   * is cancelled we will actually complete all pending I/O and
>                   * report completion.  This way, block-job-cancel will leave
> @@ -419,7 +440,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, 
> uint64_t last_pause_ns)
>              s->common.cancelled = false;
>              break;
>          }
> -        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +        *last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>      }
>  
>  immediate_exit:
> @@ -511,6 +532,15 @@ static void coroutine_fn mirror_run(void *opaque)
>                                   checking for a NULL string */
>      int ret = 0;
>      int n;
> +    BdrvDirtyBitmap *zero_dirty_bitmap;
> +    BdrvDirtyBitmap *allocated_dirty_bitmap = s->dirty_bitmap;
> +
> +    zero_dirty_bitmap = bdrv_create_dirty_bitmap(s->target,
> +                                                 s->granularity, NULL, true,
> +                                                 NULL);
> +    if (zero_dirty_bitmap == NULL) {
> +        goto immediate_exit;
> +    }

I think I'd like the error to be reported to the user; but in any case,
you have to set ret to a negative value.

>  
>      if (block_job_is_cancelled(&s->common)) {
>          goto immediate_exit;
> @@ -588,14 +618,33 @@ static void coroutine_fn mirror_run(void *opaque)
>              assert(n > 0);
>              if (ret == 1) {
>                  bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
> +            } else if (s->zero_unallocated) {
> +                bdrv_set_dirty_bitmap(zero_dirty_bitmap, sector_num, n);
>              }
>              sector_num += n;
>          }
>      }
>  
> -    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> +    bdrv_dirty_iter_init(s->dirty_bitmap, &s->allocated_hbi);
>  
> -    ret = mirror_do_iteration(s, last_pause_ns);
> +    if (s->zero_unallocated) {
> +        bdrv_dirty_iter_init(zero_dirty_bitmap, &s->zero_hbi);
> +        s->dirty_bitmap = zero_dirty_bitmap;
> +        s->hbi = &s->zero_hbi;
> +
> +        s->zero_cycle = true;
> +        ret = mirror_do_iteration(s, &last_pause_ns);
> +        if (ret < 0) {
> +            goto immediate_exit;
> +        }
> +
> +        mirror_drain(s);
> +        s->zero_cycle = false;
> +    }
> +
> +    s->dirty_bitmap = allocated_dirty_bitmap;
> +    s->hbi = &s->allocated_hbi;
> +    ret = mirror_do_iteration(s, &last_pause_ns);
>  
>  immediate_exit:
>      if (s->in_flight > 0) {
> @@ -611,7 +660,8 @@ immediate_exit:
>      qemu_vfree(s->buf);
>      g_free(s->cow_bitmap);
>      g_free(s->in_flight_bitmap);
> -    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
> +    bdrv_release_dirty_bitmap(bs, allocated_dirty_bitmap);
> +    bdrv_release_dirty_bitmap(NULL, zero_dirty_bitmap);
>      bdrv_iostatus_disable(s->target);
>  
>      data = g_malloc(sizeof(*data));
> @@ -702,7 +752,7 @@ static void mirror_start_job(BlockDriverState *bs, 
> BlockDriverState *target,
>                               int64_t buf_size,
>                               BlockdevOnError on_source_error,
>                               BlockdevOnError on_target_error,
> -                             bool unmap,
> +                             bool unmap, bool existing,
>                               BlockCompletionFunc *cb,
>                               void *opaque, Error **errp,
>                               const BlockJobDriver *driver,
> @@ -737,6 +787,7 @@ static void mirror_start_job(BlockDriverState *bs, 
> BlockDriverState *target,
>          return;
>      }
>  
> +    s->zero_unallocated = !existing && !bdrv_has_zero_init(target);

I think this should be set only if we're doing a full mirror operation.
For instance, I could do a none, top or incremental mirror to a new
qcow2 file, which would give it a backing file, obviously. You're lucky
in that qcow2 claims to always have zero initialization, when this is in
fact not true (someone's ought to fix that...): With a backing file, an
overlay file just cannot have zero initialization, it's impossible
(well, unless the backing file is completely zero).

So if qcow2 were to answer correctly, i.e. "No, with a backing file I do
not have zero init", then this would overwrite all sectors which are
supposed to be unallocated because they are present in the backing file.

>      s->replaces = g_strdup(replaces);
>      s->on_source_error = on_source_error;
>      s->on_target_error = on_target_error;
> @@ -767,7 +818,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState 
> *target,
>                    int64_t speed, uint32_t granularity, int64_t buf_size,
>                    MirrorSyncMode mode, BlockdevOnError on_source_error,
>                    BlockdevOnError on_target_error,
> -                  bool unmap,
> +                  bool unmap, bool existing,
>                    BlockCompletionFunc *cb,
>                    void *opaque, Error **errp)
>  {
> @@ -782,8 +833,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState 
> *target,
>      base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
>      mirror_start_job(bs, target, replaces,
>                       speed, granularity, buf_size,
> -                     on_source_error, on_target_error, unmap, cb, opaque, 
> errp,
> -                     &mirror_job_driver, is_none_mode, base);
> +                     on_source_error, on_target_error, unmap, existing, cb,
> +                     opaque, errp, &mirror_job_driver, is_none_mode, base);
>  }
>  
>  void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
> @@ -830,7 +881,7 @@ void commit_active_start(BlockDriverState *bs, 
> BlockDriverState *base,
>  
>      bdrv_ref(base);
>      mirror_start_job(bs, base, NULL, speed, 0, 0,
> -                     on_error, on_error, false, cb, opaque, &local_err,
> +                     on_error, on_error, false, false, cb, opaque, 
> &local_err,

This should probably be true; the commit target is already existing,
after all. Also, without it being true, iotest 097 fails.

>                       &commit_active_job_driver, false, base);
>      if (local_err) {
>          error_propagate(errp, local_err);
> diff --git a/blockdev.c b/blockdev.c
> index cb9f78d..c06ac60 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2816,7 +2816,7 @@ void qmp_drive_mirror(const char *device, const char 
> *target,
>                   has_replaces ? replaces : NULL,
>                   speed, granularity, buf_size, sync,
>                   on_source_error, on_target_error,
> -                 unmap,
> +                 unmap, mode == NEW_IMAGE_MODE_EXISTING,
>                   block_job_cb, bs, &local_err);
>      if (local_err != NULL) {
>          bdrv_unref(target_bs);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 14ad4c3..21a8988 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -614,6 +614,7 @@ void commit_active_start(BlockDriverState *bs, 
> BlockDriverState *base,
>   * @on_source_error: The action to take upon error reading from the source.
>   * @on_target_error: The action to take upon error writing to the target.
>   * @unmap: Whether to unmap target where source sectors only contain zeroes.
> + * @existing: Whether target image is an existing image prior to the QMP cmd.
>   * @cb: Completion function for the job.
>   * @opaque: Opaque pointer value passed to @cb.
>   * @errp: Error object.
> @@ -628,7 +629,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState 
> *target,
>                    int64_t speed, uint32_t granularity, int64_t buf_size,
>                    MirrorSyncMode mode, BlockdevOnError on_source_error,
>                    BlockdevOnError on_target_error,
> -                  bool unmap,
> +                  bool unmap, bool existing,
>                    BlockCompletionFunc *cb,
>                    void *opaque, Error **errp);
>  
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index bb2189e..033afb4 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -952,8 +952,10 @@
>  #            broken Quorum files. (Since 2.1)
>  #
>  # @mode: #optional whether and how QEMU should create a new image, default is
> -#        'absolute-paths'.
> -#

This empty line should stay.

> +#        'absolute-paths'.  If mode != 'existing', and the target does not
> +#         have zero init (sparseness), then the target image will have 
> sectors
> +#         zeroed out that correspond to sectors in an unallocated state in 
> the
> +#         source image.

As I said above, this should only happen if @sync == 'full'.

Max

>  # @speed:  #optional the maximum speed, in bytes per second
>  #
>  # @sync: what parts of the disk image should be copied to the destination
> 


Attachment: signature.asc
Description: OpenPGP digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]