From: Max Reitz
Subject: Re: [Qemu-block] [PATCH] mirror: Improve zero-write and discard with fragmented image
Date: Fri, 6 Nov 2015 19:36:14 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0

On 06.11.2015 11:22, Fam Zheng wrote:
> The "pnum < nb_sectors" condition in deciding whether to actually copy
> data is unnecessarily strict, and the qiov initialization is
> unnecessarily too, for both bdrv_aio_write_zeroes and bdrv_aio_discard
> branches.
> 
> Reorganize mirror_iteration flow so that we:
> 
>     1) Find the contiguous zero/discarded sectors with
>     bdrv_get_block_status_above() before deciding what to do. We query
>     s->buf_size sized blocks at a time.
> 
>     2) If the sectors in question are zeroed/discarded and aligned to the
>     target cluster size, issue a zero write or discard accordingly. This is
>     done in mirror_do_zero_or_discard, where we don't add a buffer to the
>     qiov.
> 
>     3) Otherwise, do the same loop as before in mirror_do_read.
> 
> Signed-off-by: Fam Zheng <address@hidden>
> ---
>  block/mirror.c | 161 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 127 insertions(+), 34 deletions(-)

Looks good overall, some comments below.

Max

> diff --git a/block/mirror.c b/block/mirror.c
> index b1252a1..ac796b4 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -157,28 +157,21 @@ static void mirror_read_complete(void *opaque, int ret)
>                      mirror_write_complete, op);
>  }
>  
> -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> +static uint64_t mirror_do_read(MirrorBlockJob *s)
>  {
>      BlockDriverState *source = s->common.bs;
> -    int nb_sectors, sectors_per_chunk, nb_chunks;
> -    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
> +    int sectors_per_chunk, nb_sectors, nb_chunks;
> +    int64_t end, next_chunk, next_sector, hbitmap_next_sector, sector_num;
>      uint64_t delay_ns = 0;
>      MirrorOp *op;
> -    int pnum;
> -    int64_t ret;
>  
> -    s->sector_num = hbitmap_iter_next(&s->hbi);
> -    if (s->sector_num < 0) {
> -        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> -        s->sector_num = hbitmap_iter_next(&s->hbi);
> -        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
> -        assert(s->sector_num >= 0);
> -    }
> +    sector_num = s->sector_num;
> +    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> +    end = s->bdev_length / BDRV_SECTOR_SIZE;
>  
> +    next_sector = sector_num;
> +    next_chunk = sector_num / sectors_per_chunk;

@next_sector and @next_chunk set here...

>      hbitmap_next_sector = s->sector_num;
> -    sector_num = s->sector_num;
> -    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> -    end = s->bdev_length / BDRV_SECTOR_SIZE;
>  
>      /* Extend the QEMUIOVector to include all adjacent blocks that will
>       * be copied in this operation.
> @@ -198,14 +191,6 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>      next_sector = sector_num;
>      next_chunk = sector_num / sectors_per_chunk;

...and here already. So the above seems superfluous, considering that
they are not read in between.

(If you keep hbitmap_next_sector = s->sector_num; above the sector_num =
... block, that may reduce conflicts further)
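
That is, roughly (just a sketch of the reordering, untested):

    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* next_sector and next_chunk are set right before the do/while
     * loop anyway, so the early assignments can simply be dropped */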

>  
> -    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
> -    while (test_bit(next_chunk, s->in_flight_bitmap)) {
> -        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
> -        s->waiting_for_io = true;
> -        qemu_coroutine_yield();
> -        s->waiting_for_io = false;
> -    }
> -
>      do {
>          int added_sectors, added_chunks;
>  
> @@ -301,24 +286,132 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>      s->sectors_in_flight += nb_sectors;
>      trace_mirror_one_iteration(s, sector_num, nb_sectors);
>  
> -    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> -                                      nb_sectors, &pnum);
> -    if (ret < 0 || pnum < nb_sectors ||
> -            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> -        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> -                       mirror_read_complete, op);
> -    } else if (ret & BDRV_BLOCK_ZERO) {
> +    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> +                   mirror_read_complete, op);
> +    return delay_ns;
> +}
> +
> +static uint64_t mirror_do_zero_or_discard(MirrorBlockJob *s,
> +                                          int64_t sector_num,
> +                                          int nb_sectors,
> +                                          bool is_discard)
> +{
> +    int sectors_per_chunk, nb_chunks;
> +    int64_t next_chunk, next_sector, hbitmap_next_sector;
> +    uint64_t delay_ns = 0;
> +    MirrorOp *op;
> +
> +    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> +    assert(nb_sectors >= sectors_per_chunk);
> +    next_chunk = sector_num / sectors_per_chunk;
> +    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);
> +    bitmap_set(s->in_flight_bitmap, next_chunk, nb_chunks);
> +    delay_ns = ratelimit_calculate_delay(&s->limit, nb_sectors);
> +
> +    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
> +     * out so the freeing in iteration is nop. */
> +    op = g_new0(MirrorOp, 1);
> +    op->s = s;
> +    op->sector_num = sector_num;
> +    op->nb_sectors = nb_sectors;
> +
> +    /* Advance the HBitmapIter in parallel, so that we do not examine
> +     * the same sector twice.
> +     */
> +    hbitmap_next_sector = sector_num;
> +    next_sector = sector_num + nb_sectors;
> +    while (next_sector > hbitmap_next_sector) {
> +        hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
> +        if (hbitmap_next_sector < 0) {
> +            break;
> +        }
> +    }
> +
> +    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
> +    s->in_flight++;
> +    s->sectors_in_flight += nb_sectors;
> +    if (is_discard) {
> +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> +                         mirror_write_complete, op);
> +    } else {
>          bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
>                                s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
>                                mirror_write_complete, op);
> -    } else {
> -        assert(!(ret & BDRV_BLOCK_DATA));
> -        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> -                         mirror_write_complete, op);
>      }
> +
>      return delay_ns;
>  }
>  
> +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> +{
> +    BlockDriverState *source = s->common.bs;
> +    int sectors_per_chunk;
> +    int64_t sector_num, next_chunk;
> +    int ret;
> +    int contiguous_sectors = s->buf_size >> BDRV_SECTOR_BITS;
> +    enum MirrorMethod {
> +        MIRROR_METHOD_COPY,
> +        MIRROR_METHOD_ZERO,
> +        MIRROR_METHOD_DISCARD
> +    } mirror_method;
> +
> +    s->sector_num = hbitmap_iter_next(&s->hbi);
> +    if (s->sector_num < 0) {
> +        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> +        s->sector_num = hbitmap_iter_next(&s->hbi);
> +        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
> +        assert(s->sector_num >= 0);
> +    }
> +
> +    sector_num = s->sector_num;
> +    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
> +    next_chunk = sector_num / sectors_per_chunk;
> +
> +    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
> +    while (test_bit(next_chunk, s->in_flight_bitmap)) {
> +        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
> +        s->waiting_for_io = true;
> +        qemu_coroutine_yield();
> +        s->waiting_for_io = false;
> +    }
> +
> +    mirror_method = MIRROR_METHOD_COPY;
> +    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> +                                      contiguous_sectors,
> +                                      &contiguous_sectors);

We did have NULL as the base before, but is that correct? Shouldn't we
use backing_bs(source) or s->base if @sync is none or top?

Examining this is fun; the short answer is probably "NULL is wrong, but
it's the best we can do here".
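
(For reference, variant (2) below is just the same call with s->base as
the base:

    ret = bdrv_get_block_status_above(source, s->base, sector_num,
                                      contiguous_sectors,
                                      &contiguous_sectors);

s->base being the backing-chain boundary mirror already keeps track of
for sync=top.)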

(1) with NULL here:

$ ./qemu-img create -f qcow2 base.qcow2 64M
$ ./qemu-io -c 'write -P 42 0 64k' base.qcow2
$ ./qemu-img create -f qcow2 -o compat=0.10 -b base.qcow2 top.qcow2
# compat=0.10 is important because otherwise discard will create a
# zero cluster instead of truly discarding
$ (echo "{'execute':'qmp_capabilities'}
         {'execute':'drive-mirror','arguments':
             {'device':'drive0','target':'out.qcow2',
              'format':'qcow2','sync':'top'}}
         {'execute':'human-monitor-command','arguments':
             {'command-line':'qemu-io drive0 \"discard 0 64k\"'}}
         {'execute':'block-job-complete','arguments':
             {'device':'drive0'}}
         {'execute':'quit'}") | \
  x86_64-softmmu/qemu-system-x86_64 -qmp stdio \
      -drive if=none,file=top.qcow2,id=drive0,discard=unmap
{"QMP": {"version": {"qemu": {"micro": 50, "minor": 4, "major": 2},
"package": ""}, "capabilities": []}}
{"return": {}}
Formatting 'out.qcow2', fmt=qcow2 size=67108864 backing_file=base.qcow2
backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off
refcount_bits=16
{"return": {}}
discard 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0000 sec (3.815 GiB/sec and 62500.0000 ops/sec)
{"return": ""}
{"timestamp": {"seconds": 1446830775, "microseconds": 198114}, "event":
"BLOCK_JOB_READY", "data": {"device": "drive0", "len": 0, "offset": 0,
"speed": 0, "type": "mirror"}}
{"return": {}}
{"return": {}}
{"timestamp": {"seconds": 1446830775, "microseconds": 198371}, "event":
"SHUTDOWN"}
{"timestamp": {"seconds": 1446830775, "microseconds": 205506}, "event":
"BLOCK_JOB_COMPLETED", "data": {"device": "drive0", "len": 65536,
"offset": 65536, "speed": 0, "type": "mirror"}}

(Note that the discard must have been executed before BLOCK_JOB_READY;
it's a bit racy)

Now we check:

$ ./qemu-io -c 'read -P 0 0 64k' out.qcow2
Pattern verification failed at offset 0, 65536 bytes
read 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec)
$ ./qemu-io -c 'read -P 42 0 64k' out.qcow2
read 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0000 sec (868.056 MiB/sec and 13888.8889 ops/sec)

Well, that seems reasonable, considering that out.qcow2 has base.qcow2
as the backing file, but:

$ ./qemu-img map out.qcow2
Offset          Length          Mapped to       File
0               0x10000         0x50000         out.qcow2

Hm... Well, at least out.qcow2 and top.qcow2 return the same values when
read, but we don't need to copy that from the backing file. Just leaving
the cluster unallocated would have been enough.


(2) So with s->base instead of NULL we get:

$ ./qemu-io -c 'read -P 0 0 64k' out.qcow2
read 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec)
$ ./qemu-io -c 'read -P 42 0 64k' out.qcow2
Pattern verification failed at offset 0, 65536 bytes
read 65536/65536 bytes at offset 0
64 KiB, 1 ops; 0.0000 sec (892.857 MiB/sec and 14285.7143 ops/sec)

And:

$ ./qemu-img map out.qcow2
Offset          Length          Mapped to       File

So the map output looks right, but now we don't read the backing file
anymore. That's because for qcow2 v3, qemu writes zero clusters when
asked to discard (that's why I had to use compat=0.10 for top.qcow2).

Using a compat=0.10 image with @mode = existing fixes that. But well...


So the issue is: We don't have a reliable function to punch a hole into
an overlay file. discard could do that (and does so for qcow2 v2), but
we considered returning zeros the better option if possible because
people might find that more reasonable.

So while it would be best to copy holes in the overlay file created by
discards to the mirrored file, there is no way for us to do that.
Therefore, the best we can do is copy from the backing file.
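
(If we did have such a function, the discard branch could propagate the
hole directly, along the lines of the following -- purely hypothetical
API, nothing like this exists today:

    /* hypothetical: punch a hole in the overlay without zeroing */
    bdrv_aio_punch_hole(s->target, sector_num, op->nb_sectors,
                        mirror_write_complete, op);

But we don't, so copying from the backing file it is.)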


So, OK, NULL it is.

> +
> +    contiguous_sectors -= contiguous_sectors % sectors_per_chunk;
> +    if (ret < 0 || contiguous_sectors < sectors_per_chunk) {
> +        contiguous_sectors = sectors_per_chunk;
> +    } else if (!(ret & BDRV_BLOCK_DATA)) {
> +        int64_t target_sector_num;
> +        int target_nb_sectors;
> +        bdrv_round_to_clusters(s->target, sector_num, contiguous_sectors,
> +                               &target_sector_num, &target_nb_sectors);
> +        if (target_sector_num == sector_num &&
> +            target_nb_sectors == contiguous_sectors) {
> +            mirror_method = ret & BDRV_BLOCK_ZERO ?
> +                                MIRROR_METHOD_ZERO :
> +                                MIRROR_METHOD_DISCARD;
> +        }
> +    }
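
Not an issue, but just to spell the alignment logic out with concrete
numbers (made up; assuming 512-byte sectors and a 64k granularity):

    sectors_per_chunk = 65536 >> BDRV_SECTOR_BITS;   /* = 128 */
    /* say the status query returned contiguous_sectors = 300: */
    contiguous_sectors -= contiguous_sectors % 128;  /* -> 256 */
    /* if bdrv_round_to_clusters() then widens the range to the
     * target's cluster boundaries, the range was not cluster-aligned
     * and we stay with MIRROR_METHOD_COPY */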
> +
> +    switch (mirror_method) {
> +    case MIRROR_METHOD_COPY:
> +        return mirror_do_read(s);
> +    case MIRROR_METHOD_ZERO:
> +        return mirror_do_zero_or_discard(s, sector_num,
> +                                         contiguous_sectors,
> +                                         false);
> +    case MIRROR_METHOD_DISCARD:
> +        return mirror_do_zero_or_discard(s, sector_num,
> +                                         contiguous_sectors,
> +                                         false);

s/false/true/?
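
I.e.:

    case MIRROR_METHOD_DISCARD:
        return mirror_do_zero_or_discard(s, sector_num,
                                         contiguous_sectors,
                                         true);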

> +    default:
> +        abort();
> +    }
> +}
> +
>  static void mirror_free_init(MirrorBlockJob *s)
>  {
>      int granularity = s->granularity;
> 


