qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2 v7] block: add-cow file format


From: Dong Xu Wang
Subject: Re: [Qemu-devel] [PATCH 1/2 v7] block: add-cow file format
Date: Thu, 8 Mar 2012 10:27:00 +0800

On Wed, Mar 7, 2012 at 00:59, Stefan Hajnoczi <address@hidden> wrote:
> On Thu, Mar 1, 2012 at 2:56 AM, Dong Xu Wang <address@hidden> wrote:
>> diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
>> new file mode 100644
>> index 0000000..6be02ff
>> --- /dev/null
>> +++ b/block/add-cow-cache.c
>> @@ -0,0 +1,171 @@
>> +/*
>> + * Cache For QEMU ADD-COW Disk Format
>> + *
>> + * Copyright IBM, Corp. 2012
>> + *
>> + * Authors:
>> + *  Dong Xu Wang <address@hidden>
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or 
>> later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "block_int.h"
>> +#include "qemu-common.h"
>> +#include "add-cow.h"
>
> This patch is missing add-cow.h.
>

>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..6897a52
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,402 @@
>> +/*
>> + * QEMU ADD-COW Disk Format
>> + *
>> + * Copyright IBM, Corp. 2012
>> + *
>> + * Authors:
>> + *  Dong Xu Wang <address@hidden>
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or 
>> later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +#include "add-cow.h"
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char 
>> *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>
> Minor coding style comment: please cast to const AddCowHeader* instead of 
> void.
>
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader        header;
>> +    char                image_filename[ADD_COW_FILE_LEN];
>> +    BlockDriver         *image_drv = NULL;
>> +    int                 ret;
>> +    BDRVAddCowState     *s = bs->opaque;
>> +
>> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
>> +        ret = -EINVAL;
>> +        goto fail;
>> +    }
>> +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        char version[64];
>> +        snprintf(version, sizeof(version), "ADD-COW version %d", 
>> header.version);
>
> be32_to_cpu(header.version)
>
>> +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>> +            bs->device_name, "add-cow", version);
>> +        ret = -ENOTSUP;
>> +        goto fail;
>> +    }
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != 
>> sizeof(header.backing_file));
>> +    strncpy(bs->backing_file, header.backing_file,
>> +            sizeof(bs->backing_file));
>
> Please use pstrcpy() - it always NUL-terminates the string.  strncpy()
> only stores a '\0' when the source string is *shorter than* max
> length, so a max length string results in bs->backing_file missing the
> '\0'.
>
>> +
>> +    if (header.image_file[0] == '\0') {
>> +        ret = -ENOENT;
>> +        goto fail;
>> +    }
>> +    s->image_hd = bdrv_new("");
>> +    if (path_has_protocol(header.image_file)) {
>
> Please safely copy in header.image_file before treating it as a
> NUL-terminated string.
>
>> +        strncpy(image_filename, header.image_file, sizeof(image_filename));
>> +    } else {
>> +        path_combine(image_filename, sizeof(image_filename),
>> +                     bs->filename, header.image_file);
>> +    }
>> +
>> +    image_drv = bdrv_find_format("raw");
>> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
>> +    if (ret < 0) {
>> +        bdrv_delete(s->image_hd);
>> +        goto fail;
>> +    }
>> +    bs->total_sectors = s->image_hd->total_sectors;
>> +    s->cluster_size = ADD_COW_CLUSTER_SIZE;
>> +    s->bitmap_cache = add_cow_cache_create(bs, ADD_COW_CACHE_SIZE);
>> +    qemu_co_mutex_init(&s->lock);
>> +    return 0;
>> + fail:
>> +    return ret;
>> +}
>> +
>> +static inline bool is_bit_set(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    uint64_t offset = bitnum >> 3;
>> +    uint8_t *bitmap;
>> +    int ret = add_cow_cache_get(bs, s->bitmap_cache,
>> +        offset & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
>
> (void **) should not be necessary.
>
>> +    if (ret < 0) {
>> +        abort();
>> +    }
>> +
>> +    return *(bitmap + (offset & (ADD_COW_CLUSTER_SIZE - 1))) & (1 << 
>> (bitnum % 8));
>
> The cluster concept confuses me.  Normally the point of a cluster is
> to reduce metadata size, so you would not index using bitnum % 8.
>
>> +}
>> +
>> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
>> +        int64_t sector_num, int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = nb_sectors;
>> +        return 0;
>> +    }
>> +
>> +    changed = is_bit_set(bs, sector_num);
>> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    return changed;
>> +}
>> +
>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    uint8_t *bitmap;
>> +
>> +    int i, ret = 0;
>> +    for (i = 0; i < nb_sectors; i++) {
>> +        int ret = add_cow_cache_get(bs, s->bitmap_cache,
>> +            (sector_num + i) / 8 & ~(ADD_COW_CLUSTER_SIZE - 1), (void 
>> **)&bitmap);
>> +        if (ret < 0) {
>> +            abort();
>> +        }
>> +        *(bitmap + ((sector_num + i) / 8 & (ADD_COW_CLUSTER_SIZE - 1))) |=
>> +                    (1 << ((sector_num + i) % 8));
>> +        add_cow_cache_entry_mark_dirty(s->bitmap_cache, bitmap);
>> +
>> +    }
>> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
>
> Data must be flushed to the cow file before writing metadata updates,
> otherwise a crash in between could result in zero sectors instead of
> backing file sectors.
>
Sorry, what does it mean?  The correct steps shoud be:
1. flush to the image file
2. if 1 succeeds, flush to add-cow?
That means metadata updates should in image  file flushing step, not
in add_cow_co_writev?

>> +static int add_cow_backing_read(BlockDriverState *bs, QEMUIOVector *qiov,
>> +                  int64_t sector_num, int nb_sectors)
>
> This function name is confusing because we don't actually perform a read here.
>
> I'm also not sure why we need to handle the case where a request spans
> the end of the device.  bdrv_check_byte_request, called from
> bdrv_co_do_readv() should prevent such requests?
>
>> +{
>> +    int n1;
>> +    if ((sector_num + nb_sectors) <= bs->total_sectors) {
>> +        return nb_sectors;
>> +    }
>> +    if (sector_num >= bs->total_sectors) {
>> +        n1 = 0;
>> +    } else {
>> +        n1 = bs->total_sectors - sector_num;
>> +    }
>> +
>> +    qemu_iovec_memset_skip(qiov, 0, BDRV_SECTOR_SIZE * (nb_sectors - n1),
>> +            BDRV_SECTOR_SIZE * n1);
>> +    return n1;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t 
>> sector_num,
>> +                         int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int cur_nr_sectors;
>> +    uint64_t bytes_done = 0;
>> +    QEMUIOVector hd_qiov;
>> +    int n, n1, ret = 0;
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    while (remaining_sectors != 0) {
>> +        cur_nr_sectors = remaining_sectors;
>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
>> +            cur_nr_sectors = n;
>> +            qemu_iovec_reset(&hd_qiov);
>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
>
> During the read it seems safe to unlock s->lock.  We need to acquire
> it again immediately afterwards, but it allows other requests to
> access the cow cache in the meantime.
>
>> +            if (ret < 0) {
>> +                goto fail;
>> +            }
>> +        } else {
>> +            cur_nr_sectors = n;
>> +            if (bs->backing_hd) {
>> +                n1 = add_cow_backing_read(bs->backing_hd, &hd_qiov,
>> +                    sector_num, cur_nr_sectors);
>> +                if (n1 > 0) {
>> +                    qemu_iovec_reset(&hd_qiov);
>> +                    qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                                cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
>> +                                        n, &hd_qiov);
>> +                    if (ret < 0) {
>> +                        goto fail;
>> +                    }
>> +                }
>> +            } else {
>> +                qemu_iovec_reset(&hd_qiov);
>
> We need to zero bytes, otherwise qiov will contain uninitialized bytes
> instead of zeroes read when there is no backing file.
>
>> +            }
>> +        }
>> +        remaining_sectors -= cur_nr_sectors;
>> +        sector_num += cur_nr_sectors;
>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t 
>> sector_num,
>> +                          int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +    QEMUIOVector hd_qiov;
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    qemu_iovec_reset(&hd_qiov);
>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * 
>> BDRV_SECTOR_SIZE);
>
> Why use hd_qiov?  It should be possible to use qiov directly, I think.
>
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors, &hd_qiov);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>> +{
>> +    int ret = 0;
>> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>> +
>> +    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + ((image_sectors + 
>> 7) >> 3));
>> +    if (ret < 0) {
>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>> +        return ret;
>> +    }
>> +    return ret;
>> +}
>
> I'm surprised that we truncate the image file...but only in the error case.
>
> Also, do we need to resize the cow cache?
>
cow cache should be resized, because the function will be called while creating.

Kevin gave comments in v2:
"bs->file only contains metadata, so why is this correct? Shoudln't you
update the header with the new size and truncate s->image_hd?"

So I want to know should image_file have the same virtual size as the
backing_file?

>> +
>> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = bdrv_co_flush(s->image_hd);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    qemu_co_mutex_lock(&s->lock);
>> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
>
> I suggest moving qemu_co_mutex_unlock(&s->lock) here so you don't need
> to duplicate it in the success and error cases.
Okay.
>
>> +    if (ret < 0) {
>> +        qemu_co_mutex_unlock(&s->lock);
>> +        return ret;
>> +    }
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    return bdrv_co_flush(bs->file);
>> +}
>> +
>> +static QEMUOptionParameter add_cow_create_options[] = {
>> +    {
>> +        .name = BLOCK_OPT_SIZE,
>> +        .type = OPT_SIZE,
>> +        .help = "Virtual disk size"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a base image"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_IMAGE_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a image file"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FMT,
>> +        .type = OPT_STRING,
>> +        .help = "Image format of the base image"
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static BlockDriver bdrv_add_cow = {
>> +    .format_name                = "add-cow",
>> +    .instance_size              = sizeof(BDRVAddCowState),
>> +    .bdrv_probe                 = add_cow_probe,
>> +    .bdrv_open                  = add_cow_open,
>> +    .bdrv_close                 = add_cow_close,
>> +    .bdrv_create                = add_cow_create,
>> +    .bdrv_co_is_allocated       = add_cow_is_allocated,
>> +
>> +    .bdrv_co_readv              = add_cow_co_readv,
>> +    .bdrv_co_writev             = add_cow_co_writev,
>> +    .bdrv_truncate              = bdrv_add_cow_truncate,
>> +
>> +    .create_options             = add_cow_create_options,
>> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
>> +};
>> +
>> +static void bdrv_add_cow_init(void)
>> +{
>> +    bdrv_register(&bdrv_add_cow);
>> +}
>> +
>> +block_init(bdrv_add_cow_init);
>> diff --git a/block_int.h b/block_int.h
>> index b460c36..8126f27 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -50,6 +50,7 @@
>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>  #define BLOCK_OPT_SUBFMT        "subformat"
>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>
>>  typedef struct BdrvTrackedRequest BdrvTrackedRequest;
>>
>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>> new file mode 100644
>> index 0000000..db992a4
>> --- /dev/null
>> +++ b/docs/specs/add-cow.txt
>> @@ -0,0 +1,68 @@
>> +== General ==
>> +
>> +Raw file format does not support backing_file and copy on write feature.
>> +The add-cow image format makes it possible to use backing files with raw
>> +image by keeping a separate .add-cow metadata file.  Once all sectors
>> +have been written to in the raw image it is safe to discard the .add-cow
>> +and backing files and instead use the raw image directly.
>> +
>> +When using add-cow, procedures may like this:
>> +(ubuntu.img is a disk image which has been installed OS.)
>> +    1)  Create a raw image with the same size of ubuntu.img
>> +            qemu-img create -f raw test.raw 8G
>> +    2)  Create a add-cow image which will store dirty bitmap
>> +            qemu-img create -f add-cow test.add-cow -o 
>> backing_file=ubuntu.img,image_file=test.raw
>> +    3)  Run qemu with add-cow image
>> +            qemu -drive if=virtio,file=test.add-cow
>> +
>> +=Specification=
>> +
>> +The file format looks like this:
>> +
>> + +---------------+--------------------------+
>> + |     Header    |           Data           |
>> + +---------------+--------------------------+
>> +
>> +All numbers in add-cow are stored in Big Endian byte order.
>> +
>> +== Header ==
>> +
>> +The Header is included in the first bytes:
>> +
>> +    Byte  0 -  7:       magic
>> +                        add-cow magic string ("ADD_COW\xff")
>> +
>> +          8 -  11:      version
>> +                        Version number (only valid value is 1 now)
>> +
>> +          12 - 1035:    backing_file
>> +                        backing_file file name related to add-cow file. All
>> +                        unused bytes are padded with zeros. Must not be 
>> longer
>> +                        than 1023 bytes.
>> +
>> +         1036 - 2059:   image_file
>> +                        image_file is a raw file. All unused bytes are 
>> padded
>> +                        with zeros. Must not be longer than 1023 bytes.
>> +
>> +         2060  - 2559:   The Reserved field is used to make sure Data field 
>> starts
>> +                        at the multiple of 512, not used currently. All 
>> bytes are
>> +                        filled with 0.
>> +
>> +== Data ==
>> +
>> +The Data field starts at the 2560th byte, stores a bitmap related to 
>> backing_file
>> +and image_file. The bitmap will track whether the sector in backing_file is 
>> dirty
>> +or not.
>
> Cluster size is not mentioned and not documented in the add-cow header
> structure.
>
> I didn't look closely at how the cache and clusters are supposed to
> work but I think they don't really work.  What I would expect is:
>
> 1. The bitmap operates at ADD_COW_CLUSTER_SIZE granularity.  This
> means the bits in the add-cow file actually represent
> ADD_COW_CLUSTER_SIZE sectors of data allocation, not just 1 sector.
>
Yes, I made a mistake. It should not be called cluster.
> 2. Code that accesses the bitmap first divides by ADD_COW_CLUSTER_SIZE
> to operate on an entire cluster.  Right now the lookup code seems to
> do it *during* the bitmap lookup instead of before.
Sorry, I do not understand this comment clearly. Could you explain more?
>
> Sorry, this isn't a good explanation but I've run out of time to
> really understand how this works in your patch.  I suggest looking
> carefully at the is_allocated lookup and the cache to see if you're
> really maintaining the bitmap at cluster granularity.
>
> Stefan
>
Thanks for you reviewing.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]