qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v2] Support vhd type VHD_DIFFERENCING


From: Kevin Wolf
Subject: Re: [Qemu-devel] [PATCH v2] Support vhd type VHD_DIFFERENCING
Date: Mon, 8 Sep 2014 18:15:11 +0200
User-agent: Mutt/1.5.21 (2010-09-15)

Am 08.09.2014 um 16:41 hat Xiaodong Gong geschrieben:
> From: Xiaodong Gong <address@hidden>
> 
> Now qemu only supports vhd type VHD_FIXED and VHD_DYNAMIC,
> so qemu can't read snapshot volume of vhd, and can't support
> other storage features of vhd file.
> 
> This patch add read parent information in function "vpc_open",
> read bitmap in "vpc_read", and change bitmap in "vpc_write".
> 
> Signed-off-by: Xiaodong Gong <address@hidden>

I have a few comments and questions on this patch. Please find my
comments inline.

>  block/vpc.c | 329 
> +++++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 261 insertions(+), 68 deletions(-)
> 
> diff --git a/block/vpc.c b/block/vpc.c
> index c024b4c..3ba0d57 100644
> --- a/block/vpc.c
> +++ b/block/vpc.c
> @@ -33,13 +33,18 @@
>  /**************************************************************/
>  
>  #define HEADER_SIZE 512
> +#define DYNAMIC_HEADER_SIZE 1024
> +#define PARENT_LOCATOR_NUM 8
> +#define PARENT_PREFIX_LEN 7 /* such as file:// */
> +#define TBBATMAP_HEAD_SIZE 28
> +#define MACX 0x5863614d /* big endian */

Perhaps call it PLATFORM_MACX? At first I was wondering what this might
mean.

>  //#define CACHE
>  
>  enum vhd_type {
>      VHD_FIXED           = 2,
>      VHD_DYNAMIC         = 3,
> -    VHD_DIFFERENCING    = 4,
> +    VHD_DIFF            = 4,
>  };
>  
>  // Seconds since Jan 1, 2000 0:00:00 (UTC)
> @@ -138,6 +143,15 @@ typedef struct BDRVVPCState {
>      Error *migration_blocker;
>  } BDRVVPCState;
>  
> +typedef struct vhd_tdbatmap_header {

This should be VHDTdBatmapHeader as well.

> +    char magic[8]; /* always "tdbatmap" */
> +
> +    uint64_t batmap_offset;
> +    uint32_t batmap_size;
> +    uint32_t batmap_version;
> +    uint32_t checksum;
> +} QEMU_PACKED VHDTdBatmapHeader;
> +
>  static uint32_t vpc_checksum(uint8_t* buf, size_t size)
>  {
>      uint32_t res = 0;
> @@ -153,7 +167,7 @@ static uint32_t vpc_checksum(uint8_t* buf, size_t size)
>  static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
>  {
>      if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
> -     return 100;
> +        return 100;
>      return 0;
>  }

I wouldn't touch this function at all when you're not changing any
functionality. But if you do touch it, please also add curly braces for
the if statement.

> @@ -164,11 +178,17 @@ static int vpc_open(BlockDriverState *bs, QDict 
> *options, int flags,
>      int i;
>      VHDFooter *footer;
>      VHDDynDiskHeader *dyndisk_header;
> -    uint8_t buf[HEADER_SIZE];
> +    uint8_t buf[DYNAMIC_HEADER_SIZE];
> +    uint8_t tdbatmap_header_buf[TBBATMAP_HEAD_SIZE];
>      uint32_t checksum;
>      uint64_t computed_size;
> -    int disk_type = VHD_DYNAMIC;
> +    uint32_t disk_type;
>      int ret;
> +    VHDTdBatmapHeader *tdbatmap_header;
> +    int parent_locator_offset = 0;
> +    int64_t data_offset = 0;
> +    int data_length = 0;
> +    uint32_t platform;
>  
>      ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
>      if (ret < 0) {
> @@ -176,6 +196,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, 
> int flags,
>      }
>  
>      footer = (VHDFooter *) s->footer_buf;
> +    disk_type = be32_to_cpu(footer->type);
> +
>      if (strncmp(footer->creator, "conectix", 8)) {
>          int64_t offset = bdrv_getlength(bs->file);
>          if (offset < 0) {
> @@ -230,9 +252,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, 
> int flags,
>          goto fail;
>      }
>  
> -    if (disk_type == VHD_DYNAMIC) {
> +    if (disk_type == VHD_DYNAMIC || disk_type == VHD_DIFF) {
>          ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
> -                         HEADER_SIZE);
> +                         DYNAMIC_HEADER_SIZE);
>          if (ret < 0) {
>              goto fail;
>          }
> @@ -286,6 +308,56 @@ static int vpc_open(BlockDriverState *bs, QDict 
> *options, int flags,
>          s->free_data_block_offset =
>              (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511;
>  
> +        /* Read tdbatmap header by offset */
> +        ret = bdrv_pread(bs->file, s->free_data_block_offset,
> +            tdbatmap_header_buf, TBBATMAP_HEAD_SIZE);
> +        if (ret < 0) {
> +            goto fail;
> +        }

Where can I find information about this tdbatmap structure? I'm looking
at the VHD specification 1.0, but can't find it there.

Is this only relevant for differencing images or could it be an
independent patch?

> +        tdbatmap_header = (VHDTdBatmapHeader *) tdbatmap_header_buf;
> +        if (!strncmp(tdbatmap_header->magic, "tdbatmap", 8)) {
> +            s->free_data_block_offset =
> +                be32_to_cpu(tdbatmap_header->batmap_size) * 512
> +                + be64_to_cpu(tdbatmap_header->batmap_offset);
> +        }
> +
> +        /* Read backing file location from dyn header table */
> +        if (dyndisk_header->parent_name[0] || 
> dyndisk_header->parent_name[1]) {

Probably nicer written as *(uint16_t*) dyndisk_header->parent_name. Or
actually, parent_name could be declared as an array of uint16_t instead
of uint8_t.

> +            for (i = 0; i < PARENT_LOCATOR_NUM; i++) {
> +                data_offset =
> +                    
> be64_to_cpu(dyndisk_header->parent_locator[i].data_offset);
> +                data_length =
> +                    
> be32_to_cpu(dyndisk_header->parent_locator[i].data_length);
> +                platform = dyndisk_header->parent_locator[i].platform;
> +
> +                if (MACX == platform) {

Why do you only implement MACX? Aren't W2KU and W2RU more important for
real-life images? Or does the spec give a wrong impression here?

> +                    if (data_offset + PARENT_PREFIX_LEN >
> +                        s->max_table_entries * s->block_size) {
> +                            goto fail;
> +                    }
> +                        if (data_length - PARENT_PREFIX_LEN > 1024) {
> +                            goto fail;

Indentation is off here.

Also, should 1024 really be DYNAMIC_HEADER_SIZE?

> +                    }
> +                    ret = bdrv_pread(bs->file, data_offset + 
> PARENT_PREFIX_LEN,
> +                        bs->backing_file, data_length - PARENT_PREFIX_LEN);
> +                    if (ret < 0) {
> +                        goto fail;
> +                    }
> +
> +                    bs->backing_file[data_length - PARENT_PREFIX_LEN] = '\0';
> +                }
> +
> +                if (data_offset > parent_locator_offset) {
> +                    parent_locator_offset = data_offset;
> +                }
> +            }
> +        }
> +
> +        if (parent_locator_offset + 512 > s->free_data_block_offset) {
> +            s->free_data_block_offset = parent_locator_offset + 512;
> +        }
> +
>          for (i = 0; i < s->max_table_entries; i++) {
>              be32_to_cpus(&s->pagetable[i]);
>              if (s->pagetable[i] != 0xFFFFFFFF) {
> @@ -363,19 +435,6 @@ static inline int64_t get_sector_offset(BlockDriverState 
> *bs,
>      bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
>      block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
>  
> -    // We must ensure that we don't write to any sectors which are marked as
> -    // unused in the bitmap. We get away with setting all bits in the block
> -    // bitmap each time we write to a new block. This might cause Virtual PC 
> to
> -    // miss sparse read optimization, but it's not a problem in terms of
> -    // correctness.
> -    if (write && (s->last_bitmap_offset != bitmap_offset)) {
> -        uint8_t bitmap[s->bitmap_size];
> -
> -        s->last_bitmap_offset = bitmap_offset;
> -        memset(bitmap, 0xff, s->bitmap_size);
> -        bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
> -    }
> -

Doesn't removing this part break dynamic images because all sectors stay
always marked as unallocated?

>  //    printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 
> ", bloff: %" PRIx64 "\n",
>  //   sector_num, pagetable_index, pageentry_index,
>  //   bitmap_offset, block_offset);
> @@ -412,6 +471,53 @@ static inline int64_t get_sector_offset(BlockDriverState 
> *bs,
>  }
>  
>  /*
> + * Returns the absolute byte offset of the given sector in the differencing
> + * image file.
> + *
> + * If the sector is not allocated, -1 is returned instead. If the sector is
> + * allocated in the backing file, -2 is returned. If the sector is allocated
> + * in current file, the block offset is returned.
> + */
> +static inline int64_t get_sector_offset_diff(BlockDriverState *bs,
> +    int64_t sector_num)
> +{
> +    BDRVVPCState *s = bs->opaque;
> +    uint64_t offset = sector_num << BDRV_SECTOR_BITS;
> +    uint64_t bitmap_offset;
> +    uint64_t block_offset;
> +    uint32_t pagetable_index, pageentry_index;
> +    uint32_t bitmap_index, bitmapentry_index;
> +    uint8_t bitmap[s->bitmap_size];

How big can bitmaps become? Can we safely put them on the stack?

> +    int ret;
> +
> +    pagetable_index = offset / s->block_size;
> +    if (pagetable_index >= s->max_table_entries) {
> +        return -1;
> +    } else if (0xffffffff == s->pagetable[pagetable_index]) {

Please write comparisons in the "natural" order, i.e.
s->pagetable[pagetable_index] == 0xffffffff.

> +        return -2;
> +    }
> +
> +    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
> +    if (bitmap_offset > s->max_table_entries * s->block_size) {
> +        return -1;
> +    }
> +    ret = bdrv_pread(bs->file, bitmap_offset, bitmap, s->bitmap_size);
> +    if (ret < 0) {
> +        return -1;
> +    }

This is the problem with the -1/-2 return codes. Normally you should
return ret here so that the caller still has the errno value, but you've
already used negative return values otherwise.

qcow2 returns 0 for unallocated clusters (which means that they are
looked up in the backing file). Is the difference between -1 and -2
actually relevant for VHD?

> +    pageentry_index = (offset % s->block_size) / 512;
> +    bitmap_index = pageentry_index / 8;
> +    bitmapentry_index = 7 - pageentry_index % 8;
> +    if (bitmap[bitmap_index] & 0x1 << bitmapentry_index) {
> +        block_offset = bitmap_offset + s->bitmap_size + (512 * 
> pageentry_index);
> +        return block_offset;
> +    } else {
> +         return -2;
> +    }
> +}
> +
> +/*
>   * Writes the footer to the end of the image file. This is needed when the
>   * file grows as it overwrites the old footer
>   *
> @@ -437,7 +543,8 @@ static int rewrite_footer(BlockDriverState* bs)
>   *
>   * Returns the sectors' offset in the image file on success and < 0 on error
>   */
> -static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
> +static int64_t alloc_block(BlockDriverState *bs, int64_t sector_num,
> +    bool isdiff)
>  {
>      BDRVVPCState *s = bs->opaque;
>      int64_t bat_offset;
> @@ -457,7 +564,11 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t 
> sector_num)
>      s->pagetable[index] = s->free_data_block_offset / 512;
>  
>      // Initialize the block's bitmap
> -    memset(bitmap, 0xff, s->bitmap_size);
> +    if (isdiff) {
> +        memset(bitmap, 0x0, s->bitmap_size);
> +    } else {
> +        memset(bitmap, 0xff, s->bitmap_size);
> +    }

I had hoped that your work on differencing images would automatically
fix dynamic images as well. In theory, 0x0 is correct for both types (if
you correctly update the bitmap after new allocations).

>      ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
>          s->bitmap_size);
>      if (ret < 0) {
> @@ -501,36 +612,62 @@ static int vpc_read(BlockDriverState *bs, int64_t 
> sector_num,
>                      uint8_t *buf, int nb_sectors)
>  {
>      BDRVVPCState *s = bs->opaque;
> -    int ret;
> -    int64_t offset;
> -    int64_t sectors, sectors_per_block;
>      VHDFooter *footer = (VHDFooter *) s->footer_buf;
> +    int64_t sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
> +    int64_t offset, sectors;
> +    int ret;
>  
> -    if (be32_to_cpu(footer->type) == VHD_FIXED) {
> +    switch (be32_to_cpu(footer->type)) {
> +    case VHD_FIXED:
>          return bdrv_read(bs->file, sector_num, buf, nb_sectors);
> -    }
> -    while (nb_sectors > 0) {
> -        offset = get_sector_offset(bs, sector_num, 0);
> -
> -        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
> -        sectors = sectors_per_block - (sector_num % sectors_per_block);
> -        if (sectors > nb_sectors) {
> -            sectors = nb_sectors;
> -        }
> +    case VHD_DYNAMIC:
> +        while (nb_sectors > 0) {
> +            sectors = sectors_per_block - (sector_num % sectors_per_block);
> +            if (sectors > nb_sectors) {
> +                sectors = nb_sectors;
> +            }
>  
> -        if (offset == -1) {
> -            memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
> -        } else {
> -            ret = bdrv_pread(bs->file, offset, buf,
> -                sectors * BDRV_SECTOR_SIZE);
> -            if (ret != sectors * BDRV_SECTOR_SIZE) {
> -                return -1;
> +            offset = get_sector_offset(bs, sector_num, 0);
> +            if (-1 == offset) {
> +                memset(buf, 0, sectors * BDRV_SECTOR_SIZE);
> +            } else {
> +                ret = bdrv_pread(bs->file, offset, buf,
> +                    sectors * BDRV_SECTOR_SIZE);
> +                if (ret != sectors * BDRV_SECTOR_SIZE) {
> +                    return -1;
> +                }
>              }
> +
> +            nb_sectors -= sectors;
> +            sector_num += sectors;
> +            buf += sectors * BDRV_SECTOR_SIZE;
>          }
> +        break;
> +    case VHD_DIFF:
> +        while (nb_sectors > 0) {
> +            offset = get_sector_offset_diff(bs, sector_num);
> +            if (-1 == offset) {
> +                memset(buf, 0, BDRV_SECTOR_SIZE);
> +            } else if (-2 == offset) {
> +                ret = bdrv_pread(bs->backing_hd, sector_num << 
> BDRV_SECTOR_BITS
> +                    , buf, BDRV_SECTOR_SIZE);
> +                if (ret < 0) {
> +                    return -1;
> +                }
> +            } else {
> +                ret = bdrv_pread(bs->file, offset, buf, BDRV_SECTOR_SIZE);
> +                if (ret != BDRV_SECTOR_SIZE) {
> +                    return -1;
> +                }
> +            }
>  
> -        nb_sectors -= sectors;
> -        sector_num += sectors;
> -        buf += sectors * BDRV_SECTOR_SIZE;
> +            nb_sectors--;
> +            sector_num++;
> +            buf += BDRV_SECTOR_SIZE;
> +        }
> +        break;
> +    default:
> +        return -1;
>      }
>      return 0;
>  }

I think the VHD_DYNAMIC and the VHD_DIFF case should really be using the
same code. The differences I can see in your code are:

* get_sector_offset() vs. get_sector_offset_diff(). The former is really
  a not quite correct hack. Dynamic images would be better off using
  your new get_sector_offset_diff() function as well.

* If the cluster isn't allocated, memset() vs. read backing file. Here
  you'd have an if (backing_hd) { read from backing } else { memset }

Performance will suffer a bit from reading in the bitmap with each
request. We'll probably want at least a very simple cache to keep the
last used bitmap in memory so that sequential writes don't suffer as
much.

We may consider reusing a generalised Qcow2Cache here, but that would be
material for another patch series.

> @@ -546,43 +683,98 @@ static coroutine_fn int vpc_co_read(BlockDriverState 
> *bs, int64_t sector_num,
>      return ret;
>  }
>  
> +static inline int64_t write_bitmap(BlockDriverState *bs, int64_t sector_num,
> +    int64_t sectors)
> +{
> +    BDRVVPCState *s = bs->opaque;
> +    uint64_t offset = sector_num << BDRV_SECTOR_BITS;
> +    uint64_t bitmap_offset;
> +    uint32_t pagetable_index, pageentry_index;
> +    uint8_t bitmap[s->bitmap_size];

Same question about stack variables.

> +    uint32_t bitmap_index, bitmapbit_index;
> +    int i;
> +    int ret;
> +
> +    pagetable_index = offset / s->block_size;
> +    pageentry_index = (offset % s->block_size) / 512;
> +    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
> +
> +    if (bitmap_offset > s->max_table_entries * s->block_size) {
> +        return -1;
> +    }
> +    ret = bdrv_pread(bs->file, bitmap_offset, bitmap, s->bitmap_size);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < sectors; i++) {
> +        bitmap_index = pageentry_index / 8;
> +        bitmapbit_index = 7 - pageentry_index % 8;
> +        bitmap[bitmap_index] |= (0x1 << bitmapbit_index);
> +        pageentry_index++;
> +    }
> +    ret = bdrv_pwrite(bs->file, bitmap_offset, bitmap, s->bitmap_size);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  static int vpc_write(BlockDriverState *bs, int64_t sector_num,
>      const uint8_t *buf, int nb_sectors)
>  {
>      BDRVVPCState *s = bs->opaque;
> -    int64_t offset;
> -    int64_t sectors, sectors_per_block;
> +    VHDFooter *footer = (VHDFooter *) s->footer_buf;
> +    int64_t sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
> +    int64_t offset, sectors;
> +    bool isdiff = true;
>      int ret;
> -    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
>  
> -    if (be32_to_cpu(footer->type) == VHD_FIXED) {
> +    switch (be32_to_cpu(footer->type)) {
> +    case VHD_FIXED:
>          return bdrv_write(bs->file, sector_num, buf, nb_sectors);
> -    }
> -    while (nb_sectors > 0) {
> -        offset = get_sector_offset(bs, sector_num, 1);
> -
> -        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS;
> -        sectors = sectors_per_block - (sector_num % sectors_per_block);
> -        if (sectors > nb_sectors) {
> -            sectors = nb_sectors;
> +    case VHD_DYNAMIC:
> +    case VHD_DIFF:
> +        if (VHD_DYNAMIC == be32_to_cpu(footer->type)) {
> +            isdiff = false;
>          }
>  
> -        if (offset == -1) {
> -            offset = alloc_block(bs, sector_num);
> -            if (offset < 0)
> +        while (nb_sectors > 0) {
> +            sectors = sectors_per_block - (sector_num % sectors_per_block);
> +            if (sectors > nb_sectors) {
> +                sectors = nb_sectors;
> +            }
> +
> +            offset = get_sector_offset(bs, sector_num, 1);
> +            if (offset == -1) {
> +                offset = alloc_block(bs, sector_num, isdiff);
> +                if (offset < 0) {
> +                    return -1;
> +                }
> +            }
> +
> +            ret = bdrv_pwrite(bs->file, offset, buf,
> +                sectors * BDRV_SECTOR_SIZE);
> +            if (ret != sectors * BDRV_SECTOR_SIZE) {
>                  return -1;
> -        }
> +            }
>  
> -        ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE);
> -        if (ret != sectors * BDRV_SECTOR_SIZE) {
> -            return -1;
> -        }
> +            if (true == isdiff) {
> +                ret = write_bitmap(bs, sector_num, sectors);
> +                if (ret < 0) {
> +                    return -1;
> +                }
> +            }

Do we need a specific ordering between bitmap updates and the guest data
write? So far we don't have any guarantees. It might actually be okay.

> -        nb_sectors -= sectors;
> -        sector_num += sectors;
> -        buf += sectors * BDRV_SECTOR_SIZE;
> +            nb_sectors -= sectors;
> +            sector_num += sectors;
> +            buf += sectors * BDRV_SECTOR_SIZE;
> +        }
> +        break;
> +    default:
> +        return -1;

This is a "can't ever happen" case, so make it an abort()?

>      }
> -
>      return 0;
>  }
>  
> @@ -910,6 +1102,7 @@ static BlockDriver bdrv_vpc = {
>      .bdrv_close             = vpc_close,
>      .bdrv_reopen_prepare    = vpc_reopen_prepare,
>      .bdrv_create            = vpc_create,
> +    .supports_backing       = true,
>  
>      .bdrv_read              = vpc_co_read,
>      .bdrv_write             = vpc_co_write,

Kevin



reply via email to

[Prev in Thread] Current Thread [Next in Thread]