qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] Introduce cache images for the QCOW2 format


From: Kaveh Razavi
Subject: [Qemu-devel] [PATCH] Introduce cache images for the QCOW2 format
Date: Tue, 13 Aug 2013 19:03:56 +0200

Using copy-on-write images with the base image stored remotely is common
practice in data centers. This saves significant network traffic by
avoiding the transfer of the complete base image. However, the data
blocks needed for a VM boot still need to be transferred to the node that
runs the VM. On slower networks, this will create a bottleneck when
booting many VMs simultaneously from a single VM image. Also,
simultaneously booting VMs from more than one VM image creates a
bottleneck at the storage device of the base image, if the storage
device does not fare well with the random access pattern that happens
during booting.

This patch introduces a block-level caching mechanism by introducing a
copy-on-read image that supports quota and goes in between the base
image and copy-on-write image. This cache image can either be stored on
the nodes that run VMs or on a storage device that can handle random
access well (e.g. memory, SSD, etc.). This cache image is effective
since usually only a very small part of the image is necessary for
booting a VM. We measured 100MB to be enough for default CentOS and
Debian installations.

A cache image with a quota of 100MB can be created using these commands:

$ qemu-img create -f qcow2 \
    -o cache_img_quota=104857600,backing_file=/path/to/base /path/to/cache
$ qemu-img create -f qcow2 -o backing_file=/path/to/cache /path/to/cow

The first time a VM boots from the copy-on-write image, the cache gets
warm. Subsequent boots do not need to read from the base image.

The implementation is a small extension to the QCOW2 format. If you are
interested in learning more, please read this paper:
http://cs.vu.nl/~kaveh/pubs/pdf/sc13.pdf
---
 block.c                   |   28 +++++++++-
 block/qcow2.c             |  121 +++++++++++++++++++++++++++++++++++++++++++--
 block/qcow2.h             |    6 ++
 include/block/block_int.h |    3 +
 4 files changed, 151 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index 01b66d8..52a92b4 100644
--- a/block.c
+++ b/block.c
@@ -920,18 +920,40 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options)
         back_drv = bdrv_find_format(bs->backing_format);
     }
 
-    /* backing files always opened read-only */
-    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
-
+    /* backing files always opened read-only except for cache images,
+     * we first open the file with RDWR and check whether it is a cache 
+     * image. If so, we leave the RDWR, if not, we re-open read-only.
+     */
+    back_flags = (bs->open_flags & ~(BDRV_O_SNAPSHOT)) | BDRV_O_RDWR;
+    
     ret = bdrv_open(bs->backing_hd,
                     *backing_filename ? backing_filename : NULL, options,
                     back_flags, back_drv);
     if (ret < 0) {
+        goto out;
+    }
+    /* was not a cache image? */
+    if(bs->backing_hd->is_cache_img == false)
+    {
+        /* re-open read-only */
+        back_flags = bs->open_flags & ~(BDRV_O_SNAPSHOT | BDRV_O_RDWR);
+        bdrv_delete(bs->backing_hd);
+        ret = bdrv_open(bs->backing_hd,
+                *backing_filename ? backing_filename : NULL, options,
+                back_flags, back_drv);
+        if (ret < 0) {
+            goto out;
+        }
+    }
+out:
+    if (ret < 0)
+    {
         bdrv_delete(bs->backing_hd);
         bs->backing_hd = NULL;
         bs->open_flags |= BDRV_O_NO_BACKING;
         return ret;
     }
+
     return 0;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 3376901..3b0706a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -57,6 +57,7 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
+#define  QCOW2_EXT_MAGIC_CACHE_IMG 0x31393834
 
 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
@@ -148,6 +149,27 @@ static int qcow2_read_extensions(BlockDriverState *bs, 
uint64_t start_offset,
                 *p_feature_table = feature_table;
             }
             break;
+        
+        case QCOW2_EXT_MAGIC_CACHE_IMG:
+            bs->is_cache_img = true;
+            if(ext.len != 2 * sizeof(uint64_t)) {
+                fprintf(stderr, "ERROR: cache_img_extension is not %zd"
+                        "bytes (%"PRIu32")", 2 * sizeof(uint64_t), ext.len);
+                return 4;
+            }
+            if ((ret = bdrv_pread(bs->file, offset, &(s->cache_img_inuse),
+                sizeof(uint64_t))) != sizeof(uint64_t)) {
+                return ret;
+            }
+            be64_to_cpus(&(s->cache_img_inuse));
+            s->cache_img_cur_inuse = s->cache_img_inuse;
+            if ((ret = bdrv_pread(bs->file, offset + sizeof(uint64_t), 
+                            &(s->cache_img_quota), sizeof(uint64_t))) != 
+                    sizeof(uint64_t)) {
+                return ret;
+            }
+            be64_to_cpus(&(s->cache_img_quota));
+            break;
 
         default:
             /* unknown magic - save it in case we need to rewrite the header */
@@ -694,7 +716,6 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState 
*bs, int64_t sector_num,
     qemu_co_mutex_lock(&s->lock);
 
     while (remaining_sectors != 0) {
-
         /* prepare next request */
         cur_nr_sectors = remaining_sectors;
         if (s->crypt_method) {
@@ -730,6 +751,31 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState 
*bs, int64_t sector_num,
                     if (ret < 0) {
                         goto fail;
                     }
+                    /* do copy-on-read if this is a cache image */
+                    if (bs->is_cache_img && !s->is_cache_full && 
+                            !s->is_writing_on_cache)
+                    {
+                        qemu_co_mutex_unlock(&s->lock);
+                        s->is_writing_on_cache = true;
+                        ret = bdrv_co_writev(bs,
+                                             sector_num,
+                                             n1,
+                                             &hd_qiov);
+                        s->is_writing_on_cache = false;
+                        qemu_co_mutex_lock(&s->lock);
+                        if (ret < 0) {
+                            if (ret == (-ENOSPC))
+                            {
+                                s->is_cache_full = true;
+                            }
+                            else {
+                                /* error is other than cache space */
+                                fprintf(stderr, "Cache write error (%d)\n", 
+                                        ret);
+                                goto fail;
+                            }
+                        }
+                    }
                 }
             } else {
                 /* Note: in this case, no need to wait */
@@ -840,7 +886,6 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState 
*bs,
     qemu_co_mutex_lock(&s->lock);
 
     while (remaining_sectors != 0) {
-
         l2meta = NULL;
 
         trace_qcow2_writev_start_part(qemu_coroutine_self());
@@ -859,6 +904,20 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState 
*bs,
 
         assert((cluster_offset & 511) == 0);
 
+        if(bs->is_cache_img)
+        {
+            if(s->cache_img_cur_inuse + (cur_nr_sectors * 512) > 
+               s->cache_img_quota)
+            {
+                ret = -ENOSPC;
+                goto fail;
+            }
+            else
+            {
+                s->cache_img_cur_inuse += (cur_nr_sectors * 512);
+            }
+        }
+
         qemu_iovec_reset(&hd_qiov);
         qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
             cur_nr_sectors * 512);
@@ -946,6 +1005,13 @@ fail:
 static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
+
+    if (bs->is_cache_img && (s->cache_img_cur_inuse != s->cache_img_inuse))
+    {
+        s->cache_img_inuse = s->cache_img_cur_inuse;
+        qcow2_update_header(bs);
+    }
+
     g_free(s->l1_table);
 
     qcow2_cache_flush(bs, s->l2_table_cache);
@@ -1041,6 +1107,7 @@ int qcow2_update_header(BlockDriverState *bs)
     uint32_t refcount_table_clusters;
     size_t header_length;
     Qcow2UnknownHeaderExtension *uext;
+    char cache_img_ext[2 * sizeof(uint64_t)];
 
     buf = qemu_blockalign(bs, buflen);
 
@@ -1122,6 +1189,21 @@ int qcow2_update_header(BlockDriverState *bs)
         buflen -= ret;
     }
 
+    if (s->cache_img_quota)
+    {
+        cpu_to_be64s(&s->cache_img_inuse);
+        cpu_to_be64s(&s->cache_img_quota);
+        mempcpy(mempcpy(cache_img_ext, &s->cache_img_inuse, sizeof(uint64_t)),
+                &s->cache_img_quota, sizeof(uint64_t));
+        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CACHE_IMG, &cache_img_ext, 
+                sizeof(cache_img_ext), buflen);
+        be64_to_cpus(&s->cache_img_inuse);
+        be64_to_cpus(&s->cache_img_quota);
+
+        buf += ret;
+        buflen -= ret;
+    }
+
     /* Feature table */
     Qcow2Feature features[] = {
         {
@@ -1201,6 +1283,16 @@ static int qcow2_change_backing_file(BlockDriverState 
*bs,
     return qcow2_update_header(bs);
 }
 
+static int qcow2_update_cache_img_fields(BlockDriverState *bs,
+        uint64_t cache_img_inuse, uint64_t cache_img_quota)
+{
+    BDRVQcowState *s = bs->opaque;
+    s->cache_img_inuse = cache_img_inuse;
+    s->cache_img_quota = cache_img_quota;
+
+    return qcow2_update_header(bs);
+}
+
 static int preallocate(BlockDriverState *bs)
 {
     uint64_t nb_sectors;
@@ -1260,7 +1352,8 @@ static int preallocate(BlockDriverState *bs)
 static int qcow2_create2(const char *filename, int64_t total_size,
                          const char *backing_file, const char *backing_format,
                          int flags, size_t cluster_size, int prealloc,
-                         QEMUOptionParameter *options, int version)
+                         QEMUOptionParameter *options, int version, 
+                         uint64_t cache_img_quota)
 {
     /* Calculate cluster_bits */
     int cluster_bits;
@@ -1377,6 +1470,15 @@ static int qcow2_create2(const char *filename, int64_t 
total_size,
         }
     }
 
+    /* Is this a cache image? */
+    if (cache_img_quota) {
+        ret = qcow2_update_cache_img_fields(bs, 0, cache_img_quota);
+
+        if (ret < 0) {
+            goto out;
+        }
+    }
+
     /* And if we're supposed to preallocate metadata, do that now */
     if (prealloc) {
         BDRVQcowState *s = bs->opaque;
@@ -1403,6 +1505,7 @@ static int qcow2_create(const char *filename, 
QEMUOptionParameter *options)
     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
     int prealloc = 0;
     int version = 2;
+    uint64_t cache_img_quota = 0;
 
     /* Read out options */
     while (options && options->name) {
@@ -1440,6 +1543,10 @@ static int qcow2_create(const char *filename, 
QEMUOptionParameter *options)
             }
         } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
             flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
+        } else if (!strcmp(options->name, BLOCK_OPT_CACHE_IMG_QUOTA)) {
+            if (options->value.n) {
+                cache_img_quota = (uint64_t)(options->value.n);
+            }
         }
         options++;
     }
@@ -1457,7 +1564,8 @@ static int qcow2_create(const char *filename, 
QEMUOptionParameter *options)
     }
 
     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
-                         cluster_size, prealloc, options, version);
+                         cluster_size, prealloc, options, version, 
+                         cache_img_quota);
 }
 
 static int qcow2_make_empty(BlockDriverState *bs)
@@ -1774,6 +1882,11 @@ static QEMUOptionParameter qcow2_create_options[] = {
         .type = OPT_FLAG,
         .help = "Postpone refcount updates",
     },
+    {
+        .name = BLOCK_OPT_CACHE_IMG_QUOTA,
+        .type = OPT_SIZE,
+        .help = "Quota of the cache image"
+    },
     { NULL }
 };
 
diff --git a/block/qcow2.h b/block/qcow2.h
index dba9771..36922da 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -203,6 +203,12 @@ typedef struct BDRVQcowState {
     uint64_t compatible_features;
     uint64_t autoclear_features;
 
+    uint64_t cache_img_cur_inuse; /* current data size in the cache */
+    uint64_t cache_img_inuse; /* data size in the cache on open */
+    uint64_t cache_img_quota; /* max size allowed for cache image */
+    bool is_cache_full; /* whether cache is full */
+    bool is_writing_on_cache; /* currently writing to the cache */
+
     size_t unknown_header_fields_size;
     void* unknown_header_fields;
     QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext;
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e45f2a0..0e4f21f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -58,6 +58,7 @@
 #define BLOCK_OPT_COMPAT_LEVEL      "compat"
 #define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
 #define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
+#define BLOCK_OPT_CACHE_IMG_QUOTA   "cache_img_quota"
 
 typedef struct BdrvTrackedRequest {
     BlockDriverState *bs;
@@ -255,6 +256,8 @@ struct BlockDriverState {
     BlockDriverState *backing_hd;
     BlockDriverState *file;
 
+    bool is_cache_img; /* if set, the image is a cache */
+
     NotifierList close_notifiers;
 
     /* Callback before write request is processed */
-- 
1.7.0.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]