qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v4 2/5] qcow2-cluster: Expand zero clusters


From: Max Reitz
Subject: Re: [Qemu-devel] [PATCH v4 2/5] qcow2-cluster: Expand zero clusters
Date: Tue, 03 Sep 2013 09:11:40 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130805 Thunderbird/17.0.8

Am 02.09.2013 17:13, schrieb Kevin Wolf:
Am 02.09.2013 um 12:04 hat Max Reitz geschrieben:
Add functionality for expanding zero clusters. This is necessary for
downgrading the image version to one without zero cluster support.

For non-backed images, this function may also just discard zero clusters
instead of truly expanding them.

Signed-off-by: Max Reitz <address@hidden>
---
  block/qcow2-cluster.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
  block/qcow2-refcount.c |  29 ++++---
  block/qcow2.h          |   5 ++
  3 files changed, 248 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 2d5aa92..c90fb51 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1497,3 +1497,231 @@ fail:
return ret;
  }
+
+/*
+ * Expands all zero clusters in a specific L1 table (or deallocates them, for
+ * non-backed non-pre-allocated zero clusters).
+ *
+ * expanded_clusters is a bitmap where every bit corresponds to one cluster in
+ * the image file; a bit gets set if the corresponding cluster has been used 
for
+ * zero expansion (i.e., has been filled with zeroes and is referenced from an
+ * L2 table). nb_clusters contains the total cluster count of the image file,
+ * i.e., the number of bits in expanded_clusters.
+ */
+static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
+                                      int l1_size, uint8_t *expanded_clusters,
+                                      uint64_t nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    bool is_active_l1 = (l1_table == s->l1_table);
+    uint64_t *l2_table = NULL;
+    int ret;
+    int i, j;
+
+    if (!is_active_l1) {
+        /* inactive L2 tables require a buffer to be stored in when loading
+         * them from disk */
+        l2_table = qemu_blockalign(bs, s->cluster_size);
+    }
+
+    for (i = 0; i < l1_size; i++) {
+        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
+        bool l2_dirty = false;
+
+        if (!l2_offset) {
+            /* unallocated */
+            continue;
+        }
+
+        if (is_active_l1) {
+            /* get active L2 tables from cache */
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                    (void **)&l2_table);
+        } else {
+            /* load inactive L2 tables from disk */
+            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                    (void *)l2_table, s->cluster_sectors);
+        }
+        if (ret < 0) {
+            goto fail;
+        }
+
+        for (j = 0; j < s->l2_size; j++) {
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
+            int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
+                cluster_index = offset >> s->cluster_bits;
+                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
+                if (expanded_clusters[cluster_index / 8] &
+                    (1 << (cluster_index % 8))) {
+                    /* Probably a shared L2 table; this cluster was a zero
+                     * cluster which has been expanded, its refcount
+                     * therefore most likely requires an update. */
+                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
+                                                        QCOW2_DISCARD_NEVER);
+                    if (ret < 0) {
+                        goto fail;
+                    }
+                    /* Since we just increased the refcount, the COPIED flag 
may
+                     * no longer be set. */
+                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
+                    l2_dirty = true;
+                }
+                continue;
+            }
+            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
+                continue;
+            }
+
+            if (!offset) {
+                /* not preallocated */
+                if (!bs->backing_hd) {
+                    /* not backed; therefore we can simply deallocate the
+                     * cluster */
+                    l2_table[j] = 0;
+                    l2_dirty = true;
+                    continue;
+                }
+
+                offset = qcow2_alloc_clusters(bs, s->cluster_size);
+                if (offset < 0) {
+                    ret = offset;
+                    goto fail;
+                }
+            }
+
+            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                                                offset, s->cluster_size);
+            if (ret < 0) {
+                qcow2_free_clusters(bs, offset, s->cluster_size,
+                        QCOW2_DISCARD_ALWAYS);
+                goto fail;
+            }
+
+            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
+                                    s->cluster_sectors);
+            if (ret < 0) {
+                qcow2_free_clusters(bs, offset, s->cluster_size,
+                        QCOW2_DISCARD_ALWAYS);
+                goto fail;
+            }
+
+            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+            l2_dirty = true;
+
+            cluster_index = offset >> s->cluster_bits;
+            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
+            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
+        }
+
+        if (is_active_l1) {
+            if (l2_dirty) {
+                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+                qcow2_cache_depends_on_flush(s->l2_table_cache);
+            }
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+            if (ret < 0) {
+                l2_table = NULL;
+                goto fail;
+            }
+        } else {
+            if (l2_dirty) {
+                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
+                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), 
l2_offset,
+                        s->cluster_size);
+                if (ret < 0) {
+                    goto fail;
+                }
+
+                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                        (void *)l2_table, s->cluster_sectors);
+                if (ret < 0) {
+                    goto fail;
+                }
+            }
+        }
+    }
+
+    ret = 0;
+
+fail:
+    if (l2_table) {
+        if (!is_active_l1) {
+            qemu_vfree(l2_table);
+        } else {
+            if (ret < 0) {
+                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+            } else {
+                ret = qcow2_cache_put(bs, s->l2_table_cache,
+                        (void **)&l2_table);
+            }
+        }
+    }
+    return ret;
+}
+
+/*
+ * For backed images, expands all zero clusters on the image. For non-backed
+ * images, deallocates all non-pre-allocated zero clusters (and claims the
+ * allocation for pre-allocated ones). This is important for downgrading to a
+ * qcow2 version which doesn't yet support metadata zero clusters.
+ */
+int qcow2_expand_zero_clusters(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l1_table = NULL;
+    uint64_t nb_clusters;
+    uint8_t *expanded_clusters;
+    int ret;
+    int i, j;
+
+    nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);
Shouldn't this value actually be rounded up?
Oh, yes, of course.

+    expanded_clusters = g_malloc0(nb_clusters / 8);
Just like the size here?

+
+    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
+                                     expanded_clusters, nb_clusters);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Inactive L1 tables may point to active L2 tables - therefore it is
+     * necessary to flush the L2 table cache before trying to access the L2
+     * tables pointed to by inactive L1 entries (else we might try to expand
+     * zero clusters that have already been expanded) */
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        goto fail;
+    }
I'm afraid that this is not enough in fact. If you modify an active L2
table through an inactive L1, you bypass the cache and use bdrv_write().
The cache will still have a stale copy of the table.
Hm, yes, that didn't occur to me because the image isn't being written to after this zero cluster expansion; at least not in the tests I ran.

Can't we just access inactive L2 tables through the cache as well? Or
can we get into trouble in other places if we do that? If so, we'd have
to implement a function that flushes _and empties_ the cache. (And then
we could always go the bdrv_read/write path, so there's some code to
save either way)
I'd personally go for emptying the cache.

+    for (i = 0; i < s->nb_snapshots; i++) {
+        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
+                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
+
+        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+
+        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
+                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        for (j = 0; j < s->snapshots[i].l1_size; j++) {
+            be64_to_cpus(&l1_table[j]);
+        }
+
+        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
+                                         expanded_clusters, nb_clusters);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    ret = 0;
+
+fail:
+    g_free(expanded_clusters);
+    g_free(l1_table);
+    return ret;
+}
Kevin

Max



reply via email to

[Prev in Thread] Current Thread [Next in Thread]