qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix disc


From: Vladimir Sementsov-Ogievskiy
Subject: [PATCH v1 2/2] block/qcow2: introduce inflight writes counters: fix discard
Date: Thu, 25 Feb 2021 13:10:39 +0300

There is a bug in qcow2: host cluster can be discarded (refcount
becomes 0) and reused during data write. In this case data write may
pollute another cluster (recently allocated) or even metadata.

To fix the issue let's track inflight writes to host cluster in the
hash table and consider new counter when discarding and reusing host
clusters.

Enable qcow2-discard-during-rewrite as it is fixed.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2.h                                 |   9 +
 block/qcow2-refcount.c                        | 154 +++++++++++++++++-
 block/qcow2.c                                 |  26 ++-
 .../tests/qcow2-discard-during-rewrite        |   2 +-
 4 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 0678073b74..fea2525a76 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -420,6 +420,8 @@ typedef struct BDRVQcow2State {
      * is to convert the image with the desired compression type set.
      */
     Qcow2CompressionType compression_type;
+
+    GHashTable *inflight_writes_counters;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
@@ -896,6 +898,13 @@ int qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
 int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
 
+int qcow2_inflight_writes_inc_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length);
+int qcow2_inflight_writes_dec(BlockDriverState *bs, int64_t offset,
+                              int64_t length);
+int qcow2_inflight_writes_dec_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length);
+
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8e649b008e..0ecb1167a6 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -799,6 +799,145 @@ found:
     }
 }
 
+/*
+ * Qcow2InFlightRefcount is a type for values of s->inflight_writes_counters
+ * hasm map. And it's keys are cluster indices.
+ */
+typedef struct Qcow2InFlightRefcount {
+    /*
+     * Number of in-flight writes to the cluster, always > 0, as when becomes
+     * 0 the entry is removed from s->inflight_writes_counters.
+     */
+    uint64_t inflight_writes_cnt;
+
+    /* Cluster refcount is known to be zero */
+    bool refcount_zero;
+
+    /* Cluster refcount was made zero with this discard-type */
+    enum qcow2_discard_type type;
+} Qcow2InFlightRefcount;
+
+static Qcow2InFlightRefcount *find_infl_wr(BDRVQcow2State *s,
+                                           int64_t cluster_index)
+{
+    Qcow2InFlightRefcount *infl;
+
+    if (!s->inflight_writes_counters) {
+        return NULL;
+    }
+
+    infl = g_hash_table_lookup(s->inflight_writes_counters, &cluster_index);
+
+    if (infl) {
+        assert(infl->inflight_writes_cnt > 0);
+    }
+
+    return infl;
+}
+
+/*
+ * Returns true if there are any in-flight writes to the cluster blocking
+ * its reallocation.
+ */
+static bool has_infl_wr(BDRVQcow2State *s, int64_t cluster_index)
+{
+    return !!find_infl_wr(s, cluster_index);
+}
+
+static int update_inflight_write_cnt(BlockDriverState *bs,
+                                     int64_t offset, int64_t length,
+                                     bool decrease, bool locked)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t start, last, cluster_offset;
+
+    if (locked) {
+        qemu_co_mutex_assert_locked(&s->lock);
+    }
+
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + length - 1);
+    for (cluster_offset = start; cluster_offset <= last;
+         cluster_offset += s->cluster_size)
+    {
+        int64_t cluster_index = cluster_offset >> s->cluster_bits;
+        Qcow2InFlightRefcount *infl = find_infl_wr(s, cluster_index);
+
+        if (!infl) {
+            infl = g_new0(Qcow2InFlightRefcount, 1);
+            g_hash_table_insert(s->inflight_writes_counters,
+                                g_memdup(&cluster_index, 
sizeof(cluster_index)),
+                                infl);
+        }
+
+        if (decrease) {
+            assert(infl->inflight_writes_cnt >= 1);
+            infl->inflight_writes_cnt--;
+        } else {
+            infl->inflight_writes_cnt++;
+        }
+
+        if (infl->inflight_writes_cnt == 0) {
+            bool refcount_zero = infl->refcount_zero;
+            enum qcow2_discard_type type = infl->type;
+
+            g_hash_table_remove(s->inflight_writes_counters, &cluster_index);
+
+            if (refcount_zero) {
+                /*
+                 * Slow path. We must reset normal refcount to actually release
+                 * the cluster.
+                 */
+                int ret;
+
+                if (!locked) {
+                    qemu_co_mutex_lock(&s->lock);
+                }
+                ret = qcow2_update_cluster_refcount(bs, cluster_index, 0,
+                                                    true, type);
+                if (!locked) {
+                    qemu_co_mutex_unlock(&s->lock);
+                }
+
+                if (ret < 0) {
+                    return ret;
+                }
+            }
+        }
+
+    }
+
+    return 0;
+}
+
+/*
+ * It makes sense to call qcow2_inflight_writes_inc() only in same s->lock
+ * critical section where corresponding region was allocated (or taken from L2
+ * table).
+ */
+int qcow2_inflight_writes_inc_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, false, true);
+}
+
+/*
+ * Called with s->lock not locked by caller. Will take s->lock only if need to
+ * release the cluster (refcount is 0 and inflight-write-cnt becomes zero).
+ */
+int qcow2_inflight_writes_dec(BlockDriverState *bs, int64_t offset,
+                              int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, true, false);
+}
+
+/* Called with s->lock locked. */
+int qcow2_inflight_writes_dec_locked(BlockDriverState *bs, int64_t offset,
+                                     int64_t length)
+{
+    return update_inflight_write_cnt(bs, offset, length, true, true);
+}
+
 /* XXX: cache several refcount block clusters ? */
 /* @addend is the absolute value of the addend; if @decrease is set, @addend
  * will be subtracted from the current refcount, otherwise it will be added */
@@ -885,6 +1024,13 @@ static int QEMU_WARN_UNUSED_RESULT 
update_refcount(BlockDriverState *bs,
 
         if (refcount == 0) {
             void *table;
+            Qcow2InFlightRefcount *infl = find_infl_wr(s, cluster_index);
+
+            if (infl) {
+                infl->refcount_zero = true;
+                infl->type = type;
+                continue;
+            }
 
             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                 offset);
@@ -983,7 +1129,7 @@ retry:
 
         if (ret < 0) {
             return ret;
-        } else if (refcount != 0) {
+        } else if (refcount != 0 || has_infl_wr(s, next_cluster_index)) {
             goto retry;
         }
     }
@@ -1046,7 +1192,7 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, 
uint64_t offset,
             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
             if (ret < 0) {
                 return ret;
-            } else if (refcount != 0) {
+            } else if (refcount != 0 || has_infl_wr(s, cluster_index)) {
                 break;
             }
         }
@@ -2294,7 +2440,9 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
          contiguous_free_clusters < cluster_count;
          cluster++)
     {
-        if (!s->get_refcount(*refcount_table, cluster)) {
+        if (!s->get_refcount(*refcount_table, cluster) &&
+            !has_infl_wr(s, cluster))
+        {
             contiguous_free_clusters++;
             if (first_gap) {
                 /* If this is the first free cluster found, update
diff --git a/block/qcow2.c b/block/qcow2.c
index d9f49a52e7..00a5a2eca0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1831,6 +1831,8 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
 #endif
 
     qemu_co_queue_init(&s->thread_task_queue);
+    s->inflight_writes_counters =
+        g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
 
     return ret;
 
@@ -2546,6 +2548,9 @@ out_unlocked:
 
 out_locked:
     qcow2_handle_l2meta(bs, &l2meta, false);
+
+    qcow2_inflight_writes_dec_locked(bs, host_offset, bytes);
+
     qemu_co_mutex_unlock(&s->lock);
 
     qemu_vfree(crypt_buf);
@@ -2605,6 +2610,8 @@ static coroutine_fn int qcow2_co_pwritev_part(
             goto out_locked;
         }
 
+        qcow2_inflight_writes_inc_locked(bs, host_offset, cur_bytes);
+
         qemu_co_mutex_unlock(&s->lock);
 
         if (!aio && cur_bytes != bytes) {
@@ -2707,6 +2714,9 @@ static void qcow2_close(BlockDriverState *bs)
     g_free(s->image_backing_file);
     g_free(s->image_backing_format);
 
+    assert(g_hash_table_size(s->inflight_writes_counters) == 0);
+    g_hash_table_unref(s->inflight_writes_counters);
+
     if (has_data_file(bs)) {
         bdrv_unref_child(bs, s->data_file);
         s->data_file = NULL;
@@ -4097,10 +4107,17 @@ qcow2_co_copy_range_to(BlockDriverState *bs,
             goto fail;
         }
 
+        qcow2_inflight_writes_inc_locked(bs, host_offset, cur_bytes);
+
         qemu_co_mutex_unlock(&s->lock);
+
         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
                                     cur_bytes, read_flags, write_flags);
+
         qemu_co_mutex_lock(&s->lock);
+
+        qcow2_inflight_writes_dec_locked(bs, host_offset, cur_bytes);
+
         if (ret < 0) {
             goto fail;
         }
@@ -4536,13 +4553,20 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
     }
 
     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
-    qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
+        qemu_co_mutex_unlock(&s->lock);
         goto fail;
     }
 
+    qcow2_inflight_writes_inc_locked(bs, cluster_offset, out_len);
+
+    qemu_co_mutex_unlock(&s->lock);
+
     BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
     ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+    qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
     if (ret < 0) {
         goto fail;
     }
diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite 
b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
index 7f0d8a107a..2e2e0d2cb0 100755
--- a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# group: quick disabled
+# group: quick
 #
 # Test discarding (and reusing) host cluster during writing data to it.
 #
-- 
2.29.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]