
[Qemu-devel] [PATCH v2 4/8] rdma: unpin support


From: mrhines
Subject: [Qemu-devel] [PATCH v2 4/8] rdma: unpin support
Date: Fri, 28 Jun 2013 15:59:59 -0400

From: "Michael R. Hines" <address@hidden>

As requested, the protocol now includes memory unpinning support.
This has been implemented in a non-optimized manner, in such a way
that an LRU or other workload-specific policy could later be layered
on top of the basic mechanism to influence when unpinning happens
at runtime.
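
To illustrate what such a policy could look like, here is a rough
sketch (not part of this patch) of an LRU layer driving the hook the
patch provides. qemu_rdma_signal_unregister() is real; the lru_*
helpers and pinned_over_budget() are hypothetical placeholders for
whatever bookkeeping an actual policy would maintain:

    /*
     * Hypothetical policy layer: after each completed transfer,
     * record the access and, if too much memory is pinned, queue
     * the coldest chunk for unregistration. The chunk is actually
     * unpinned later by qemu_rdma_unregister_waiting().
     * lru_touch(), lru_pop_coldest() and pinned_over_budget() do
     * not exist; they stand in for a real policy's bookkeeping.
     */
    static void policy_after_transfer(RDMAContext *rdma,
                                      uint64_t index, uint64_t chunk,
                                      uint64_t wr_id)
    {
        lru_touch(index, chunk);            /* mark most recently used */

        if (pinned_over_budget(rdma)) {
            uint64_t cold_index, cold_chunk, cold_wr_id;

            lru_pop_coldest(&cold_index, &cold_chunk, &cold_wr_id);
            qemu_rdma_signal_unregister(rdma, cold_index, cold_chunk,
                                        cold_wr_id);
        }
    }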

The feature is not yet user-facing, and thus can only be enabled
at compile time.
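
To experiment with it, uncomment the example knob the patch adds near
the top of migration-rdma.c:

    /* migration-rdma.c */
    #define RDMA_UNREGISTRATION_EXAMPLE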

Signed-off-by: Michael R. Hines <address@hidden>
---
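Note for reviewers: per queued chunk, the unregister path in
qemu_rdma_unregister_waiting() below boils down to the following
request/ack exchange on the existing control channel (error handling
elided):

    /*
     * 1. Skip the chunk if it is still in transit (transit_bitmap).
     * 2. ibv_dereg_mr() the chunk's memory region locally.
     * 3. Send RDMA_CONTROL_UNREGISTER_REQUEST carrying the chunk's
     *    RDMARegister { .current_index, .key.chunk }.
     * 4. Block until the peer replies with
     *    RDMA_CONTROL_UNREGISTER_FINISHED (the resp argument to
     *    qemu_rdma_exchange_send).
     */
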
 migration-rdma.c |  143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/migration-rdma.c b/migration-rdma.c
index 0bd5e23..6218d48 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -944,6 +944,132 @@ const char *print_wrid(int wrid)
     return wrid_desc[wrid];
 }
 
+/*
+ * RDMA requires memory registration (mlock/pinning), but this is not good for
+ * overcommitment.
+ *
+ * In preparation for the future where LRU information or workload-specific
+ * writable working set memory access behavior is available to QEMU,
+ * it would be nice to have in place the ability to UN-register/UN-pin
+ * particular memory regions from the RDMA hardware when it is determined
+ * that those regions of memory will likely not be accessed again in the near
+ * future.
+ *
+ * While we do not yet have such information right now, the following
+ * compile-time option allows us to perform a non-optimized version of this
+ * behavior.
+ *
+ * By uncommenting this option, you will cause *all* RDMA transfers to be
+ * unregistered immediately after the transfer completes on both sides of the
+ * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
+ *
+ * This will have a terrible impact on migration performance, so until future
+ * workload information or LRU information is available, do not attempt to use
+ * this feature except for basic testing.
+ */
+//#define RDMA_UNREGISTRATION_EXAMPLE
+
+/*
+ * Perform a non-optimized memory unregistration after every transfer
+ * for demonstration purposes, only if pin-all is not requested.
+ *
+ * Potential optimizations:
+ * 1. Start a new thread to run this function continuously
+ *        - for bit clearing
+ *        - and for receipt of unregister messages
+ * 2. Use an LRU.
+ * 3. Use workload hints.
+ */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
+{
+    while (rdma->unregistrations[rdma->unregister_current]) {
+        int ret;
+        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
+        uint64_t chunk =
+            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
+        uint64_t index =
+            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+        RDMALocalBlock *block =
+            &(rdma->local_ram_blocks.block[index]);
+        RDMARegister reg = { .current_index = index };
+        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
+                                 };
+        RDMAControlHeader head = { .len = sizeof(RDMARegister),
+                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
+                                   .repeat = 1,
+                                 };
+
+        DDPRINTF("Processing unregister for chunk: %" PRIu64 " at position 
%d\n",
+                    chunk, rdma->unregister_current);
+
+        rdma->unregistrations[rdma->unregister_current] = 0;
+        rdma->unregister_current++;
+
+        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
+            rdma->unregister_current = 0;
+        }
+
+        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
+
+        clear_bit(chunk, block->unregister_bitmap);
+
+        if (test_bit(chunk, block->transit_bitmap)) {
+            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
+            continue;
+        }
+
+        ret = ibv_dereg_mr(block->pmr[chunk]);
+        block->pmr[chunk] = NULL;
+        block->remote_keys[chunk] = 0;
+
+        if (ret != 0) {
+            perror("unregistration chunk failed");
+            return -ret;
+        }
+        rdma->total_registrations--;
+
+        reg.key.chunk = chunk;
+        register_to_network(&reg);
+        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
+                                &resp, NULL, NULL);
+        if (ret < 0) {
+            return ret;
+        }
+
+        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
+    }
+
+    return 0;
+}
+
+/*
+ * Set bit for unregistration in the next iteration.
+ * We cannot transmit right here, but will unpin later.
+ */
+static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
+                                        uint64_t chunk, uint64_t wr_id)
+{
+    if (rdma->unregistrations[rdma->unregister_next] != 0) {
+        fprintf(stderr, "rdma migration: queue is full!\n");
+    } else {
+        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
+
+        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
+            DDPRINTF("Appending unregister chunk %" PRIu64
+                    " at position %d\n", chunk, rdma->unregister_next);
+
+            rdma->unregistrations[rdma->unregister_next++] = wr_id;
+
+            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
+                rdma->unregister_next = 0;
+            }
+        } else {
+            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
+                    chunk);
+        }
+    }
+}
+#endif
 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                    uint8_t *data, RDMAControlHeader *resp,
                                    int *resp_idx,
@@ -1006,6 +1132,17 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out)
         if (rdma->nb_sent > 0) {
             rdma->nb_sent--;
         }
+        if (!rdma->pin_all) {
+            /*
+             * FYI: If one wanted to signal a specific chunk to be unregistered
+             * using LRU or workload-specific information, this is the function
+             * you would call to do so. That chunk would then get asynchronously
+             * unregistered later.
+             */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
+#endif
+        }
     } else {
         DDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
             print_wrid(wr_id), wr_id, rdma->nb_sent);
@@ -1423,6 +1560,12 @@ retry:
     chunk_start = ram_chunk_start(block, chunk);
     chunk_end = ram_chunk_end(block, chunk);
 
+    if (!rdma->pin_all) {
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+        qemu_rdma_unregister_waiting(rdma);
+#endif
+    }
+
     while (test_bit(chunk, block->transit_bitmap)) {
         (void)count;
         DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
-- 
1.7.10.4



