[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH v2 4/8] rdma: unpin support
From: |
mrhines |
Subject: |
[Qemu-devel] [PATCH v2 4/8] rdma: unpin support |
Date: |
Fri, 28 Jun 2013 15:59:59 -0400 |
From: "Michael R. Hines" <address@hidden>
As requested, the protocol now includes memory unpinning support.
This has been implemented in a non-optimized manner, in such a way
that one could devise an LRU or other workload-specific information
on top of the basic mechanism to influence the way unpinning happens
during runtime.
The feature is not yet user-facing, and thus can only be enabled
at compile-time.
Signed-off-by: Michael R. Hines <address@hidden>
---
migration-rdma.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 143 insertions(+)
diff --git a/migration-rdma.c b/migration-rdma.c
index 0bd5e23..6218d48 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -944,6 +944,132 @@ const char *print_wrid(int wrid)
return wrid_desc[wrid];
}
+/*
+ * RDMA requires memory registration (mlock/pinning), but this is not good for
+ * overcommitment.
+ *
+ * In preparation for the future where LRU information or workload-specific
+ * writable working set memory access behavior is available to QEMU
+ * it would be nice to have in place the ability to UN-register/UN-pin
+ * particular memory regions from the RDMA hardware when it is determined that
+ * those regions of memory will likely not be accessed again in the near future.
+ *
+ * While we do not yet have such information right now, the following
+ * compile-time option allows us to perform a non-optimized version of this
+ * behavior.
+ *
+ * By uncommenting this option, you will cause *all* RDMA transfers to be
+ * unregistered immediately after the transfer completes on both sides of the
+ * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
+ *
+ * This will have a terrible impact on migration performance, so until future
+ * workload information or LRU information is available, do not attempt to use
+ * this feature except for basic testing.
+ */
+//#define RDMA_UNREGISTRATION_EXAMPLE
+
+/*
+ * Perform a non-optimized memory unregistration after every transfer
+ * for demonstration purposes, only if pin-all is not requested.
+ *
+ * Potential optimizations:
+ * 1. Start a new thread to run this function continuously
+ *    - for bit clearing
+ *    - and for receipt of unregister messages
+ * 2. Use an LRU.
+ * 3. Use workload hints.
+ */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+/*
+ * Drain the queue of pending chunk unregistrations.
+ *
+ * Starting at rdma->unregister_current, every non-zero work-request id found
+ * in rdma->unregistrations[] is consumed: the slot is zeroed and the cursor
+ * is advanced, wrapping to 0 at RDMA_SIGNALED_SEND_MAX. The loop stops at the
+ * first empty (zero) slot.
+ *
+ * For each entry the chunk's bit in the block's unregister_bitmap is cleared.
+ * If the chunk is still marked in transit_bitmap it is skipped; the entry is
+ * not re-queued here. Otherwise the local memory region is deregistered with
+ * ibv_dereg_mr() and an RDMA_CONTROL_UNREGISTER_REQUEST carrying the chunk
+ * number is sent through qemu_rdma_exchange_send(), together with a response
+ * header of type RDMA_CONTROL_UNREGISTER_FINISHED.
+ *
+ * Returns 0 once the queue is drained, the negated return value of
+ * ibv_dereg_mr() if deregistration fails, or the negative value returned by
+ * qemu_rdma_exchange_send().
+ */
+static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
+{
+    while (rdma->unregistrations[rdma->unregister_current]) {
+        int ret;
+        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
+        /* The chunk number and block index are both packed into the wr_id. */
+        uint64_t chunk =
+            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
+        uint64_t index =
+            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+        RDMALocalBlock *block =
+            &(rdma->local_ram_blocks.block[index]);
+        RDMARegister reg = { .current_index = index };
+        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
+                                 };
+        RDMAControlHeader head = { .len = sizeof(RDMARegister),
+                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
+                                   .repeat = 1,
+                                 };
+
+        DDPRINTF("Processing unregister for chunk: %" PRIu64
+                 " at position %d\n", chunk, rdma->unregister_current);
+
+        /* Consume the slot and advance the cursor around the ring. */
+        rdma->unregistrations[rdma->unregister_current] = 0;
+        rdma->unregister_current++;
+
+        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
+            rdma->unregister_current = 0;
+        }
+
+        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
+
+        /*
+         * The chunk is no longer queued. If it is still being transmitted,
+         * leave its registration alone and move on to the next entry.
+         */
+        clear_bit(chunk, block->unregister_bitmap);
+
+        if (test_bit(chunk, block->transit_bitmap)) {
+            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
+            continue;
+        }
+
+        /*
+         * The local bookkeeping is cleared before the result is checked, so
+         * the stale MR pointer and key are dropped even on failure.
+         */
+        ret = ibv_dereg_mr(block->pmr[chunk]);
+        block->pmr[chunk] = NULL;
+        block->remote_keys[chunk] = 0;
+
+        if (ret != 0) {
+            perror("unregistration chunk failed");
+            return -ret;
+        }
+        rdma->total_registrations--;
+
+        /*
+         * Tell the other side which chunk to unregister.
+         * NOTE(review): register_to_network() presumably converts the
+         * payload to wire byte order -- confirm against its definition.
+         */
+        reg.key.chunk = chunk;
+        register_to_network(&reg);
+        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
+                                      &resp, NULL, NULL);
+        if (ret < 0) {
+            return ret;
+        }
+
+        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
+    }
+
+    return 0;
+}
+
+/*
+ * Queue a chunk for unregistration on a later pass.
+ *
+ * Nothing is transmitted or unpinned here; the work-request id is only
+ * recorded in rdma->unregistrations[] at rdma->unregister_next, and the
+ * chunk's bit is set in the block's unregister_bitmap so that the same chunk
+ * is not queued twice. The write cursor wraps to 0 at RDMA_SIGNALED_SEND_MAX.
+ *
+ * If the slot at the write cursor is still occupied the queue is considered
+ * full: a message is printed to stderr and the request is dropped.
+ */
+static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
+                                        uint64_t chunk, uint64_t wr_id)
+{
+    RDMALocalBlock *block;
+
+    /* A non-zero entry at the write cursor means there is no free slot. */
+    if (rdma->unregistrations[rdma->unregister_next] != 0) {
+        fprintf(stderr, "rdma migration: queue is full!\n");
+        return;
+    }
+
+    block = &(rdma->local_ram_blocks.block[index]);
+
+    /* The bitmap bit doubles as an "already queued" marker. */
+    if (test_and_set_bit(chunk, block->unregister_bitmap)) {
+        DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
+                 chunk);
+        return;
+    }
+
+    DDPRINTF("Appending unregister chunk %" PRIu64
+             " at position %d\n", chunk, rdma->unregister_next);
+
+    rdma->unregistrations[rdma->unregister_next] = wr_id;
+    rdma->unregister_next++;
+
+    if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
+        rdma->unregister_next = 0;
+    }
+}
+#endif
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
uint8_t *data, RDMAControlHeader *resp,
int *resp_idx,
@@ -1006,6 +1132,17 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma,
uint64_t *wr_id_out)
if (rdma->nb_sent > 0) {
rdma->nb_sent--;
}
+ if (!rdma->pin_all) {
+ /*
+ * FYI: If one wanted to signal a specific chunk to be unregistered
+ * using LRU or workload-specific information, this is the function
+ * you would call to do so. That chunk would then get asynchronously
+ * unregistered later.
+ */
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+ qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
+#endif
+ }
} else {
DDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
print_wrid(wr_id), wr_id, rdma->nb_sent);
@@ -1423,6 +1560,12 @@ retry:
chunk_start = ram_chunk_start(block, chunk);
chunk_end = ram_chunk_end(block, chunk);
+ if (!rdma->pin_all) {
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+ qemu_rdma_unregister_waiting(rdma);
+#endif
+ }
+
while (test_bit(chunk, block->transit_bitmap)) {
(void)count;
DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
--
1.7.10.4
- [Qemu-devel] [PATCH v2 0/8] rdma: core logic w/ unpin example, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 5/8] rdma: send pc.ram, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 6/8] rdma: allow state transitions between other states besides ACTIVE, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 2/8] rdma: introduce ram_handle_compressed(), mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 4/8] rdma: unpin support,
mrhines <=
- [Qemu-devel] [PATCH v2 1/8] rdma: update documentation to reflect new unpin support, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 8/8] rdma: account for the time spent in MIG_STATE_SETUP through QMP, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 3/8] rdma: core logic, mrhines, 2013/06/28
- [Qemu-devel] [PATCH v2 7/8] rdma: introduce MIG_STATE_NONE and change MIG_STATE_SETUP state transition, mrhines, 2013/06/28
- Re: [Qemu-devel] [PATCH v2 0/8] rdma: core logic w/ unpin example, Michael R. Hines, 2013/06/28