[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] An RDMA race?
From: |
Dr. David Alan Gilbert |
Subject: |
[Qemu-devel] An RDMA race? |
Date: |
Fri, 11 Dec 2015 17:48:50 +0000 |
User-agent: |
Mutt/1.5.24 (2015-08-30) |
Hi Michael,
I think I've got an RDMA race condition, but I'm being a little
cautious at the moment and wondered if you agree with the following
diagnosis.
It's showing up in a world of mine that's sending more control messages
from the destination->source and I'm seeing the following.
We normally expect:
src dest
----------->control ready->
Sees SEND_CONTROL signal to ack that it has been sent
<-----control message--
Sees RECV_CONTROL message from dest
but what I'm seeing is:
src dest
----------->control ready->
<-----control message--
Sees RECV_CONTROL message from dest
Sees SEND_CONTROL signal to ack that it has been sent
which seems a bit odd - that means that the source is sending
a message, and getting the reply before it gets the acknowledgment
from it's own stack that it sent the message the reply is to ?!
It's rare (~1 in 100 migrations ish) and only in my world
where I'm sending more control messages.
But that confuses qemu_rdma_post_send_control when it sends
the READY, because it calls block_for_wrid(SEND_CONTROL),
that sees the RECV_CONTROL (which it loses) and then the SEND_CONTROL.
Does this sound a sane explanation? My current fix (below)
I only use it for RDMA_CONTROL_READY , I'm torn between worrying
that the race is potentially more general, but worry what will
happen if I stop making post_send_control wait at the end for
all the other control messages.
It seems to work :-)
What do you reckon?
Dave
From 332b867fb0f63be47d89822b7a10b2a519b431fc Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <address@hidden>
Date: Fri, 11 Dec 2015 14:53:11 +0000
Subject: [PATCH] Don't wait for send-control on a ready
---
migration/rdma.c | 46 +++++++++++++++++++++++++++++++++++++++++-----
trace-events | 1 +
2 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/migration/rdma.c b/migration/rdma.c
index b0feddc..6a0e59f 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -314,7 +314,13 @@ typedef struct RDMAContext {
* the READY message before send() does, in which case we need to
* know if it completed.
*/
- int control_ready_expected;
+ bool control_ready_expected;
+
+ /*
+ * Set when we've sent a control message but not yet waited for the
+ * result.
+ */
+ bool control_sent_expected;
/* number of outstanding writes */
int nb_sent;
@@ -1454,6 +1460,13 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma,
uint64_t *wr_id_out,
rdma->control_ready_expected = 0;
}
+ if (rdma->control_sent_expected &&
+ (wr_id >= RDMA_WRID_SEND_CONTROL)) {
+ trace_qemu_rdma_poll_send(wrid_desc[RDMA_WRID_SEND_CONTROL],
+ wr_id - RDMA_WRID_SEND_CONTROL, wr_id, rdma->nb_sent);
+ rdma->control_sent_expected = 0;
+ }
+
if (wr_id == RDMA_WRID_RDMA_WRITE) {
uint64_t chunk =
(wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
@@ -1609,6 +1622,7 @@ static int qemu_rdma_post_send_control(RDMAContext *rdma,
uint8_t *buf,
RDMAControlHeader *head)
{
int ret = 0;
+ bool wait;
RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
struct ibv_send_wr *bad_wr;
struct ibv_sge sge = {
@@ -1626,6 +1640,26 @@ static int qemu_rdma_post_send_control(RDMAContext
*rdma, uint8_t *buf,
trace_qemu_rdma_post_send_control(control_desc[head->type]);
+ if (rdma->control_sent_expected) {
+ ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
+ if (ret < 0) {
+ error_report("%s: send polling control error (entry) %s",
+ __func__, strerror(ret));
+ return ret;
+ }
+ }
+
+ switch (head->type) {
+ case RDMA_CONTROL_READY:
+ wait=false;
+ rdma->control_sent_expected = true;
+ break;
+
+ default:
+ wait=true;
+ break;
+ }
+
/*
* We don't actually need to do a memcpy() in here if we used
* the "sge" properly, but since we're only sending control messages
@@ -1646,13 +1680,15 @@ static int qemu_rdma_post_send_control(RDMAContext
*rdma, uint8_t *buf,
ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
if (ret > 0) {
- error_report("Failed to use post IB SEND for control");
+ error_report("Failed to use post IB SEND for control: %s",
strerror(ret));
return -ret;
}
- ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
- if (ret < 0) {
- error_report("rdma migration: send polling control error");
+ if (wait) {
+ ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
+ if (ret < 0) {
+ error_report("rdma migration: send polling control error");
+ }
}
return ret;
diff --git a/trace-events b/trace-events
index 9043b56..0eeb6b2 100644
--- a/trace-events
+++ b/trace-events
@@ -1447,6 +1447,7 @@ qemu_rdma_exchange_send_received(const char *desc)
"Response %s received."
qemu_rdma_fill(size_t control_len, size_t size) "RDMA %zd of %zd bytes already
in buffer"
qemu_rdma_init_ram_blocks(int blocks) "Allocated %d local ram block structures"
qemu_rdma_poll_recv(const char *compstr, int64_t comp, int64_t id, int sent)
"completion %s #%" PRId64 " received (%" PRId64 ") left %d"
+qemu_rdma_poll_send(const char *compstr, int64_t comp, int64_t id, int sent)
"completion %s #%" PRId64 " received (%" PRId64 ") left %d"
qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t
block, uint64_t chunk, void *local, void *remote) "completions %s (%" PRId64 ")
left %d, block %" PRIu64 ", chunk: %" PRIu64 " %p %p"
qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other
completion %s (%" PRId64 ") received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
--
2.5.0
--
Dr. David Alan Gilbert / address@hidden / Manchester, UK
- [Qemu-devel] An RDMA race?,
Dr. David Alan Gilbert <=