qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [Qemu-stable][PATCH] rdma: fix multiple VMs parallel migration


From: Frank Yang
Subject: Re: [Qemu-devel] [Qemu-stable][PATCH] rdma: fix multiple VMs parallel migration
Date: Wed, 04 Sep 2013 10:42:48 +0800
User-agent: Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20130801 Thunderbird/17.0.8

On 2013-9-3 13:03, Lei Li wrote:
> Hi Frank,
>
> I failed to apply this patch. Please make sure to use git-send-email, 
> otherwise
> it's a little hard to review. :)
>
> On 08/30/2013 08:39 PM, Frank Yang wrote:
>> When several VMs migrate with RDMA at the same time, the increased pressure 
>> causes packet loss probabilistically and makes the source and destination wait 
>> for each other. Some of the VMs might be blocked during the migration.
>>
>> Fix the bug by using two completion queues, for sending and receiving 
>> respectively.
>
>>
>> From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
>> From: Frank Yang <address@hidden <mailto:address@hidden>>
>> Date: Fri, 30 Aug 2013 17:53:34 +0800
>> Subject: [PATCH] rdma: fix multiple VMs parallel migration
>
> The commit message should be here within the patch. You can use 'git commit 
> --amend'
> to add it.
>  
>
>>
>> Signed-off-by: Frank Yang <address@hidden <mailto:address@hidden>>
>> ---
>>  migration-rdma.c | 57 
>> ++++++++++++++++++++++++++++++++++++--------------------
>>  1 file changed, 37 insertions(+), 20 deletions(-)
>>
>> diff --git a/migration-rdma.c b/migration-rdma.c
>> index 3d1266f..d0eacbb 100644
>> --- a/migration-rdma.c
>> +++ b/migration-rdma.c
>> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>>      struct ibv_qp *qp;                      /* queue pair */
>>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>>      struct ibv_pd *pd;                      /* protection domain */
>> -    struct ibv_cq *cq;                      /* completion queue */
>> +    struct ibv_cq *send_cq;                 /* send completion queue */
>> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>>      /*
>>       * If a previous write failed (perhaps because of a failed
>> @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>>       * Completion queue can be filled by both read and write work requests,
>>       * so must reflect the sum of both possible queue sizes.
>>       */
>> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
>> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>>              NULL, rdma->comp_channel, 0);
>> -    if (!rdma->cq) {
>> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
>> +            rdma->comp_channel, 0);
>> +
>> +    if (!rdma->send_cq || !rdma->recv_cq) {
>>          fprintf(stderr, "failed to allocate completion queue\n");
>>          goto err_alloc_pd_cq;
>>      }
>> @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>>      attr.cap.max_recv_wr = 3;
>>      attr.cap.max_send_sge = 1;
>>      attr.cap.max_recv_sge = 1;
>> -    attr.send_cq = rdma->cq;
>> -    attr.recv_cq = rdma->cq;
>> +    attr.send_cq = rdma->send_cq;
>> +    attr.recv_cq = rdma->recv_cq;
>>      attr.qp_type = IBV_QPT_RC;
>>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
>> @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext 
>> *rdma, uint64_t index,
>>   * Return the work request ID that completed.
>>   */
>>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
>> -                               uint32_t *byte_len)
>> +                               uint32_t *byte_len, int wrid_requested)
>>  {
>>      int ret;
>>      struct ibv_wc wc;
>>      uint64_t wr_id;
>> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
>> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
>> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
>> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
>> +    }
>>      if (!ret) {
>>          *wr_id_out = RDMA_WRID_NONE;
>> @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
>> *rdma, int wrid_requested,
>>      void *cq_ctx;
>>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
>> -        return -1;
>> -    }
>>      /* poll cq first */
>>      while (wr_id != wrid_requested) {
>> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>          if (ret < 0) {
>>              return ret;
>>          }
>> @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
>> *rdma, int wrid_requested,
>>      }
>>      while (1) {
>> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
>> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
>> +                return -1;
>> +            }
>> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
>> +                return -1;
>> +            }
>> +        }
>> +
>>          /*
>>           * Coroutine doesn't start until process_incoming_migration()
>>           * so don't yield unless we know we're running inside of a 
>> coroutine.
>> @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext 
>> *rdma, int wrid_requested,
>>          num_cq_events++;
>> -        if (ibv_req_notify_cq(cq, 0)) {
>> -            goto err_block_for_wrid;
>> -        }
>> -
>>          while (wr_id != wrid_requested) {
>> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>              if (ret < 0) {
>>                  goto err_block_for_wrid;
>>              }
>> @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>>          ibv_destroy_qp(rdma->qp);
>>          rdma->qp = NULL;
>>      }
>> -    if (rdma->cq) {
>> -        ibv_destroy_cq(rdma->cq);
>> -        rdma->cq = NULL;
>> +    if (rdma->send_cq) {
>> +        ibv_destroy_cq(rdma->send_cq);
>> +        rdma->send_cq = NULL;
>> +    }
>> +    if (rdma->recv_cq) {
>> +        ibv_destroy_cq(rdma->recv_cq);
>> +        rdma->recv_cq = NULL;
>>      }
>>      if (rdma->comp_channel) {
>>  ibv_destroy_comp_channel(rdma->comp_channel);
>> @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
>> *opaque,
>>       */
>>      while (1) {
>>          uint64_t wr_id, wr_id_in;
>> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
>> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, 
>> RDMA_WRID_RDMA_WRITE);
>>          if (ret < 0) {
>>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>>              goto err;
>> -- 
>> 1.8.3.msysgit.0
>>
>>
>
>
Understood. Thank you. The following patch should be fine.

>From 7b7d2c5b51c53c23f7194d35b469dedd892ef89f Mon Sep 17 00:00:00 2001
From: Frank Yang <address@hidden>
Date: Tue, 3 Sep 2013 18:26:54 +0800
Subject: [PATCH] rdma: fix multiple VMs parallel migration

Signed-off-by: Frank Yang <address@hidden>
---
 migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..30f8c11 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */
 
     /*
      * If a previous write failed (perhaps because of a failed
@@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
     }
 
     /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Create two completion queues for sending and receiving
+     * respectively.
+     * Send completion queue can be filled by both send and
+     * write work requests, so must reflect the sum of both
+     * possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
             NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+            rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");
         goto err_alloc_pd_cq;
     }
@@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;
 
     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext 
*rdma, uint64_t index,
  * Return the work request ID that completed.
  */
 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
 {
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;
 
-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }
 
     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, 
int wrid_requested,
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
 
-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
         if (ret < 0) {
             return ret;
         }
@@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, 
int wrid_requested,
     }
 
     while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
         /*
          * Coroutine doesn't start until process_incoming_migration()
          * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, 
int wrid_requested,
 
         num_cq_events++;
 
-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_destroy_qp(rdma->qp);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     if (rdma->comp_channel) {
         ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
         if (ret < 0) {
             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
             goto err;
-- 
1.8.3.msysgit.0





reply via email to

[Prev in Thread] Current Thread [Next in Thread]