
From: Jason Wang
Subject: Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory
Date: Tue, 15 Jan 2019 15:52:21 +0800
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.2.1


On 2019/1/11 2:10 PM, Yongji Xie wrote:
On Fri, 11 Jan 2019 at 11:56, Jason Wang <address@hidden> wrote:

On 2019/1/9 7:27 PM, address@hidden wrote:
From: Xie Yongji <address@hidden>

This patch adds support for the VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD messages to set/get shared memory
to/from qemu. We then maintain a "bitmap" of all descriptors in
the shared memory for each queue to track inflight I/O.

Signed-off-by: Xie Yongji <address@hidden>
Signed-off-by: Zhang Yu <address@hidden>
---
   Makefile                              |   2 +-
   contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
   contrib/libvhost-user/libvhost-user.h |  29 +++
   3 files changed, 268 insertions(+), 21 deletions(-)
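
To make the layout concrete (a sketch, not code from the patch): the
shared buffer holds one region per queue, laid out back to back and each
padded to INFLIGHT_ALIGNMENT, as the mmap_size computation in
vu_get_inflight_fd() and the mapping loop in vu_set_inflight_fd() below
imply. Assuming VuVirtqInflight exposes the per-descriptor desc[] array
that the .c changes dereference (the header diff is not quoted here),
the addressing works out to:

    /* Hypothetical helper mirroring vu_set_inflight_fd()'s loop:
     * queue i's region starts at i * (padded region size). */
    static VuVirtqInflight *
    queue_inflight_region(void *mmap_addr, uint16_t queue_idx)
    {
        size_t per_queue = ALIGN_UP(sizeof(VuVirtqInflight),
                                    INFLIGHT_ALIGNMENT);
        return (VuVirtqInflight *)((char *)mmap_addr +
                                   (size_t)queue_idx * per_queue);
    }

A nonzero desc[i] means descriptor head i was fetched but its completion
has not yet been pushed to the used ring; vu_check_queue_inflights()
rebuilds the queue state from exactly this information after a reconnect.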

diff --git a/Makefile b/Makefile
index dd53965f77..b5c9092605 100644
--- a/Makefile
+++ b/Makefile
@@ -473,7 +473,7 @@ Makefile: $(version-obj-y)
   # Build libraries

   libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
-libvhost-user.a: $(libvhost-user-obj-y)
+libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)

   ######################################################################

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 23bd52264c..e73ce04619 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -41,6 +41,8 @@
   #endif

   #include "qemu/atomic.h"
+#include "qemu/osdep.h"
+#include "qemu/memfd.h"

   #include "libvhost-user.h"

@@ -53,6 +55,18 @@
               _min1 < _min2 ? _min1 : _min2; })
   #endif

+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/* Align each region to cache line size in inflight buffer */
+#define INFLIGHT_ALIGNMENT 64
+
+/* The version of inflight buffer */
+#define INFLIGHT_VERSION 1
+
   #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)

   /* The version of the protocol we support */
@@ -66,6 +80,20 @@
           }                                       \
       } while (0)

+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline
+bool vu_has_feature(VuDev *dev,
+                    unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
   static const char *
   vu_request_to_string(unsigned int req)
   {
@@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
           REQ(VHOST_USER_POSTCOPY_ADVISE),
           REQ(VHOST_USER_POSTCOPY_LISTEN),
           REQ(VHOST_USER_POSTCOPY_END),
+        REQ(VHOST_USER_GET_INFLIGHT_FD),
+        REQ(VHOST_USER_SET_INFLIGHT_FD),
           REQ(VHOST_USER_MAX),
       };
   #undef REQ
@@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
       return true;
   }

+static int
+vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
+{
+    int i = 0;
+
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->used_idx = vq->vring.used->idx;
+    vq->inflight_num = 0;
+    for (i = 0; i < vq->vring.num; i++) {
+        if (vq->inflight->desc[i] == 0) {
+            continue;
+        }
+
+        vq->inflight_desc[vq->inflight_num++] = i;
+        vq->inuse++;
+    }
+    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vq->kick_fd, 1) ||
+        eventfd_write(vq->call_fd, 1)) {
+        return -1;
+    }
+
+    return 0;
+}
+
   static bool
   vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
   {
@@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
                  dev->vq[index].kick_fd, index);
       }

+    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
+        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
+    }
+
       return false;
   }

@@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
       return true;
   }

+static bool
+vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd;
+    void *addr;
+    uint64_t mmap_size;
+
+    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
+           vmsg->payload.inflight.num_queues);
+
+    mmap_size = vmsg->payload.inflight.num_queues *
+                ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
+
+    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
+                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                            &fd, NULL);
+
+    if (!addr) {
+        vu_panic(dev, "Failed to alloc vhost inflight area");
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    dev->inflight_info.addr = addr;
+    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
+    vmsg->payload.inflight.mmap_offset = 0;
+    vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
+    vmsg->payload.inflight.version = INFLIGHT_VERSION;
+    vmsg->fd_num = 1;
+    dev->inflight_info.fd = vmsg->fds[0] = fd;
+
+    DPRINT("send inflight mmap_size: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_size);
+    DPRINT("send inflight mmap offset: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_offset);
+    DPRINT("send inflight align: %"PRId32"\n",
+           vmsg->payload.inflight.align);
+    DPRINT("send inflight version: %"PRId16"\n",
+           vmsg->payload.inflight.version);
+
+    return true;
+}
+
+static bool
+vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd, i;
+    uint64_t mmap_size, mmap_offset;
+    uint32_t align;
+    uint16_t num_queues, version;
+    void *rc;
+
+    if (vmsg->fd_num != 1 ||
+        vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
+                 vmsg->size, vmsg->fd_num);
+        return false;
+    }
+
+    fd = vmsg->fds[0];
+    mmap_size = vmsg->payload.inflight.mmap_size;
+    mmap_offset = vmsg->payload.inflight.mmap_offset;
+    align = vmsg->payload.inflight.align;
+    num_queues = vmsg->payload.inflight.num_queues;
+    version = vmsg->payload.inflight.version;
+
+    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
+    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
+    DPRINT("set_inflight_fd align: %"PRId32"\n", align);
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd version: %"PRId16"\n", version);
+
+    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+              fd, mmap_offset);
+
+    if (rc == MAP_FAILED) {
+        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
+        return false;
+    }
+
+    if (version != INFLIGHT_VERSION) {
+        vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
+        return false;
+    }
+
+    if (dev->inflight_info.fd) {
+        close(dev->inflight_info.fd);
+    }
+
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+    }
+
+    dev->inflight_info.fd = fd;
+    dev->inflight_info.addr = rc;
+    dev->inflight_info.size = mmap_size;
+
+    for (i = 0; i < num_queues; i++) {
+        dev->vq[i].inflight = (VuVirtqInflight *)rc;
+        rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align));
+    }
+
+    return false;
+}
+
   static bool
   vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
   {
@@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
           return vu_set_postcopy_listen(dev, vmsg);
       case VHOST_USER_POSTCOPY_END:
           return vu_set_postcopy_end(dev, vmsg);
+    case VHOST_USER_GET_INFLIGHT_FD:
+        return vu_get_inflight_fd(dev, vmsg);
+    case VHOST_USER_SET_INFLIGHT_FD:
+        return vu_set_inflight_fd(dev, vmsg);
       default:
           vmsg_close_fds(vmsg);
           vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev)
               close(vq->err_fd);
               vq->err_fd = -1;
           }
+        vq->inflight = NULL;
       }

+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+        dev->inflight_info.addr = NULL;
+    }
+
+    if (dev->inflight_info.fd > 0) {
+        close(dev->inflight_info.fd);
+        dev->inflight_info.fd = -1;
+    }

       vu_close_log(dev);
       if (dev->slave_fd != -1) {
@@ -1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
       return vring_avail_idx(vq) == vq->last_avail_idx;
   }

-static inline
-bool has_feature(uint64_t features, unsigned int fbit)
-{
-    assert(fbit < 64);
-    return !!(features & (1ULL << fbit));
-}
-
-static inline
-bool vu_has_feature(VuDev *dev,
-                    unsigned int fbit)
-{
-    return has_feature(dev->features, fbit);
-}
-
   static bool
   vring_notify(VuDev *dev, VuVirtq *vq)
   {
@@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev,
       *p_num_sg = num_sg;
   }

-/* Round number down to multiple */
-#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
-
-/* Round number up to multiple */
-#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
-
   static void *
   virtqueue_alloc_element(size_t sz,
                                        unsigned out_num, unsigned in_num)
@@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
       return elem;
   }

+static int
+vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+

Just wondering what happens if the backend gets killed at this point?

We will re-calculate last_avail_idx like: last_avail_idx = inuse + used_idx
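
Concretely, with made-up numbers plugged into vu_check_queue_inflights():

    /* after reconnect: the used ring shows idx 7 and two entries
     * are still nonzero in vq->inflight->desc[] */
    vq->used_idx = 7;      /* from vq->vring.used->idx */
    vq->inuse = 2;         /* counted from the desc[] scan */
    vq->shadow_avail_idx = vq->last_avail_idx = 2 + 7;  /* = 9 */

so the two marked heads are resubmitted and new requests resume from
index 9.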


I'm not sure I get you here, but it looks to me that at least one pending descriptor is missed, since you don't set vq->inflight->desc[desc_idx] to 1?



At this point, the backend could still consume this entry correctly after reconnecting.

You want to survive a backend crash, but you still depend on the
backend to get and put inflight descriptors, which seems contradictory.

But if the backend gets killed in vu_queue_inflight_put(), I think you
are right, there is a conflict. One descriptor is consumed by the
guest but still marked as in-use in the inflight buffer. Then we would
re-send this old descriptor after restart.

Maybe we can add something like this to fix the issue:

void vu_queue_push(VuDev *dev, VuVirtq *vq,
                   const VuVirtqElement *elem, unsigned int len)
{
    /* remember which element we are completing */
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_flush(dev, vq, 1);
    /* completion is visible to the guest; drop the inflight mark */
    vq->inflight->desc[elem->idx] = 0;


Is it safe to be killed here?


    /* record how far the used ring has advanced */
    vq->inflight->used_idx = vq->vring.used->idx;
}

static int vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
{
    ....
    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /* we crashed in vu_queue_push() after flushing the used
         * ring but before clearing the inflight mark */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
    }
    ....
}

Thanks,
Yongji


Well, this may work, but here are my points:

1) The code wants to recover from a backend crash by introducing extra space to store inflight data, but it still depends on the backend to set/get the inflight state

2) Since the backend could be killed at any time, it must have the ability to recover from a partial inflight state

So it looks to me that 1) tends to be self-contradictory and 2) tends to be recursive. The lines above show how tricky the code could get.

Solving this at the vhost-user level in the backend is probably wrong. It's time to consider support from virtio itself.

Thanks



