[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking
From: |
Yongji Xie |
Subject: |
Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory |
Date: |
Fri, 11 Jan 2019 14:10:46 +0800 |
On Fri, 11 Jan 2019 at 11:56, Jason Wang <address@hidden> wrote:
>
>
> On 2019/1/9 下午7:27, address@hidden wrote:
> > From: Xie Yongji <address@hidden>
> >
> > This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> > VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
> > to/from qemu. Then we maintain a "bitmap" of all descriptors in
> > the shared memory for each queue to track inflight I/O.
> >
> > Signed-off-by: Xie Yongji <address@hidden>
> > Signed-off-by: Zhang Yu <address@hidden>
> > ---
> > Makefile | 2 +-
> > contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
> > contrib/libvhost-user/libvhost-user.h | 29 +++
> > 3 files changed, 268 insertions(+), 21 deletions(-)
> >
> > diff --git a/Makefile b/Makefile
> > index dd53965f77..b5c9092605 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -473,7 +473,7 @@ Makefile: $(version-obj-y)
> > # Build libraries
> >
> > libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> > -libvhost-user.a: $(libvhost-user-obj-y)
> > +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
> >
> > ######################################################################
> >
> > diff --git a/contrib/libvhost-user/libvhost-user.c
> > b/contrib/libvhost-user/libvhost-user.c
> > index 23bd52264c..e73ce04619 100644
> > --- a/contrib/libvhost-user/libvhost-user.c
> > +++ b/contrib/libvhost-user/libvhost-user.c
> > @@ -41,6 +41,8 @@
> > #endif
> >
> > #include "qemu/atomic.h"
> > +#include "qemu/osdep.h"
> > +#include "qemu/memfd.h"
> >
> > #include "libvhost-user.h"
> >
> > @@ -53,6 +55,18 @@
> > _min1 < _min2 ? _min1 : _min2; })
> > #endif
> >
> > +/* Round number down to multiple */
> > +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > +
> > +/* Round number up to multiple */
> > +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > +
> > +/* Align each region to cache line size in inflight buffer */
> > +#define INFLIGHT_ALIGNMENT 64
> > +
> > +/* The version of inflight buffer */
> > +#define INFLIGHT_VERSION 1
> > +
> > #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
> >
> > /* The version of the protocol we support */
> > @@ -66,6 +80,20 @@
> > } \
> > } while (0)
> >
> > +static inline
> > +bool has_feature(uint64_t features, unsigned int fbit)
> > +{
> > + assert(fbit < 64);
> > + return !!(features & (1ULL << fbit));
> > +}
> > +
> > +static inline
> > +bool vu_has_feature(VuDev *dev,
> > + unsigned int fbit)
> > +{
> > + return has_feature(dev->features, fbit);
> > +}
> > +
> > static const char *
> > vu_request_to_string(unsigned int req)
> > {
> > @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
> > REQ(VHOST_USER_POSTCOPY_ADVISE),
> > REQ(VHOST_USER_POSTCOPY_LISTEN),
> > REQ(VHOST_USER_POSTCOPY_END),
> > + REQ(VHOST_USER_GET_INFLIGHT_FD),
> > + REQ(VHOST_USER_SET_INFLIGHT_FD),
> > REQ(VHOST_USER_MAX),
> > };
> > #undef REQ
> > @@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
> > return true;
> > }
> >
> > +static int
> > +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> > +{
> > + int i = 0;
> > +
> > + if (!has_feature(dev->protocol_features,
> > + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > + return 0;
> > + }
> > +
> > + if (unlikely(!vq->inflight)) {
> > + return -1;
> > + }
> > +
> > + vq->used_idx = vq->vring.used->idx;
> > + vq->inflight_num = 0;
> > + for (i = 0; i < vq->vring.num; i++) {
> > + if (vq->inflight->desc[i] == 0) {
> > + continue;
> > + }
> > +
> > + vq->inflight_desc[vq->inflight_num++] = i;
> > + vq->inuse++;
> > + }
> > + vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> > +
> > + /* in case of I/O hang after reconnecting */
> > + if (eventfd_write(vq->kick_fd, 1) ||
> > + eventfd_write(vq->call_fd, 1)) {
> > + return -1;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > static bool
> > vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> > {
> > @@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> > dev->vq[index].kick_fd, index);
> > }
> >
> > + if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> > + vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> > + }
> > +
> > return false;
> > }
> >
> > @@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
> > return true;
> > }
> >
> > +static bool
> > +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > + int fd;
> > + void *addr;
> > + uint64_t mmap_size;
> > +
> > + if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> > + vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> > + vmsg->payload.inflight.mmap_size = 0;
> > + return true;
> > + }
> > +
> > + DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
> > + vmsg->payload.inflight.num_queues);
> > +
> > + mmap_size = vmsg->payload.inflight.num_queues *
> > + ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
> > +
> > + addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> > + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> > + &fd, NULL);
> > +
> > + if (!addr) {
> > + vu_panic(dev, "Failed to alloc vhost inflight area");
> > + vmsg->payload.inflight.mmap_size = 0;
> > + return true;
> > + }
> > +
> > + dev->inflight_info.addr = addr;
> > + dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> > + vmsg->payload.inflight.mmap_offset = 0;
> > + vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
> > + vmsg->payload.inflight.version = INFLIGHT_VERSION;
> > + vmsg->fd_num = 1;
> > + dev->inflight_info.fd = vmsg->fds[0] = fd;
> > +
> > + DPRINT("send inflight mmap_size: %"PRId64"\n",
> > + vmsg->payload.inflight.mmap_size);
> > + DPRINT("send inflight mmap offset: %"PRId64"\n",
> > + vmsg->payload.inflight.mmap_offset);
> > + DPRINT("send inflight align: %"PRId32"\n",
> > + vmsg->payload.inflight.align);
> > + DPRINT("send inflight version: %"PRId16"\n",
> > + vmsg->payload.inflight.version);
> > +
> > + return true;
> > +}
> > +
> > +static bool
> > +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > + int fd, i;
> > + uint64_t mmap_size, mmap_offset;
> > + uint32_t align;
> > + uint16_t num_queues, version;
> > + void *rc;
> > +
> > + if (vmsg->fd_num != 1 ||
> > + vmsg->size != sizeof(vmsg->payload.inflight)) {
> > + vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> > + vmsg->size, vmsg->fd_num);
> > + return false;
> > + }
> > +
> > + fd = vmsg->fds[0];
> > + mmap_size = vmsg->payload.inflight.mmap_size;
> > + mmap_offset = vmsg->payload.inflight.mmap_offset;
> > + align = vmsg->payload.inflight.align;
> > + num_queues = vmsg->payload.inflight.num_queues;
> > + version = vmsg->payload.inflight.version;
> > +
> > + DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> > + DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> > + DPRINT("set_inflight_fd align: %"PRId32"\n", align);
> > + DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> > + DPRINT("set_inflight_fd version: %"PRId16"\n", version);
> > +
> > + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > + fd, mmap_offset);
> > +
> > + if (rc == MAP_FAILED) {
> > + vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> > + return false;
> > + }
> > +
> > + if (version != INFLIGHT_VERSION) {
> > + vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
> > + return false;
> > + }
> > +
> > + if (dev->inflight_info.fd) {
> > + close(dev->inflight_info.fd);
> > + }
> > +
> > + if (dev->inflight_info.addr) {
> > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > + }
> > +
> > + dev->inflight_info.fd = fd;
> > + dev->inflight_info.addr = rc;
> > + dev->inflight_info.size = mmap_size;
> > +
> > + for (i = 0; i < num_queues; i++) {
> > + dev->vq[i].inflight = (VuVirtqInflight *)rc;
> > + rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight),
> > align));
> > + }
> > +
> > + return false;
> > +}
> > +
> > static bool
> > vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> > {
> > @@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> > return vu_set_postcopy_listen(dev, vmsg);
> > case VHOST_USER_POSTCOPY_END:
> > return vu_set_postcopy_end(dev, vmsg);
> > + case VHOST_USER_GET_INFLIGHT_FD:
> > + return vu_get_inflight_fd(dev, vmsg);
> > + case VHOST_USER_SET_INFLIGHT_FD:
> > + return vu_set_inflight_fd(dev, vmsg);
> > default:
> > vmsg_close_fds(vmsg);
> > vu_panic(dev, "Unhandled request: %d", vmsg->request);
> > @@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev)
> > close(vq->err_fd);
> > vq->err_fd = -1;
> > }
> > + vq->inflight = NULL;
> > }
> >
> > + if (dev->inflight_info.addr) {
> > + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > + dev->inflight_info.addr = NULL;
> > + }
> > +
> > + if (dev->inflight_info.fd > 0) {
> > + close(dev->inflight_info.fd);
> > + dev->inflight_info.fd = -1;
> > + }
> >
> > vu_close_log(dev);
> > if (dev->slave_fd != -1) {
> > @@ -1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
> > return vring_avail_idx(vq) == vq->last_avail_idx;
> > }
> >
> > -static inline
> > -bool has_feature(uint64_t features, unsigned int fbit)
> > -{
> > - assert(fbit < 64);
> > - return !!(features & (1ULL << fbit));
> > -}
> > -
> > -static inline
> > -bool vu_has_feature(VuDev *dev,
> > - unsigned int fbit)
> > -{
> > - return has_feature(dev->features, fbit);
> > -}
> > -
> > static bool
> > vring_notify(VuDev *dev, VuVirtq *vq)
> > {
> > @@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev,
> > *p_num_sg = num_sg;
> > }
> >
> > -/* Round number down to multiple */
> > -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > -
> > -/* Round number up to multiple */
> > -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > -
> > static void *
> > virtqueue_alloc_element(size_t sz,
> > unsigned out_num, unsigned in_num)
> > @@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned
> > int idx, size_t sz)
> > return elem;
> > }
> >
> > +static int
> > +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > + if (!has_feature(dev->protocol_features,
> > + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > + return 0;
> > + }
> > +
> > + if (unlikely(!vq->inflight)) {
> > + return -1;
> > + }
> > +
>
>
> Just wonder what happens if backend get killed at this point?
>
We will re-caculate last_avail_idx like: last_avail_idx = inuse + used_idx
At this point, backend could consume this entry correctly after reconnect.
> You want to survive from the backend crash but you still depend on
> backend to get and put inflight descriptors which seems somehow conflict.
>
But if backend get killed in vu_queue_inflight_put(), I think you are
right, there is something conflict. One descriptor is consumed by
guest but still marked as inused in inflight buffer. Then we will
re-send this old descriptor after restart.
Maybe we can add something like that to fix this issue:
void vu_queue_push()
{
vq->inflight->elem_idx = elem->idx;
vu_queue_fill();
vu_queue_flush();
vq->inflight->desc[elem->idx] = 0;
vq->inflight->used_idx = vq->vring.used->idx;
}
static int vu_check_queue_inflights()
{
....
if (vq->inflight->used_idx != vq->vring.used->idx) {
/* Crash in vu_queue_push() */
vq->inflight->desc[vq->inflight->elem_idx] = 0;
}
....
}
Thanks,
Yongji
- Re: [Qemu-devel] [PATCH v4 for-4.0 1/7] char-socket: Enable "nowait" option on client sockets, (continued)
[Qemu-devel] [PATCH v4 for-4.0 3/7] libvhost-user: Introduce vu_queue_map_desc(), elohimes, 2019/01/09
[Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, elohimes, 2019/01/09
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Jason Wang, 2019/01/10
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory,
Yongji Xie <=
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Jason Wang, 2019/01/15
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Yongji Xie, 2019/01/15
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Jason Wang, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Michael S. Tsirkin, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Jason Wang, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Yongji Xie, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Michael S. Tsirkin, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Jason Wang, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Michael S. Tsirkin, 2019/01/17
- Re: [Qemu-devel] [PATCH v4 for-4.0 4/7] libvhost-user: Support tracking inflight I/O in shared memory, Yongji Xie, 2019/01/18