[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PULL 22/26] libvhost-user: Support tracking inflight I/O in shared memory
From: |
Marc-André Lureau |
Subject: |
Re: [Qemu-devel] [PULL 22/26] libvhost-user: Support tracking inflight I/O in shared memory |
Date: |
Sat, 16 Nov 2019 21:42:55 +0400 |
On Wed, Mar 13, 2019 at 6:59 AM Michael S. Tsirkin <address@hidden> wrote:
>
> From: Xie Yongji <address@hidden>
>
> This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> VHOST_USER_SET_INFLIGHT_FD message to set/get shared buffer
> to/from qemu. Then backend can track inflight I/O in this buffer.
>
> Signed-off-by: Xie Yongji <address@hidden>
> Signed-off-by: Zhang Yu <address@hidden>
> Message-Id: <address@hidden>
> Reviewed-by: Michael S. Tsirkin <address@hidden>
> Signed-off-by: Michael S. Tsirkin <address@hidden>
> ---
> Makefile | 2 +-
> contrib/libvhost-user/libvhost-user.h | 70 ++++++
> contrib/libvhost-user/libvhost-user.c | 349 ++++++++++++++++++++++++--
> 3 files changed, 400 insertions(+), 21 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 6ccb8639b0..abd78a9826 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -497,7 +497,7 @@ Makefile: $(version-obj-y)
> # Build libraries
>
> libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> -libvhost-user.a: $(libvhost-user-obj-y)
> +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
>
> ######################################################################
>
> diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> index 3de8414898..414ceb0a2f 100644
> --- a/contrib/libvhost-user/libvhost-user.h
> +++ b/contrib/libvhost-user/libvhost-user.h
> @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
> VHOST_USER_PROTOCOL_F_CONFIG = 9,
> VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
> VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
>
> VHOST_USER_PROTOCOL_F_MAX
> };
> @@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
> VHOST_USER_POSTCOPY_ADVISE = 28,
> VHOST_USER_POSTCOPY_LISTEN = 29,
> VHOST_USER_POSTCOPY_END = 30,
> + VHOST_USER_GET_INFLIGHT_FD = 31,
> + VHOST_USER_SET_INFLIGHT_FD = 32,
> VHOST_USER_MAX
> } VhostUserRequest;
>
> @@ -138,6 +141,13 @@ typedef struct VhostUserVringArea {
> uint64_t offset;
> } VhostUserVringArea;
>
> +typedef struct VhostUserInflight {
> + uint64_t mmap_size;
> + uint64_t mmap_offset;
> + uint16_t num_queues;
> + uint16_t queue_size;
> +} VhostUserInflight;
> +
> #if defined(_WIN32)
> # define VU_PACKED __attribute__((gcc_struct, packed))
> #else
> @@ -163,6 +173,7 @@ typedef struct VhostUserMsg {
> VhostUserLog log;
> VhostUserConfig config;
> VhostUserVringArea area;
> + VhostUserInflight inflight;
> } payload;
>
> int fds[VHOST_MEMORY_MAX_NREGIONS];
> @@ -234,9 +245,61 @@ typedef struct VuRing {
> uint32_t flags;
> } VuRing;
>
> +typedef struct VuDescStateSplit {
> + /* Indicate whether this descriptor is inflight or not.
> + * Only available for head-descriptor. */
> + uint8_t inflight;
> +
> + /* Padding */
> + uint8_t padding[5];
> +
> + /* Maintain a list for the last batch of used descriptors.
> + * Only available when batching is used for submitting */
> + uint16_t next;
> +
> + /* Used to preserve the order of fetching available descriptors.
> + * Only available for head-descriptor. */
> + uint64_t counter;
> +} VuDescStateSplit;
> +
> +typedef struct VuVirtqInflight {
> + /* The feature flags of this region. Now it's initialized to 0. */
> + uint64_t features;
> +
> + /* The version of this region. It's 1 currently.
> + * Zero value indicates a vm reset happened. */
> + uint16_t version;
> +
> + /* The size of VuDescStateSplit array. It's equal to the virtqueue
> + * size. Slave could get it from queue size field of VhostUserInflight. */
> + uint16_t desc_num;
> +
> + /* The head of list that track the last batch of used descriptors. */
> + uint16_t last_batch_head;
> +
> + /* Storing the idx value of used ring */
> + uint16_t used_idx;
> +
> + /* Used to track the state of each descriptor in descriptor table */
> + VuDescStateSplit desc[0];
> +} VuVirtqInflight;
> +
> +typedef struct VuVirtqInflightDesc {
> + uint16_t index;
> + uint64_t counter;
> +} VuVirtqInflightDesc;
> +
> typedef struct VuVirtq {
> VuRing vring;
>
> + VuVirtqInflight *inflight;
> +
> + VuVirtqInflightDesc *resubmit_list;
> +
> + uint16_t resubmit_num;
> +
> + uint64_t counter;
> +
> /* Next head to pop */
> uint16_t last_avail_idx;
>
> @@ -279,11 +342,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
> vu_watch_cb cb, void *data);
> typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
>
> +typedef struct VuDevInflightInfo {
> + int fd;
> + void *addr;
> + uint64_t size;
> +} VuDevInflightInfo;
> +
> struct VuDev {
> int sock;
> uint32_t nregions;
> VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
> + VuDevInflightInfo inflight_info;
> int log_call_fd;
> int slave_fd;
> uint64_t log_size;
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index ddd15d79cf..e08d6c7b97 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -41,6 +41,8 @@
> #endif
>
> #include "qemu/atomic.h"
> +#include "qemu/osdep.h"
> +#include "qemu/memfd.h"
>
> #include "libvhost-user.h"
>
> @@ -53,6 +55,18 @@
> _min1 < _min2 ? _min1 : _min2; })
> #endif
>
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +/* Align each region to cache line size in inflight buffer */
> +#define INFLIGHT_ALIGNMENT 64
> +
> +/* The version of inflight buffer */
> +#define INFLIGHT_VERSION 1
> +
> #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
>
> /* The version of the protocol we support */
> @@ -66,6 +80,20 @@
> } \
> } while (0)
>
> +static inline
> +bool has_feature(uint64_t features, unsigned int fbit)
> +{
> + assert(fbit < 64);
> + return !!(features & (1ULL << fbit));
> +}
> +
> +static inline
> +bool vu_has_feature(VuDev *dev,
> + unsigned int fbit)
> +{
> + return has_feature(dev->features, fbit);
> +}
> +
> static const char *
> vu_request_to_string(unsigned int req)
> {
> @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
> REQ(VHOST_USER_POSTCOPY_ADVISE),
> REQ(VHOST_USER_POSTCOPY_LISTEN),
> REQ(VHOST_USER_POSTCOPY_END),
> + REQ(VHOST_USER_GET_INFLIGHT_FD),
> + REQ(VHOST_USER_SET_INFLIGHT_FD),
> REQ(VHOST_USER_MAX),
> };
> #undef REQ
> @@ -890,6 +920,91 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
> return true;
> }
>
> +static int
> +inflight_desc_compare(const void *a, const void *b)
> +{
> + VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
> + *desc1 = (VuVirtqInflightDesc *)b;
> +
> + if (desc1->counter > desc0->counter &&
> + (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
> + return 1;
> + }
> +
> + return -1;
> +}
> +
> +static int
> +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> +{
> + int i = 0;
> +
> + if (!has_feature(dev->protocol_features,
> + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> + return 0;
> + }
> +
> + if (unlikely(!vq->inflight)) {
> + return -1;
> + }
> +
> + if (unlikely(!vq->inflight->version)) {
> + /* initialize the buffer */
> + vq->inflight->version = INFLIGHT_VERSION;
> + return 0;
> + }
> +
> + vq->used_idx = vq->vring.used->idx;
> + vq->resubmit_num = 0;
> + vq->resubmit_list = NULL;
> + vq->counter = 0;
> +
> + if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
> + vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
> +
> + barrier();
> +
> + vq->inflight->used_idx = vq->used_idx;
> + }
> +
> + for (i = 0; i < vq->inflight->desc_num; i++) {
> + if (vq->inflight->desc[i].inflight == 1) {
> + vq->inuse++;
> + }
> + }
> +
> + vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> +
> + if (vq->inuse) {
> + vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
> + if (!vq->resubmit_list) {
> + return -1;
> + }
> +
> + for (i = 0; i < vq->inflight->desc_num; i++) {
> + if (vq->inflight->desc[i].inflight) {
> + vq->resubmit_list[vq->resubmit_num].index = i;
> + vq->resubmit_list[vq->resubmit_num].counter =
> + vq->inflight->desc[i].counter;
> + vq->resubmit_num++;
> + }
> + }
> +
> + if (vq->resubmit_num > 1) {
> + qsort(vq->resubmit_list, vq->resubmit_num,
> + sizeof(VuVirtqInflightDesc), inflight_desc_compare);
> + }
> + vq->counter = vq->resubmit_list[0].counter + 1;
scan-build reports that vq->resubmit_list[0].counter may be a garbage
value if no entry is initialized in the loop above (that is, if
vq->resubmit_num is still 0 after the loop).
Xie, could you provide a fix?
> + }
> +
> + /* in case of I/O hang after reconnecting */
> + if (eventfd_write(vq->kick_fd, 1)) {
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> static bool
> vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> {
> @@ -923,6 +1038,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> dev->vq[index].kick_fd, index);
> }
>
> + if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> + vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> + }
> +
> return false;
> }
>
> @@ -995,6 +1114,11 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
>
> dev->vq[index].call_fd = vmsg->fds[0];
>
> + /* in case of I/O hang after reconnecting */
> + if (eventfd_write(vmsg->fds[0], 1)) {
> + return -1;
> + }
> +
> DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
>
> return false;
> @@ -1209,6 +1333,116 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
> return true;
> }
>
> +static inline uint64_t
> +vu_inflight_queue_size(uint16_t queue_size)
> +{
> + return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
> + sizeof(uint16_t), INFLIGHT_ALIGNMENT);
> +}
> +
> +static bool
> +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> + int fd;
> + void *addr;
> + uint64_t mmap_size;
> + uint16_t num_queues, queue_size;
> +
> + if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> + vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> + vmsg->payload.inflight.mmap_size = 0;
> + return true;
> + }
> +
> + num_queues = vmsg->payload.inflight.num_queues;
> + queue_size = vmsg->payload.inflight.queue_size;
> +
> + DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> + DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> +
> + mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
> +
> + addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> + &fd, NULL);
> +
> + if (!addr) {
> + vu_panic(dev, "Failed to alloc vhost inflight area");
> + vmsg->payload.inflight.mmap_size = 0;
> + return true;
> + }
> +
> + memset(addr, 0, mmap_size);
> +
> + dev->inflight_info.addr = addr;
> + dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> + dev->inflight_info.fd = vmsg->fds[0] = fd;
> + vmsg->fd_num = 1;
> + vmsg->payload.inflight.mmap_offset = 0;
> +
> + DPRINT("send inflight mmap_size: %"PRId64"\n",
> + vmsg->payload.inflight.mmap_size);
> + DPRINT("send inflight mmap offset: %"PRId64"\n",
> + vmsg->payload.inflight.mmap_offset);
> +
> + return true;
> +}
> +
> +static bool
> +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> + int fd, i;
> + uint64_t mmap_size, mmap_offset;
> + uint16_t num_queues, queue_size;
> + void *rc;
> +
> + if (vmsg->fd_num != 1 ||
> + vmsg->size != sizeof(vmsg->payload.inflight)) {
> + vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> + vmsg->size, vmsg->fd_num);
> + return false;
> + }
> +
> + fd = vmsg->fds[0];
> + mmap_size = vmsg->payload.inflight.mmap_size;
> + mmap_offset = vmsg->payload.inflight.mmap_offset;
> + num_queues = vmsg->payload.inflight.num_queues;
> + queue_size = vmsg->payload.inflight.queue_size;
> +
> + DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> + DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> + DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> + DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> +
> + rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> + fd, mmap_offset);
> +
> + if (rc == MAP_FAILED) {
> + vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> + return false;
> + }
> +
> + if (dev->inflight_info.fd) {
> + close(dev->inflight_info.fd);
> + }
> +
> + if (dev->inflight_info.addr) {
> + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> + }
> +
> + dev->inflight_info.fd = fd;
> + dev->inflight_info.addr = rc;
> + dev->inflight_info.size = mmap_size;
> +
> + for (i = 0; i < num_queues; i++) {
> + dev->vq[i].inflight = (VuVirtqInflight *)rc;
> + dev->vq[i].inflight->desc_num = queue_size;
> + rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
> + }
> +
> + return false;
> +}
> +
> static bool
> vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> {
> @@ -1287,6 +1521,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> return vu_set_postcopy_listen(dev, vmsg);
> case VHOST_USER_POSTCOPY_END:
> return vu_set_postcopy_end(dev, vmsg);
> + case VHOST_USER_GET_INFLIGHT_FD:
> + return vu_get_inflight_fd(dev, vmsg);
> + case VHOST_USER_SET_INFLIGHT_FD:
> + return vu_set_inflight_fd(dev, vmsg);
> default:
> vmsg_close_fds(vmsg);
> vu_panic(dev, "Unhandled request: %d", vmsg->request);
> @@ -1354,8 +1592,24 @@ vu_deinit(VuDev *dev)
> close(vq->err_fd);
> vq->err_fd = -1;
> }
> +
> + if (vq->resubmit_list) {
> + free(vq->resubmit_list);
> + vq->resubmit_list = NULL;
> + }
> +
> + vq->inflight = NULL;
> }
>
> + if (dev->inflight_info.addr) {
> + munmap(dev->inflight_info.addr, dev->inflight_info.size);
> + dev->inflight_info.addr = NULL;
> + }
> +
> + if (dev->inflight_info.fd > 0) {
> + close(dev->inflight_info.fd);
> + dev->inflight_info.fd = -1;
> + }
>
> vu_close_log(dev);
> if (dev->slave_fd != -1) {
> @@ -1682,20 +1936,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
> return vring_avail_idx(vq) == vq->last_avail_idx;
> }
>
> -static inline
> -bool has_feature(uint64_t features, unsigned int fbit)
> -{
> - assert(fbit < 64);
> - return !!(features & (1ULL << fbit));
> -}
> -
> -static inline
> -bool vu_has_feature(VuDev *dev,
> - unsigned int fbit)
> -{
> - return has_feature(dev->features, fbit);
> -}
> -
> static bool
> vring_notify(VuDev *dev, VuVirtq *vq)
> {
> @@ -1824,12 +2064,6 @@ virtqueue_map_desc(VuDev *dev,
> *p_num_sg = num_sg;
> }
>
> -/* Round number down to multiple */
> -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> -
> -/* Round number up to multiple */
> -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> -
> static void *
> virtqueue_alloc_element(size_t sz,
> unsigned out_num, unsigned in_num)
> @@ -1930,9 +2164,68 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
> return elem;
> }
>
> +static int
> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> + if (!has_feature(dev->protocol_features,
> + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> + return 0;
> + }
> +
> + if (unlikely(!vq->inflight)) {
> + return -1;
> + }
> +
> + vq->inflight->desc[desc_idx].counter = vq->counter++;
> + vq->inflight->desc[desc_idx].inflight = 1;
> +
> + return 0;
> +}
> +
> +static int
> +vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> + if (!has_feature(dev->protocol_features,
> + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> + return 0;
> + }
> +
> + if (unlikely(!vq->inflight)) {
> + return -1;
> + }
> +
> + vq->inflight->last_batch_head = desc_idx;
> +
> + return 0;
> +}
> +
> +static int
> +vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> + if (!has_feature(dev->protocol_features,
> + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> + return 0;
> + }
> +
> + if (unlikely(!vq->inflight)) {
> + return -1;
> + }
> +
> + barrier();
> +
> + vq->inflight->desc[desc_idx].inflight = 0;
> +
> + barrier();
> +
> + vq->inflight->used_idx = vq->used_idx;
> +
> + return 0;
> +}
> +
> void *
> vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
> {
> + int i;
> unsigned int head;
> VuVirtqElement *elem;
>
> @@ -1941,6 +2234,18 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
> return NULL;
> }
>
> + if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
> + i = (--vq->resubmit_num);
> + elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
> +
> + if (!vq->resubmit_num) {
> + free(vq->resubmit_list);
> + vq->resubmit_list = NULL;
> + }
> +
> + return elem;
> + }
> +
> if (vu_queue_empty(dev, vq)) {
> return NULL;
> }
> @@ -1971,6 +2276,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>
> vq->inuse++;
>
> + vu_queue_inflight_get(dev, vq, head);
> +
> return elem;
> }
>
> @@ -2131,5 +2438,7 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
> const VuVirtqElement *elem, unsigned int len)
> {
> vu_queue_fill(dev, vq, elem, len, 0);
> + vu_queue_inflight_pre_put(dev, vq, elem->index);
> vu_queue_flush(dev, vq, 1);
> + vu_queue_inflight_post_put(dev, vq, elem->index);
> }
> --
> MST
>
>
--
Marc-André Lureau
- Re: [Qemu-devel] [PULL 22/26] libvhost-user: Support tracking inflight I/O in shared memory,
Marc-André Lureau <=