qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 5/5] libvduse: Add support for reconnecting


From: Yongji Xie
Subject: Re: [PATCH 5/5] libvduse: Add support for reconnecting
Date: Tue, 8 Feb 2022 15:35:27 +0800

On Mon, Feb 7, 2022 at 10:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Tue, Jan 25, 2022 at 09:18:00PM +0800, Xie Yongji wrote:
> > To support reconnecting after restart or crash, VDUSE backend
> > might need to resubmit inflight I/Os. This stores the metadata
> > such as the index of inflight I/O's descriptors to a shm file so
> > that VDUSE backend can restore them during reconnecting.
> >
> > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > ---
> >  block/export/vduse-blk.c        |   4 +-
> >  subprojects/libvduse/libvduse.c | 254 +++++++++++++++++++++++++++++++-
> >  subprojects/libvduse/libvduse.h |   4 +-
> >  3 files changed, 254 insertions(+), 8 deletions(-)
> >
> > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> > index 83845e9a9a..bc14fd798b 100644
> > --- a/block/export/vduse-blk.c
> > +++ b/block/export/vduse-blk.c
> > @@ -232,6 +232,8 @@ static void vduse_blk_enable_queue(VduseDev *dev, 
> > VduseVirtq *vq)
> >
> >      aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> >                         true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
> > +    /* Make sure we don't miss any kick afer reconnecting */
> > +    eventfd_write(vduse_queue_get_fd(vq), 1);
> >  }
> >
> >  static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
> > @@ -388,7 +390,7 @@ static int vduse_blk_exp_create(BlockExport *exp, 
> > BlockExportOptions *opts,
> >                                       features, num_queues,
> >                                       sizeof(struct virtio_blk_config),
> >                                       (char *)&config, &vduse_blk_ops,
> > -                                     vblk_exp);
> > +                                     g_get_tmp_dir(), vblk_exp);
> >      if (!vblk_exp->dev) {
> >          error_setg(errp, "failed to create vduse device");
> >          return -ENOMEM;
> > diff --git a/subprojects/libvduse/libvduse.c 
> > b/subprojects/libvduse/libvduse.c
> > index 7671864bca..ce2f6c7949 100644
> > --- a/subprojects/libvduse/libvduse.c
> > +++ b/subprojects/libvduse/libvduse.c
> > @@ -41,6 +41,8 @@
> >  #define VDUSE_VQ_ALIGN 4096
> >  #define MAX_IOVA_REGIONS 256
> >
> > +#define LOG_ALIGNMENT 64
> > +
> >  /* Round number down to multiple */
> >  #define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> >
> > @@ -51,6 +53,31 @@
> >  #define unlikely(x)   __builtin_expect(!!(x), 0)
> >  #endif
> >
> > +typedef struct VduseDescStateSplit {
> > +    uint8_t inflight;
> > +    uint8_t padding[5];
> > +    uint16_t next;
> > +    uint64_t counter;
> > +} VduseDescStateSplit;
> > +
> > +typedef struct VduseVirtqLogInflight {
> > +    uint64_t features;
> > +    uint16_t version;
> > +    uint16_t desc_num;
> > +    uint16_t last_batch_head;
> > +    uint16_t used_idx;
> > +    VduseDescStateSplit desc[];
> > +} VduseVirtqLogInflight;
> > +
> > +typedef struct VduseVirtqLog {
> > +    VduseVirtqLogInflight inflight;
> > +} VduseVirtqLog;
> > +
> > +typedef struct VduseVirtqInflightDesc {
> > +    uint16_t index;
> > +    uint64_t counter;
> > +} VduseVirtqInflightDesc;
> > +
> >  typedef struct VduseRing {
> >      unsigned int num;
> >      uint64_t desc_addr;
> > @@ -73,6 +100,10 @@ struct VduseVirtq {
> >      bool ready;
> >      int fd;
> >      VduseDev *dev;
> > +    VduseVirtqInflightDesc *resubmit_list;
> > +    uint16_t resubmit_num;
> > +    uint64_t counter;
> > +    VduseVirtqLog *log;
> >  };
> >
> >  typedef struct VduseIovaRegion {
> > @@ -96,8 +127,67 @@ struct VduseDev {
> >      int fd;
> >      int ctrl_fd;
> >      void *priv;
> > +    char *shm_log_dir;
> > +    void *log;
> > +    bool reconnect;
> >  };
> >
> > +static inline size_t vduse_vq_log_size(uint16_t queue_size)
> > +{
> > +    return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
> > +                    sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
> > +}
> > +
> > +static void *vduse_log_get(const char *dir, const char *name, size_t size)
> > +{
> > +    void *ptr = MAP_FAILED;
> > +    char *path;
> > +    int fd;
> > +
> > +    path = (char *)malloc(strlen(dir) + strlen(name) +
> > +                          strlen("/vduse-log-") + 1);
> > +    if (!path) {
> > +        return ptr;
> > +    }
> > +    sprintf(path, "%s/vduse-log-%s", dir, name);
>
> Please use g_strdup_printf() and g_autofree in QEMU code. In libvduse
> code it's okay to use malloc(3), but regular QEMU code should use glib.
>

But this code resides in libvduse currently.

> > +
> > +    fd = open(path, O_RDWR | O_CREAT, 0600);
> > +    if (fd == -1) {
> > +        goto out;
> > +    }
> > +
> > +    if (ftruncate(fd, size) == -1) {
> > +        goto out;
> > +    }
> > +
> > +    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> > +    if (ptr == MAP_FAILED) {
> > +        goto out;
> > +    }
> > +out:
> > +    if (fd > 0) {
> > +        close(fd);
> > +    }
> > +    free(path);
> > +
> > +    return ptr;
> > +}
> > +
> > +static void vduse_log_destroy(const char *dir, const char *name)
> > +{
> > +    char *path;
> > +
> > +    path = (char *)malloc(strlen(dir) + strlen(name) +
> > +                          strlen("/vduse-log-") + 1);
> > +    if (!path) {
> > +        return;
> > +    }
> > +    sprintf(path, "%s/vduse-log-%s", dir, name);
> > +
> > +    unlink(path);
> > +    free(path);
> > +}
> > +
> >  static inline bool has_feature(uint64_t features, unsigned int fbit)
> >  {
> >      assert(fbit < 64);
> > @@ -139,6 +229,98 @@ static int vduse_inject_irq(VduseDev *dev, int index)
> >      return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
> >  }
> >
> > +static int inflight_desc_compare(const void *a, const void *b)
> > +{
> > +    VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
> > +                           *desc1 = (VduseVirtqInflightDesc *)b;
> > +
> > +    if (desc1->counter > desc0->counter &&
> > +        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
> > +        return 1;
> > +    }
> > +
> > +    return -1;
> > +}
> > +
> > +static int vduse_queue_check_inflights(VduseVirtq *vq)
> > +{
> > +    int i = 0;
> > +    VduseDev *dev = vq->dev;
> > +
> > +    vq->used_idx = vq->vring.used->idx;
>
> Is this reading struct vring_used->idx without le16toh()?
>
> > +    vq->resubmit_num = 0;
> > +    vq->resubmit_list = NULL;
> > +    vq->counter = 0;
> > +
> > +    if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
> > +        vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight 
> > = 0;
>
> I suggest validating vq->log->inflight fields before using them.
> last_batch_head must be less than the virtqueue size. Although the log
> file is somewhat trusted, there may still be ways to corrupt it or
> confuse the new process that loads it.
>

I can validate the last_batch_head field. But it's hard to validate
the inflight field, so we might still meet some issues if the file is
corrupted.

> > +
> > +        barrier();
> > +
> > +        vq->log->inflight.used_idx = vq->used_idx;
> > +    }
> > +
> > +    for (i = 0; i < vq->log->inflight.desc_num; i++) {
> > +        if (vq->log->inflight.desc[i].inflight == 1) {
> > +            vq->inuse++;
> > +        }
> > +    }
> > +
> > +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> > +
> > +    if (vq->inuse) {
> > +        vq->resubmit_list = calloc(vq->inuse, 
> > sizeof(VduseVirtqInflightDesc));
> > +        if (!vq->resubmit_list) {
> > +            return -1;
> > +        }
> > +
> > +        for (i = 0; i < vq->log->inflight.desc_num; i++) {
> > +            if (vq->log->inflight.desc[i].inflight) {
> > +                vq->resubmit_list[vq->resubmit_num].index = i;
> > +                vq->resubmit_list[vq->resubmit_num].counter =
> > +                                        vq->log->inflight.desc[i].counter;
> > +                vq->resubmit_num++;
> > +            }
> > +        }
> > +
> > +        if (vq->resubmit_num > 1) {
> > +            qsort(vq->resubmit_list, vq->resubmit_num,
> > +                  sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
> > +        }
> > +        vq->counter = vq->resubmit_list[0].counter + 1;
> > +    }
> > +
> > +    vduse_inject_irq(dev, vq->index);
> > +
> > +    return 0;
> > +}
> > +
> > +static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
> > +{
> > +    vq->log->inflight.desc[desc_idx].counter = vq->counter++;
> > +    vq->log->inflight.desc[desc_idx].inflight = 1;
> > +
> > +    return 0;
> > +}
> > +
> > +static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
> > +{
> > +    vq->log->inflight.last_batch_head = desc_idx;
> > +
> > +    return 0;
> > +}
> > +
> > +static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
> > +{
> > +    vq->log->inflight.desc[desc_idx].inflight = 0;
> > +
> > +    barrier();
> > +
> > +    vq->log->inflight.used_idx = vq->used_idx;
> > +
> > +    return 0;
> > +}
> > +
> >  static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
> >                                       uint64_t last)
> >  {
> > @@ -578,11 +760,24 @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
> >      unsigned int head;
> >      VduseVirtqElement *elem;
> >      VduseDev *dev = vq->dev;
> > +    int i;
> >
> >      if (unlikely(!vq->vring.avail)) {
> >          return NULL;
> >      }
> >
> > +    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
> > +        i = (--vq->resubmit_num);
> > +        elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
> > +
> > +        if (!vq->resubmit_num) {
> > +            free(vq->resubmit_list);
> > +            vq->resubmit_list = NULL;
> > +        }
>
> resubmit_list is only freed when vduse_queue_pop() is called often
> enough to empty the list. Please free the list when the vduse instance
> is destroyed too, just in case vduse_queue_pop() wasn't called often
> enough to free it.
>

Will do it.

> > +
> > +        return elem;
> > +    }
> > +
> >      if (vduse_queue_empty(vq)) {
> >          return NULL;
> >      }
> > @@ -610,6 +805,8 @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
> >
> >      vq->inuse++;
> >
> > +    vduse_queue_inflight_get(vq, head);
> > +
> >      return elem;
> >  }
> >
> > @@ -667,7 +864,9 @@ void vduse_queue_push(VduseVirtq *vq, const 
> > VduseVirtqElement *elem,
> >                        unsigned int len)
> >  {
> >      vduse_queue_fill(vq, elem, len, 0);
> > +    vduse_queue_inflight_pre_put(vq, elem->index);
> >      vduse_queue_flush(vq, 1);
> > +    vduse_queue_inflight_post_put(vq, elem->index);
> >  }
> >
> >  static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
> > @@ -740,12 +939,11 @@ static void vduse_queue_enable(VduseVirtq *vq)
> >      }
> >
> >      vq->fd = fd;
> > -    vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
> > -    vq->inuse = 0;
> > -    vq->used_idx = 0;
> >      vq->signalled_used_valid = false;
> >      vq->ready = true;
> >
> > +    vduse_queue_check_inflights(vq);
> > +
> >      dev->ops->enable_queue(dev, vq);
> >  }
> >
> > @@ -903,13 +1101,18 @@ int vduse_dev_setup_queue(VduseDev *dev, int index, 
> > int max_size)
> >          return -errno;
> >      }
> >
> > +    if (dev->reconnect) {
> > +        vduse_queue_enable(vq);
> > +    }
> > +
> >      return 0;
> >  }
> >
> >  VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> >                             uint32_t vendor_id, uint64_t features,
> >                             uint16_t num_queues, uint32_t config_size,
> > -                           char *config, const VduseOps *ops, void *priv)
> > +                           char *config, const VduseOps *ops,
> > +                           const char *shm_log_dir, void *priv)
> >  {
> >      VduseDev *dev;
> >      int i, ret, ctrl_fd, fd = -1;
> > @@ -918,6 +1121,8 @@ VduseDev *vduse_dev_create(const char *name, uint32_t 
> > device_id,
> >      VduseVirtq *vqs = NULL;
> >      struct vduse_dev_config *dev_config = NULL;
> >      size_t size = offsetof(struct vduse_dev_config, config);
> > +    size_t log_size = num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
> > +    void *log = NULL;
> >
> >      if (!name || strlen(name) > VDUSE_NAME_MAX || !config ||
> >          !config_size || !ops || !ops->enable_queue || !ops->disable_queue) 
> > {
> > @@ -932,6 +1137,15 @@ VduseDev *vduse_dev_create(const char *name, uint32_t 
> > device_id,
> >      }
> >      memset(dev, 0, sizeof(VduseDev));
> >
> > +    if (shm_log_dir) {
> > +        dev->log = log = vduse_log_get(shm_log_dir, name, log_size);
> > +        if (!log) {
> > +            fprintf(stderr, "Failed to get vduse log\n");
> > +            goto err_ctrl;
> > +        }
> > +        dev->shm_log_dir = strdup(shm_log_dir);
> > +    }
> > +
> >      ctrl_fd = open("/dev/vduse/control", O_RDWR);
> >      if (ctrl_fd < 0) {
> >          fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
> > @@ -964,7 +1178,11 @@ VduseDev *vduse_dev_create(const char *name, uint32_t 
> > device_id,
> >
> >      ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
> >      free(dev_config);
> > -    if (ret < 0) {
> > +    if (!ret && log) {
> > +        memset(log, 0, log_size);
> > +    } else if (errno == EEXIST && log) {
> > +        dev->reconnect = true;
> > +    } else {
> >          fprintf(stderr, "Failed to create vduse dev %s: %s\n",
> >                  name, strerror(errno));
> >          goto err_dev;
> > @@ -978,6 +1196,12 @@ VduseDev *vduse_dev_create(const char *name, uint32_t 
> > device_id,
> >          goto err;
> >      }
> >
> > +    if (dev->reconnect &&
> > +        ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
> > +        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
> > +        goto err;
> > +    }
> > +
> >      vqs = calloc(sizeof(VduseVirtq), num_queues);
> >      if (!vqs) {
> >          fprintf(stderr, "Failed to allocate virtqueues\n");
> > @@ -988,6 +1212,12 @@ VduseDev *vduse_dev_create(const char *name, uint32_t 
> > device_id,
> >          vqs[i].index = i;
> >          vqs[i].dev = dev;
> >          vqs[i].fd = -1;
> > +        if (log) {
> > +            vqs[i].log = log;
> > +            vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
> > +            log = (void *)((char *)log +
> > +                  vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
>
> The size of the log needs to be verified. The file is mmapped but
> there's no guarantee that the size matches num_queues *
> vduse_vq_log_size(VIRTQUEUE_MAX_SIZE).
>

We will call ftruncate() in vduse_log_get(). Is it enough?

Thanks,
Yongji



reply via email to

[Prev in Thread] Current Thread [Next in Thread]