qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v4 3/5] vhost-user block device backend server


From: Kevin Wolf
Subject: Re: [PATCH v4 3/5] vhost-user block device backend server
Date: Tue, 25 Feb 2020 17:09:58 +0100
User-agent: Mutt/1.12.1 (2019-06-15)

Am 18.02.2020 um 06:07 hat Coiby Xu geschrieben:
> By making use of libvhost, multiple block device drives can be exported
> and each drive can serve multiple clients simultaneously.
> Since vhost-user-server needs a block drive to be created first, delay
> the creation of this object.
> 
> Signed-off-by: Coiby Xu <address@hidden>
> ---
>  Makefile.target                  |   1 +
>  backends/Makefile.objs           |   2 +
>  backends/vhost-user-blk-server.c | 718 +++++++++++++++++++++++++++++++
>  backends/vhost-user-blk-server.h |  21 +
>  vl.c                             |   4 +
>  5 files changed, 746 insertions(+)
>  create mode 100644 backends/vhost-user-blk-server.c
>  create mode 100644 backends/vhost-user-blk-server.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index 6e61f607b1..8c6c01eb3a 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -159,6 +159,7 @@ obj-y += monitor/
>  obj-y += qapi/
>  obj-y += memory.o
>  obj-y += memory_mapping.o
> +obj-$(CONFIG_LINUX) += ../contrib/libvhost-user/libvhost-user.o
>  obj-y += migration/ram.o
>  LIBS := $(libs_softmmu) $(LIBS)
>  
> diff --git a/backends/Makefile.objs b/backends/Makefile.objs
> index 28a847cd57..4e7be731e0 100644
> --- a/backends/Makefile.objs
> +++ b/backends/Makefile.objs
> @@ -14,6 +14,8 @@ common-obj-y += cryptodev-vhost.o
>  common-obj-$(CONFIG_VHOST_CRYPTO) += cryptodev-vhost-user.o
>  endif
>  
> +common-obj-$(CONFIG_LINUX) += vhost-user-blk-server.o
> +
>  common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o
>  
>  common-obj-$(CONFIG_LINUX) += hostmem-memfd.o
> diff --git a/backends/vhost-user-blk-server.c 
> b/backends/vhost-user-blk-server.c
> new file mode 100644
> index 0000000000..1bf7f7b544
> --- /dev/null
> +++ b/backends/vhost-user-blk-server.c

Please move this to block/export/vhost-user-blk.c. This will
automatically have it covered as a block layer component in MAINTAINERS,
and it will provide a place where other exports can be put, too.

> @@ -0,0 +1,718 @@
> +/*
> + * Sharing QEMU block devices via vhost-user protocal
> + *
> + * Author: Coiby Xu <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "vhost-user-blk-server.h"
> +#include "qapi/error.h"
> +#include "qom/object_interfaces.h"
> +#include "sysemu/block-backend.h"
> +
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 1,
> +};
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +static QTAILQ_HEAD(, VuBlockDev) vu_block_devs =
> +                                 QTAILQ_HEAD_INITIALIZER(vu_block_devs);
> +
> +
> +typedef struct VuBlockReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VuBlockReq;
> +
> +
> +static void vu_block_req_complete(VuBlockReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +static VuBlockDev *get_vu_block_device_by_client(VuClient *client)
> +{
> +    return container_of(client->server->ptr_in_device, VuBlockDev, 
> vu_server);
> +}
> +
> +static int coroutine_fn
> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
> +                              uint32_t iovcnt, uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes desc;
> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
> +    if (unlikely(size != sizeof(desc))) {
> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
> +        return -EINVAL;
> +    }
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    uint64_t range[2] = { le64toh(desc.sector) << 9,
> +                          le32toh(desc.num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
> +                                 range[0], range[1], 0) == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +
> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
> +{
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    blk_co_flush(backend);
> +}
> +
> +
> +static int coroutine_fn vu_block_virtio_process_req(VuClient *client,
> +                                                    VuVirtq *vq)
> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VuBlockReq *req;
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> +                                    sizeof(VuBlockReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        error_report("virtio-blk request missing headers");
> +        free(elem);
> +        return -EINVAL;
> +    }
> +
> +    req = g_new0(VuBlockReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;

You can keep req on the stack; it will be freed before this function
returns.

> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        error_report("virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        error_report("virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +
> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
> +        QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +        if (is_write) {
> +            qemu_iovec_init_external(qiov, out_iov, out_num);
> +            ret = blk_co_pwritev(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        } else {
> +            qemu_iovec_init_external(qiov, in_iov, in_num);
> +            ret = blk_co_preadv(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        }
> +        aio_wait_kick();

What purpose does this aio_wait_kick() serve? Usually you do this if you
want to signal completion of something to a different thread, but I
don't see that we changed any control state that could be visible to
other threads.

> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vu_block_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vu_block_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", 
> "vhost_user_blk_server");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
> +                                           out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vu_block_req_complete(req);
> +        break;
> +    }

You have vu_block_req_complete() as the last thing in every branch. You
could have it just once after the switch block.

> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -EINVAL;
> +}

Okay, so vu_block_virtio_process_req() takes a single request from the
virtqueue and processes it. This makes sense as a building block, but it
doesn't allow parallelism yet. I expect to see the creation of multiple
coroutines below that can run in parallel.

> +
> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vu_block_virtio_process_req(client, vq);

If you call vu_block_virtio_process_req(), then this one needs to be a
coroutine_fn, too.

> +        if (ret) {
> +            break;
> +        }
> +    }
> +}
> +
> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
> +}

If vu_block_process_vq() is used as a vq->handler, will it not be called
outside coroutine context? How can this work when
vu_block_virtio_process_req() required to be run in coroutine context?

Another problem is that vu_block_process_vq() runs a loop as long as
there are requests that aren't completed yet. During this time,
vu_kick_cb() will block. I don't think this is what was intended.
Instead, we should start the requests (without waiting for their
completion) and return.

> diff --git a/vl.c b/vl.c
> index 794f2e5733..0332ab70da 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2538,6 +2538,10 @@ static bool object_create_initial(const char *type, 
> QemuOpts *opts)
>      }
>  #endif
>  
> +    /* Reason: vhost-user-server property "name" */

"node-name" now.

> +    if (g_str_equal(type, "vhost-user-blk-server")) {
> +        return false;
> +    }
>      /*
>       * Reason: filter-* property "netdev" etc.
>       */

Kevin




reply via email to

[Prev in Thread] Current Thread [Next in Thread]