[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v2 7/8] dataplane: add virtio-blk data plane code
From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [PATCH v2 7/8] dataplane: add virtio-blk data plane code
Date: Tue, 20 Nov 2012 15:04:53 +0200
On Tue, Nov 20, 2012 at 01:31:51PM +0100, Stefan Hajnoczi wrote:
> virtio-blk-data-plane is a subset implementation of virtio-blk. It only
> handles read, write, and flush requests. It does this using a dedicated
> thread that executes an epoll(2)-based event loop and processes I/O
> using Linux AIO.
>
> This approach performs very well but can be used for raw image files
> only. The number of IOPS achieved has been reported to be several times
> higher than the existing virtio-blk implementation.
>
> Eventually it should be possible to unify virtio-blk-data-plane with the
> main body of QEMU code once the block layer and hardware emulation are
> able to run outside the global mutex.
>
> Signed-off-by: Stefan Hajnoczi <address@hidden>
> ---
> hw/dataplane/Makefile.objs | 2 +-
> hw/dataplane/virtio-blk.c | 418
> +++++++++++++++++++++++++++++++++++++++++++++
> hw/dataplane/virtio-blk.h | 41 +++++
> trace-events | 6 +
> 4 files changed, 466 insertions(+), 1 deletion(-)
> create mode 100644 hw/dataplane/virtio-blk.c
> create mode 100644 hw/dataplane/virtio-blk.h
>
> diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
> index abd408f..682aa9e 100644
> --- a/hw/dataplane/Makefile.objs
> +++ b/hw/dataplane/Makefile.objs
> @@ -1,3 +1,3 @@
> ifeq ($(CONFIG_VIRTIO), y)
> -common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o
> ioq.o
> +common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o
> ioq.o virtio-blk.o
> endif
> diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
> new file mode 100644
> index 0000000..5dcc872
> --- /dev/null
> +++ b/hw/dataplane/virtio-blk.c
> @@ -0,0 +1,418 @@
> +/*
> + * Dedicated thread for virtio-blk I/O processing
> + *
> + * Copyright 2012 IBM, Corp.
> + * Copyright 2012 Red Hat, Inc. and/or its affiliates
> + *
> + * Authors:
> + * Stefan Hajnoczi <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "trace.h"
> +#include "event-poll.h"
> +#include "qemu-thread.h"
> +#include "vring.h"
> +#include "ioq.h"
> +#include "hw/virtio-blk.h"
> +#include "hw/dataplane/virtio-blk.h"
> +
> +enum {
> + SEG_MAX = 126, /* maximum number of I/O segments */
> + VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */
> + REQ_MAX = VRING_MAX, /* maximum number of requests in the
> vring,
> + * is VRING_MAX / 2 with traditional and
> + * VRING_MAX with indirect descriptors */
> +};
> +
> +typedef struct {
> + struct iocb iocb; /* Linux AIO control block */
> + unsigned char *status; /* virtio block status code */
> + unsigned int head; /* vring descriptor index */
> +} VirtIOBlockRequest;
> +
> +struct VirtIOBlockDataPlane {
> + bool started;
> + QEMUBH *start_bh;
> + QemuThread thread;
> +
> + int fd; /* image file descriptor */
> +
> + VirtIODevice *vdev;
> + Vring vring; /* virtqueue vring */
> + EventNotifier *guest_notifier; /* irq */
> +
> + EventPoll event_poll; /* event poller */
> + EventHandler io_handler; /* Linux AIO completion handler */
> + EventHandler notify_handler; /* virtqueue notify handler */
> +
> + IOQueue ioqueue; /* Linux AIO queue (should really be per
> + dataplane thread) */
> + VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
> + queue */
> +
> + unsigned int num_reqs;
> + QemuMutex num_reqs_lock;
> + QemuCond no_reqs_cond;
> +};
> +
> +/* Raise an interrupt to signal guest, if necessary */
> +static void notify_guest(VirtIOBlockDataPlane *s)
> +{
> + if (!vring_should_notify(s->vdev, &s->vring)) {
> + return;
> + }
> +
> + event_notifier_set(s->guest_notifier);
> +}
> +
> +static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
> +{
> + VirtIOBlockDataPlane *s = opaque;
> + VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
> + int len;
> +
> + if (likely(ret >= 0)) {
> + *req->status = VIRTIO_BLK_S_OK;
> + len = ret;
> + } else {
> + *req->status = VIRTIO_BLK_S_IOERR;
> + len = 0;
> + }
> +
> + trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
> +
> + /* According to the virtio specification len should be the number of
> bytes
> + * written to, but for virtio-blk it seems to be the number of bytes
> + * transferred plus the status bytes.
> + */
> + vring_push(&s->vring, req->head, len + sizeof(*req->status));
> +
> + qemu_mutex_lock(&s->num_reqs_lock);
> + if (--s->num_reqs == 0) {
> + qemu_cond_broadcast(&s->no_reqs_cond);
> + }
> + qemu_mutex_unlock(&s->num_reqs_lock);
> +}
> +
> +static void process_request(IOQueue *ioq, struct iovec iov[],
> + unsigned int out_num, unsigned int in_num,
> + unsigned int head)
> +{
> + /* Virtio block requests look like this: */
> + struct virtio_blk_outhdr *outhdr; /* iov[0] */
> + /* data[] ... */
> + struct virtio_blk_inhdr *inhdr; /* iov[out_num + in_num - 1] */
> +
> + if (unlikely(out_num == 0 || in_num == 0 ||
> + iov[0].iov_len != sizeof *outhdr ||
> + iov[out_num + in_num - 1].iov_len != sizeof *inhdr)) {
> + fprintf(stderr, "virtio-blk invalid request\n");
> + exit(1);
> + }
> +
> + outhdr = iov[0].iov_base;
> + inhdr = iov[out_num + in_num - 1].iov_base;
> +
Rusty is trying to get rid of hard-coded layout assumptions;
let's not add any more if we can help it.
> + /* TODO Linux sets the barrier bit even when not advertised! */
> + uint32_t type = outhdr->type & ~VIRTIO_BLK_T_BARRIER;
> + struct iocb *iocb;
> + switch (type & (VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_SCSI_CMD |
> + VIRTIO_BLK_T_FLUSH)) {
> + case VIRTIO_BLK_T_IN:
> + if (unlikely(out_num != 1)) {
> + fprintf(stderr, "virtio-blk invalid read request\n");
> + exit(1);
> + }
> + iocb = ioq_rdwr(ioq, true, &iov[1], in_num - 1, outhdr->sector *
> 512);
> + break;
> +
> + case VIRTIO_BLK_T_OUT:
> + if (unlikely(in_num != 1)) {
> + fprintf(stderr, "virtio-blk invalid write request\n");
> + exit(1);
> + }
> + iocb = ioq_rdwr(ioq, false, &iov[1], out_num - 1, outhdr->sector *
> 512);
> + break;
> +
> + case VIRTIO_BLK_T_SCSI_CMD:
> + if (unlikely(in_num == 0)) {
> + fprintf(stderr, "virtio-blk invalid SCSI command request\n");
> + exit(1);
> + }
> +
> + /* TODO support SCSI commands */
> + {
> + VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane,
> + ioqueue);
> + inhdr->status = VIRTIO_BLK_S_UNSUPP;
> + vring_push(&s->vring, head, sizeof *inhdr);
> + notify_guest(s);
> + }
> + return;
> +
> + case VIRTIO_BLK_T_FLUSH:
> + if (unlikely(in_num != 1 || out_num != 1)) {
> + fprintf(stderr, "virtio-blk invalid flush request\n");
> + exit(1);
> + }
> +
> + /* TODO fdsync not supported by Linux AIO, do it synchronously here!
> */
> + {
> + VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane,
> + ioqueue);
> + fdatasync(s->fd);
> + inhdr->status = VIRTIO_BLK_S_OK;
> + vring_push(&s->vring, head, sizeof *inhdr);
> + notify_guest(s);
> + }
> + return;
> +
> + default:
> + fprintf(stderr, "virtio-blk unsupported request type %#x\n",
> + outhdr->type);
> + exit(1);
> + }
> +
> + /* Fill in virtio block metadata needed for completion */
> + VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
> + req->head = head;
> + req->status = &inhdr->status;
> +}
> +
> +static bool handle_notify(EventHandler *handler)
> +{
> + VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
> + notify_handler);
> +
> + /* There is one array of iovecs into which all new requests are extracted
> + * from the vring. Requests are read from the vring and the translated
> + * descriptors are written to the iovecs array. The iovecs do not have
> to
> + * persist across handle_notify() calls because the kernel copies the
> + * iovecs on io_submit().
> + *
> + * Handling io_submit() EAGAIN may require storing the requests across
> + * handle_notify() calls until the kernel has sufficient resources to
> + * accept more I/O. This is not implemented yet.
> + */
> + struct iovec iovec[VRING_MAX];
> + struct iovec *end = &iovec[VRING_MAX];
> + struct iovec *iov = iovec;
> +
> + /* When a request is read from the vring, the index of the first
> descriptor
> + * (aka head) is returned so that the completed request can be pushed
> onto
> + * the vring later.
> + *
> + * The number of hypervisor read-only iovecs is out_num. The number of
> + * hypervisor write-only iovecs is in_num.
> + */
> + int head;
> + unsigned int out_num = 0, in_num = 0;
> + unsigned int num_queued;
> +
> + for (;;) {
> + /* Disable guest->host notifies to avoid unnecessary vmexits */
> + vring_set_notification(s->vdev, &s->vring, false);
> +
> + for (;;) {
> + head = vring_pop(s->vdev, &s->vring, iov, end, &out_num,
> &in_num);
> + if (head < 0) {
> + break; /* no more requests */
> + }
> +
> + trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
> + head);
> +
> + process_request(&s->ioqueue, iov, out_num, in_num, head);
> + iov += out_num + in_num;
> + }
> +
> + if (likely(head == -EAGAIN)) { /* vring emptied */
> + /* Re-enable guest->host notifies and stop processing the vring.
> + * But if the guest has snuck in more descriptors, keep
> processing.
> + */
> + vring_set_notification(s->vdev, &s->vring, true);
> + smp_mb();
> + if (!vring_more_avail(&s->vring)) {
> + break;
> + }
> + } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
> + /* Since there are no iovecs[] left, stop processing for now. Do
> + * not re-enable guest->host notifies since the I/O completion
> + * handler knows to check for more vring descriptors anyway.
> + */
> + break;
> + }
> + }
> +
> + num_queued = ioq_num_queued(&s->ioqueue);
> + if (num_queued > 0) {
> + qemu_mutex_lock(&s->num_reqs_lock);
> + s->num_reqs += num_queued;
> + qemu_mutex_unlock(&s->num_reqs_lock);
> +
> + int rc = ioq_submit(&s->ioqueue);
> + if (unlikely(rc < 0)) {
> + fprintf(stderr, "ioq_submit failed %d\n", rc);
> + exit(1);
> + }
> + }
> + return true;
> +}
> +
> +static bool handle_io(EventHandler *handler)
> +{
> + VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
> + io_handler);
> +
> + if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
> + notify_guest(s);
> + }
> +
> + /* If there were more requests than iovecs, the vring will not be empty
> yet
> + * so check again. There should now be enough resources to process more
> + * requests.
> + */
> + if (unlikely(vring_more_avail(&s->vring))) {
> + return handle_notify(&s->notify_handler);
> + }
> +
> + return true;
> +}
> +
> +static void *data_plane_thread(void *opaque)
> +{
> + VirtIOBlockDataPlane *s = opaque;
> + event_poll_run(&s->event_poll);
> + return NULL;
> +}
> +
> +static void start_data_plane_bh(void *opaque)
> +{
> + VirtIOBlockDataPlane *s = opaque;
> +
> + qemu_bh_delete(s->start_bh);
> + s->start_bh = NULL;
> + qemu_thread_create(&s->thread, data_plane_thread,
> + s, QEMU_THREAD_JOINABLE);
> +}
> +
> +VirtIOBlockDataPlane *virtio_blk_data_plane_create(VirtIODevice *vdev, int
> fd)
> +{
> + VirtIOBlockDataPlane *s;
> +
> + s = g_new0(VirtIOBlockDataPlane, 1);
> + s->vdev = vdev;
> + s->fd = fd;
> + return s;
> +}
> +
> +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
> +{
> + if (!s) {
> + return;
> + }
> + virtio_blk_data_plane_stop(s);
> + g_free(s);
> +}
> +
> +/* Block until pending requests have completed
> + *
> + * The vring continues to be serviced so ensure no new requests will be added
> + * to avoid races.
> + */
> +void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s)
> +{
> + qemu_mutex_lock(&s->num_reqs_lock);
> + while (s->num_reqs > 0) {
> + qemu_cond_wait(&s->no_reqs_cond, &s->num_reqs_lock);
> + }
> + qemu_mutex_unlock(&s->num_reqs_lock);
> +}
> +
> +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
> +{
> + VirtQueue *vq;
> + int i;
> +
> + if (s->started) {
> + return;
> + }
> +
> + vq = virtio_get_queue(s->vdev, 0);
> + if (!vring_setup(&s->vring, s->vdev, 0)) {
> + return;
> + }
> +
> + event_poll_init(&s->event_poll);
> +
> + /* Set up guest notifier (irq) */
> + if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
> + true) != 0) {
> + fprintf(stderr, "virtio-blk failed to set guest notifier, "
> + "ensure -enable-kvm is set\n");
> + exit(1);
> + }
> + s->guest_notifier = virtio_queue_get_guest_notifier(vq);
> +
> + /* Set up virtqueue notify */
> + if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
> + 0, true) != 0) {
> + fprintf(stderr, "virtio-blk failed to set host notifier\n");
> + exit(1);
> + }
> + event_poll_add(&s->event_poll, &s->notify_handler,
> + virtio_queue_get_host_notifier(vq),
> + handle_notify);
> +
> + /* Set up ioqueue */
> + ioq_init(&s->ioqueue, s->fd, REQ_MAX);
> + for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
> + ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
> + }
> + event_poll_add(&s->event_poll, &s->io_handler,
> + ioq_get_notifier(&s->ioqueue), handle_io);
> +
> + s->started = true;
> + trace_virtio_blk_data_plane_start(s);
> +
> + /* Kick right away to begin processing requests already in vring */
> + event_notifier_set(virtio_queue_get_host_notifier(vq));
> +
> + /* Spawn thread in BH so it inherits iothread cpusets */
> + s->start_bh = qemu_bh_new(start_data_plane_bh, s);
> + qemu_bh_schedule(s->start_bh);
> +}
> +
> +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
> +{
> + if (!s->started) {
> + return;
> + }
> + s->started = false;
> + trace_virtio_blk_data_plane_stop(s);
> +
> + /* Stop thread or cancel pending thread creation BH */
> + if (s->start_bh) {
> + qemu_bh_delete(s->start_bh);
> + s->start_bh = NULL;
> + } else {
> + virtio_blk_data_plane_drain(s);
> + event_poll_stop(&s->event_poll);
> + qemu_thread_join(&s->thread);
> + }
> +
> + ioq_cleanup(&s->ioqueue);
> +
> + s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);
> +
> + event_poll_cleanup(&s->event_poll);
> +
> + /* Clean up guest notifier (irq) */
> + s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false);
> +
> + vring_teardown(&s->vring);
> +}
> diff --git a/hw/dataplane/virtio-blk.h b/hw/dataplane/virtio-blk.h
> new file mode 100644
> index 0000000..ddf1115
> --- /dev/null
> +++ b/hw/dataplane/virtio-blk.h
> @@ -0,0 +1,41 @@
> +/*
> + * Dedicated thread for virtio-blk I/O processing
> + *
> + * Copyright 2012 IBM, Corp.
> + * Copyright 2012 Red Hat, Inc. and/or its affiliates
> + *
> + * Authors:
> + * Stefan Hajnoczi <address@hidden>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef HW_DATAPLANE_VIRTIO_BLK_H
> +#define HW_DATAPLANE_VIRTIO_BLK_H
> +
> +#include "hw/virtio.h"
> +
> +typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
> +
> +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
> +VirtIOBlockDataPlane *virtio_blk_data_plane_create(VirtIODevice *vdev, int
> fd);
> +void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
> +void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
> +void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
> +void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s);
> +#else
> +static inline VirtIOBlockDataPlane *virtio_blk_data_plane_create(
> + VirtIODevice *vdev, int fd)
> +{
> + return NULL;
> +}
> +
> +static inline void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) {}
> +static inline void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) {}
> +static inline void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s) {}
> +static inline void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s) {}
> +#endif
> +
> +#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
> diff --git a/trace-events b/trace-events
> index a9a791b..1edc2ae 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -98,6 +98,12 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
> virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p
> sector %"PRIu64" nsectors %zu"
> virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p
> sector %"PRIu64" nsectors %zu"
>
> +# hw/dataplane/virtio-blk.c
> +virtio_blk_data_plane_start(void *s) "dataplane %p"
> +virtio_blk_data_plane_stop(void *s) "dataplane %p"
> +virtio_blk_data_plane_process_request(void *s, unsigned int out_num,
> unsigned int in_num, unsigned int head) "dataplane %p out_num %u in_num %u
> head %u"
> +virtio_blk_data_plane_complete_request(void *s, unsigned int head, int ret)
> "dataplane %p head %u ret %d"
> +
> # hw/dataplane/vring.c
> vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring
> physical %#"PRIx64" desc %p avail %p used %p"
>
> --
> 1.8.0