+
+ /* Next head to consume from the device */
+ uint16_t last_used_idx;
+
+ /* Cache for the exposed notification flag */
+ bool notification;
} VhostShadowVirtqueue;
bool vhost_svq_valid_features(uint64_t *features);
@@ -47,6 +74,8 @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+ VirtQueue *vq);
void vhost_svq_stop(VhostShadowVirtqueue *svq);
VhostShadowVirtqueue *vhost_svq_new(void);
diff --git a/hw/virtio/vhost-shadow-virtqueue.c
b/hw/virtio/vhost-shadow-virtqueue.c
index 2150e2b071..a38d313755 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -12,6 +12,7 @@
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
+#include "qemu/log.h"
#include "linux-headers/linux/vhost.h"
/**
@@ -53,22 +54,309 @@ bool vhost_svq_valid_features(uint64_t *features)
return r;
}
-/** Forward guest notifications */
-static void vhost_handle_guest_kick(EventNotifier *n)
+/**
+ * Number of descriptors that the SVQ can make available from the guest.
+ *
+ * @svq The svq
+
+ /*
+ * Put the entry in the available array (but don't update avail->idx until
+ * they do sync).
+ */
+ avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
+ avail->ring[avail_idx] = cpu_to_le16(*head);
+ svq->avail_idx_shadow++;
+
+ /* Update the avail index after write the descriptor */
+ smp_wmb();
+ avail->idx = cpu_to_le16(svq->avail_idx_shadow);
+
+ return true;
+}
+
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+{
+ unsigned qemu_head;
+ bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ svq->ring_id_maps[qemu_head] = elem;
+ return true;
+}
+
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+{
+ /*
+ * We need to expose the available array entries before checking the used
+ * flags
+ */
+ smp_mb();
+ if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
+ return;
+ }
+
+ event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Forward available buffers.
+ *
+ * @svq Shadow VirtQueue
+ *
+ * Note that this function does not guarantee that all guest's available
+ * buffers are available to the device in SVQ avail ring. The guest may have
+ * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
+ * qemu vaddr.
+ *
+ * If that happens, guest's kick notifications will be disabled until the
+ * device uses some buffers.
+ */
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+{
+ /* Clear event notifier */
+ event_notifier_test_and_clear(&svq->svq_kick);
+
+ /* Forward to the device as many available buffers as possible */
+ do {
+ virtio_queue_set_notification(svq->vq, false);
+
+ while (true) {
+ VirtQueueElement *elem;
+ bool ok;
+
+ if (svq->next_guest_avail_elem) {
+ elem = g_steal_pointer(&svq->next_guest_avail_elem);
+ } else {
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
+ }
+
+ if (!elem) {
+ break;
+ }
+
+ if (elem->out_num + elem->in_num >
+ vhost_svq_available_slots(svq)) {
+ /*
+ * This condition is possible since a contiguous buffer in GPA
+ * does not imply a contiguous buffer in qemu's VA
+ * scatter-gather segments. If that happens, the buffer exposed
+ * to the device needs to be a chain of descriptors at this
+ * moment.
+ *
+ * SVQ cannot hold more available buffers if we are here:
+ * queue the current guest descriptor and ignore further kicks
+ * until some elements are used.
+ */
+ svq->next_guest_avail_elem = elem;
+ return;
+ }
+
+ ok = vhost_svq_add(svq, elem);
+ if (unlikely(!ok)) {
+ /* VQ is broken, just return and ignore any other kicks */
+ return;
+ }
+ vhost_svq_kick(svq);
+ }
+
+ virtio_queue_set_notification(svq->vq, true);
+ } while (!virtio_queue_empty(svq->vq));
+}
+
+/**
+ * Handle guest's kick.
+ *
+ * @n guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
svq_kick);
event_notifier_test_and_clear(n);
- event_notifier_set(&svq->hdev_kick);
+ vhost_handle_guest_kick(svq);
+}
+
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
+{
+ if (svq->last_used_idx != svq->shadow_used_idx) {
+ return true;
+ }
+
+ svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
+
+ return svq->last_used_idx != svq->shadow_used_idx;
}
-/* Forward vhost notifications */
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
+{
+ vring_desc_t *descs = svq->vring.desc;
+ const vring_used_t *used = svq->vring.used;
+ vring_used_elem_t used_elem;
+ uint16_t last_used;
+
+ if (!vhost_svq_more_used(svq)) {
+ return NULL;
+ }
+
+ /* Only get used array entries after they have been exposed by dev */
+ smp_rmb();
+ last_used = svq->last_used_idx & (svq->vring.num - 1);
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
+
+ svq->last_used_idx++;
+ if (unlikely(used_elem.id >= svq->vring.num)) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
+ svq->vdev->name, used_elem.id);
+ return NULL;
+ }
+
+ if (unlikely(!svq->ring_id_maps[used_elem.id])) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Device %s says index %u is used, but it was not available",
+ svq->vdev->name, used_elem.id);
+ return NULL;
+ }
+
+ descs[used_elem.id].next = svq->free_head;
+ svq->free_head = used_elem.id;
+
+ svq->ring_id_maps[used_elem.id]->len = used_elem.len;
+ return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
+}
+
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+ bool check_for_avail_queue)
+{
+ VirtQueue *vq = svq->vq;
+
+ /* Forward as many used buffers as possible. */
+ do {
+ unsigned i = 0;
+
+ vhost_svq_set_notification(svq, false);
+ while (true) {
+ g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
+ if (!elem) {
+ break;
+ }
+
+ if (unlikely(i >= svq->vring.num)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "More than %u used buffers obtained in a %u size SVQ",
+ i, svq->vring.num);
+ virtqueue_fill(vq, elem, elem->len, i);
+ virtqueue_flush(vq, i);
+ return;
+ }
+ virtqueue_fill(vq, elem, elem->len, i++);
+ }
+
+ virtqueue_flush(vq, i);
+ event_notifier_set(&svq->svq_call);
+
+ if (check_for_avail_queue && svq->next_guest_avail_elem) {
+ /*
+ * Avail ring was full when vhost_svq_flush was called, so it's a
+ * good moment to make more descriptors available if possible.
+ */
+ vhost_handle_guest_kick(svq);
+ }
+
+ vhost_svq_set_notification(svq, true);
+ } while (vhost_svq_more_used(svq));
+}
+
+/**
+ * Forward used buffers.
+ *
+ * @n hdev call event notifier, the one that device set to notify svq.
+ *
+ * Note that we are not making any buffers available in the loop, there is no
+ * way that it runs more than virtqueue size times.
+ */
static void vhost_svq_handle_call(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
hdev_call);
event_notifier_test_and_clear(n);
- event_notifier_set(&svq->svq_call);
+ vhost_svq_flush(svq, true);
}
/**
@@ -149,7 +437,41 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq,
int svq_kick_fd)
if (poll_start) {
event_notifier_init_fd(svq_kick, svq_kick_fd);
event_notifier_set(svq_kick);
- event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
+ }
+}
+
+/**
+ * Start the shadow virtqueue operation.
+ *
+ * @svq Shadow Virtqueue
+ * @vdev VirtIO device
+ * @vq Virtqueue to shadow
+ */
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+ VirtQueue *vq)
+{
+ size_t desc_size, driver_size, device_size;
+
+ svq->next_guest_avail_elem = NULL;
+ svq->avail_idx_shadow = 0;
+ svq->shadow_used_idx = 0;
+ svq->last_used_idx = 0;
+ svq->vdev = vdev;
+ svq->vq = vq;
+
+ svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
+ driver_size = vhost_svq_driver_area_size(svq);
+ device_size = vhost_svq_device_area_size(svq);
+ svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
+ desc_size = sizeof(vring_desc_t) * svq->vring.num;
+ svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
+ memset(svq->vring.desc, 0, driver_size);
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
+ memset(svq->vring.used, 0, device_size);
+ svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+ svq->vring.desc[i].next = cpu_to_le16(i + 1);
}
}
@@ -160,6 +482,32 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
event_notifier_set_handler(&svq->svq_kick, NULL);
+ g_autofree VirtQueueElement *next_avail_elem = NULL;
+
+ if (!svq->vq) {
+ return;
+ }
+
+ /* Send all pending used descriptors to guest */
+ vhost_svq_flush(svq, false);
+
+ for (unsigned i = 0; i < svq->vring.num; ++i) {
+ g_autofree VirtQueueElement *elem = NULL;
+ elem = g_steal_pointer(&svq->ring_id_maps[i]);
+ if (elem) {
+ virtqueue_detach_element(svq->vq, elem, elem->len);
+ }
+ }
+
+ next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
+ if (next_avail_elem) {
+ virtqueue_detach_element(svq->vq, next_avail_elem,
+ next_avail_elem->len);
+ }
+ svq->vq = NULL;
+ g_free(svq->ring_id_maps);
+ qemu_vfree(svq->vring.desc);
+ qemu_vfree(svq->vring.used);
}
/**
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index b2c4e92fcf..435b9c2e9e 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -803,10 +803,10 @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev
*dev,
* Note that this function does not rewind kick file descriptor if cannot set
* call one.
*/
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
- VhostShadowVirtqueue *svq,
- unsigned idx,
- Error **errp)
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
+ VhostShadowVirtqueue *svq,
+ unsigned idx,
+ Error **errp)
{
struct vhost_vring_file file = {
.index = dev->vq_index + idx,
@@ -818,7 +818,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
if (unlikely(r != 0)) {
error_setg_errno(errp, -r, "Can't set device kick fd");
- return false;
+ return r;
}
event_notifier = &svq->hdev_call;
@@ -828,6 +828,119 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
error_setg_errno(errp, -r, "Can't set device call fd");
}
+ return r;
+}
+
+/**
+ * Unmap a SVQ area in the device
+ */
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
+ hwaddr size)
+{
+ int r;
+
+ size = ROUND_UP(size, qemu_real_host_page_size);
+ r = vhost_vdpa_dma_unmap(v, iova, size);
+ return r == 0;
+}
+
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
+ const VhostShadowVirtqueue *svq)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ struct vhost_vring_addr svq_addr;
+ size_t device_size = vhost_svq_device_area_size(svq);
+ size_t driver_size = vhost_svq_driver_area_size(svq);
+ bool ok;
+
+ vhost_svq_get_vring_addr(svq, &svq_addr);
+
+ ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+}
+
+/**
+ * Map shadow virtqueue rings in device
+ *
+ * @dev The vhost device
+ * @svq The shadow virtqueue
+ * @addr Assigned IOVA addresses
+ * @errp Error pointer
+ */
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
+ const VhostShadowVirtqueue *svq,
+ struct vhost_vring_addr *addr,
+ Error **errp)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ size_t device_size = vhost_svq_device_area_size(svq);
+ size_t driver_size = vhost_svq_driver_area_size(svq);
+ int r;
+
+ ERRP_GUARD();
+ vhost_svq_get_vring_addr(svq, addr);
+
+ r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
+ (void *)addr->desc_user_addr, true);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+ return false;
+ }
+
+ r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
+ (void *)addr->used_user_addr, false);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Cannot create vq device region: ");
+ }
+
+ return r == 0;
+}
+
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+ VhostShadowVirtqueue *svq,
+ unsigned idx,
+ Error **errp)
+{
+ uint16_t vq_index = dev->vq_index + idx;
+ struct vhost_vring_state s = {
+ .index = vq_index,
+ };
+ int r;
+
+ r = vhost_vdpa_set_dev_vring_base(dev, &s);
+ if (unlikely(r)) {
+ error_setg_errno(errp, -r, "Cannot set vring base");
+ return false;
+ }
+
+ r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
+ return r == 0;
+}
+
+static bool vhost_vdpa_svq_set_addr(struct vhost_dev *dev, unsigned idx,
+ VhostShadowVirtqueue *svq,
+ Error **errp)
+{
+ struct vhost_vring_addr addr = {
+ .index = idx,
+ };
+ int r;
+
+ bool ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, errp);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ /* Override vring GPA set by vhost subsystem */
+ r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Cannot set device address");
+ }
+
return r == 0;
}
@@ -842,10 +955,46 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
}
for (i = 0; i < v->shadow_vqs->len; ++i) {
+ VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
if (unlikely(!ok)) {
- error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+ goto err;
+ }
+ vhost_svq_start(svq, dev->vdev, vq);
+ ok = vhost_vdpa_svq_set_addr(dev, i, svq, &err);
+ if (unlikely(!ok)) {
+ vhost_svq_stop(svq);
+ goto err;