[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-block] [PATCH 3/3] block/io_uring: resubmit short buffered reads
From: |
Stefan Hajnoczi |
Subject: |
[Qemu-block] [PATCH 3/3] block/io_uring: resubmit short buffered reads |
Date: |
Mon, 15 Jul 2019 21:19:50 +0100 |
The io_uring API had unusual read behavior up until recently, where
short reads could occur when the start of the file range was in the page
cache and a later portion was not in the page cache. Normally read(2)
does not expose this detail to applications and this behavior has been
fixed in Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
* context").
In the meantime Linux distros have shipped kernels where io_uring
exhibits the old behavior and there is no simple way to detect it.
Add a slow path for resubmitting short read requests. The idea is
simple: shorten the iovecs and increment the file offset each time a
short read occurs and then resubmit the request. The implementation
requires adding additional fields to LuringAIOCB to keep track of where
we were.
Signed-off-by: Stefan Hajnoczi <address@hidden>
---
block/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++-------
block/trace-events | 3 +-
2 files changed, 67 insertions(+), 11 deletions(-)
diff --git a/block/io_uring.c b/block/io_uring.c
index 97e4f876d7..12cef71175 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -28,6 +28,12 @@ typedef struct LuringAIOCB {
QEMUIOVector *qiov;
bool is_read;
QSIMPLEQ_ENTRY(LuringAIOCB) next;
+
+ /* Buffered reads may require resubmission, see
+ * luring_resubmit_short_read().
+ */
+ int total_read;
+ QEMUIOVector resubmit_qiov;
} LuringAIOCB;
typedef struct LuringQueue {
@@ -99,6 +105,43 @@ static void luring_resubmit(LuringState *s, LuringAIOCB
*luringcb)
s->io_q.in_queue++;
}
+/* Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async
+ * context") a buffered I/O request with the start of the file range in the
+ * page cache could result in a short read. Applications need to resubmit the
+ * remaining read request.
+ *
+ * This is a slow path but recent kernels never take it.
+ */
+static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
+ int nread)
+{
+ QEMUIOVector *resubmit_qiov;
+ size_t remaining;
+
+ trace_luring_resubmit_short_read(s, luringcb, nread);
+
+ /* Update read position */
+ luringcb->total_read += nread;
+ remaining = luringcb->qiov->size - luringcb->total_read;
+
+ /* Shorten qiov */
+ resubmit_qiov = &luringcb->resubmit_qiov;
+ if (resubmit_qiov->iov == NULL) {
+ qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
+ } else {
+ qemu_iovec_reset(resubmit_qiov);
+ }
+ qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
+ remaining);
+
+ /* Update sqe */
+ luringcb->sqeq.off += nread;
+ luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
+ luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
+
+ luring_resubmit(s, luringcb);
+}
+
/**
* luring_process_completions:
* @s: AIO state
@@ -135,6 +178,7 @@ static void luring_process_completions(LuringState *s)
while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
LuringAIOCB *luringcb;
int ret;
+ int total_bytes;
if (!cqes) {
break;
@@ -150,25 +194,36 @@ static void luring_process_completions(LuringState *s)
trace_luring_process_completion(s, luringcb, ret);
- if (ret == luringcb->qiov->size) {
+ /* total_read is non-zero only for resubmitted read requests */
+ total_bytes = ret + luringcb->total_read;
+
+ if (ret < 0) {
+ if (ret == -EINTR) {
+ luring_resubmit(s, luringcb);
+ continue;
+ }
+ } else if (total_bytes == luringcb->qiov->size) {
ret = 0;
- } else if (ret >= 0) {
+ } else {
/* Short Read/Write */
if (luringcb->is_read) {
- /* Read, pad with zeroes */
- qemu_iovec_memset(luringcb->qiov, ret, 0,
- luringcb->qiov->size - ret);
- ret = 0;
+ if (ret > 0) {
+ luring_resubmit_short_read(s, luringcb, ret);
+ continue;
+ } else {
+ /* Pad with zeroes */
+ qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
+ luringcb->qiov->size - total_bytes);
+ ret = 0;
+ }
} else {
ret = -ENOSPC;;
}
- /* Add to overflow queue to be resubmitted later */
- } else if (ret == -EINTR) {
- luring_resubmit(s, luringcb);
- continue;
}
luringcb->ret = ret;
+ qemu_iovec_destroy(&luringcb->resubmit_qiov);
+
/*
* If the coroutine is already entered it must be in ioq_submit()
* and will notice luringcb->ret has been filled in when it
diff --git a/block/trace-events b/block/trace-events
index 02952fe4cb..f434cac634 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -60,7 +60,7 @@ qmp_block_stream(void *bs) "bs %p"
file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type)
"acb %p opaque %p offset %"PRId64" count %d type %d"
file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t
dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset
%"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64
-#io_uring.c
+# io_uring.c
luring_init_state(void *s, size_t size) "s %p size %zu"
luring_cleanup_state(void *s) "%p freed"
luring_io_plug(void *s) "LuringState %p plug"
@@ -70,6 +70,7 @@ luring_do_submit_done(void *s, int ret) "LuringState %p
submitted to kernel %d"
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset,
size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 "
nbytes %zd type %d"
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p
luringcb %p ret %d"
luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
+luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p
luringcb %p nread %d"
# qcow2.c
qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%"
PRIx64 " bytes %d"
--
2.21.0