[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/1] nvmf: introduce a new block driver
From: |
zhenwei pi |
Subject: |
[PATCH 1/1] nvmf: introduce a new block driver |
Date: |
Mon, 7 Jun 2021 21:32:53 +0800 |
Add a new QEMU block driver which uses libnvmf as a userspace NVMe over
Fabrics initiator.
Currently QEMU uses 4 NVMF I/O queues with a round-robin (RR) policy. Tested
against a Linux kernel NVMF target, QEMU gets about 220K IOPS.
Thanks to Famz for several suggestions.
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
block/meson.build | 1 +
block/nvmf.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++
configure | 8 +-
meson.build | 8 +
meson_options.txt | 2 +
5 files changed, 443 insertions(+), 1 deletion(-)
create mode 100644 block/nvmf.c
diff --git a/block/meson.build b/block/meson.build
index 01861e1545..adf4e753ae 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -77,6 +77,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [libnvmf, 'nvmf', files('nvmf.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/block/nvmf.c b/block/nvmf.c
new file mode 100644
index 0000000000..d09ca41b58
--- /dev/null
+++ b/block/nvmf.c
@@ -0,0 +1,425 @@
+/*
+ * NVMe over fabric block driver based on libnvmf
+ *
+ * Copyright 2020-2021 zhenwei pi
+ *
+ * Authors:
+ * zhenwei pi <pizhenwei@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include <poll.h>
+#include "qemu/error-report.h"
+#include "block/block_int.h"
+#include "block/nvme.h"
+#include "qemu/iov.h"
+#include "qemu/option.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+
+#include "nvmf/nvmf.h"
+
+/* Number of I/O queues requested from libnvmf in nvmf_file_open(). */
+#define DEF_IO_QUEUES 4 /* default IO queues 4 by RR policy */
+/* Keepalive value passed to nvmf_options_set_kato(). */
+#define NVMF_KATO 30000 /* default keepalive time 30s */
+/* Name of the runtime option that carries the target URI. */
+#define NVMF_URI "uri"
+/* Filename prefixes accepted by nvmf_parse_filename(). */
+#define PROTOCOL_TCP_PREF "nvmf-tcp://"
+#define PROTOCOL_RDMA_PREF "nvmf-rdma://"
+
+/*
+ * Per-queue driver state. One entry exists for every index from 0 to
+ * nr_ioqueues (inclusive) in NvmfHost.ioqueues.
+ */
+typedef struct NvmfQueue {
+ /* queue id; equal to the entry's index in NvmfHost.ioqueues */
+ int qid;
+ /* coroutines parked by nvmf_co_io() while this queue is full */
+ CoQueue wait_queue;
+ /* held around every access to wait_queue */
+ CoMutex wait_lock;
+} NvmfQueue;
+
+/* Driver instance state, stored in BlockDriverState.opaque. */
+typedef struct NvmfHost {
+ /* options handle returned by nvmf_default_options() */
+ nvmf_options_t opts;
+ /* controller handle returned by nvmf_ctrl_create() */
+ nvmf_ctrl_t ctrl;
+ /* AioContext on which the controller fd handler is registered */
+ AioContext *aio_context;
+ /* taken around aio_set_fd_handler() calls */
+ QemuSpin lock;
+
+ /* running request counter, used to pick a queue round-robin */
+ unsigned long requests;
+ /* number of I/O queues requested from libnvmf */
+ int nr_ioqueues;
+ /* array of nr_ioqueues + 1 entries, indexed by queue id */
+ NvmfQueue *ioqueues;
+} NvmfHost;
+
+/* State of one in-flight request; lives on the stack of nvmf_co_io(). */
+typedef struct NvmfReq {
+ /* owning driver instance */
+ NvmfHost *host;
+ /* libnvmf request handle returned by the nvmf_*_async() call */
+ nvmf_req_t req;
+ /* coroutine to wake from nvmf_req_cb() */
+ Coroutine *co;
+ /* id of the I/O queue the request was submitted on */
+ int qid;
+ /* 0 or negative errno, filled in by nvmf_req_cb() */
+ int retval;
+ /* set by nvmf_req_cb() once the request has completed */
+ bool done;
+} NvmfReq;
+
+/* Operation selector for nvmf_co_io(). */
+typedef enum NvmfOp {
+ NvmfOpRead, /* submitted with nvmf_read_async() */
+ NvmfOpWrite, /* submitted with nvmf_write_async() */
+ NvmfOpDiscard, /* submitted with nvmf_discard_async() */
+ NvmfOpWritezeroes /* submitted with nvmf_writezeroes_async() */
+} NvmfOp;
+
+/*
+ * Read handler installed on the controller fd: hands the controller to
+ * nvmf_ctrl_process().
+ *
+ * @opaque: the NvmfHost registered together with the handler
+ */
+static void nvmf_process(void *opaque)
+{
+    NvmfHost *h = opaque;
+    nvmf_ctrl_t ctrl = h->ctrl;
+
+    nvmf_ctrl_process(ctrl);
+}
+
+/*
+ * Remember @new_context as the driver's AioContext and install
+ * nvmf_process() as the read handler of the controller fd on it.
+ *
+ * @bs: block node whose opaque is the NvmfHost
+ * @new_context: AioContext the node is being attached to
+ */
+static void nvmf_attach_aio_context(BlockDriverState *bs,
+                                    AioContext *new_context)
+{
+    NvmfHost *h = bs->opaque;
+    int fd;
+
+    h->aio_context = new_context;
+
+    qemu_spin_lock(&h->lock);
+    fd = nvmf_ctrl_fd(h->ctrl);
+    aio_set_fd_handler(new_context, fd, false, nvmf_process, NULL, NULL, h);
+    qemu_spin_unlock(&h->lock);
+}
+
+/*
+ * Runtime options understood by the driver. The only option is the target
+ * URI, which nvmf_parse_filename() fills in from the filename.
+ */
+static QemuOptsList runtime_opts = {
+ .name = "nvmf",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = NVMF_URI,
+ .type = QEMU_OPT_STRING,
+ .help = "NVMe over fabric URI",
+ },
+ { /* end of list */ }
+ },
+};
+
+/*
+ * Return true if @filename starts with @prefix and has at least one more
+ * character after it.
+ */
+static bool nvmf_filename_has_prefix(const char *filename, const char *prefix)
+{
+    return g_str_has_prefix(filename, prefix) &&
+           filename[strlen(prefix)] != '\0';
+}
+
+/*
+ * Turn a "nvmf-tcp://..." or "nvmf-rdma://..." filename into the "uri"
+ * runtime option.
+ *
+ * @filename: filename given by the user
+ * @options: dictionary that receives the NVMF_URI entry
+ * @errp: set when @filename carries neither supported prefix, or carries
+ *        nothing after the prefix
+ */
+static void nvmf_parse_filename(const char *filename, QDict *options,
+                                Error **errp)
+{
+    if (nvmf_filename_has_prefix(filename, PROTOCOL_TCP_PREF) ||
+        nvmf_filename_has_prefix(filename, PROTOCOL_RDMA_PREF)) {
+        /* qdict_put_str() stores its own copy; a g_strdup() would leak */
+        qdict_put_str(options, NVMF_URI, filename);
+        return;
+    }
+
+    error_setg(errp, "nvmf: invalid filename. Ex, nvmf-tcp/nvmf-rdma");
+}
+
+/*
+ * Open an NVMe over Fabrics target.
+ *
+ * Reads the "uri" runtime option, creates the libnvmf options and
+ * controller, checks that a usable namespace exists, sets up the per-queue
+ * wait state and registers the controller fd with the node's AioContext.
+ *
+ * @bs: block node being opened; bs->opaque is the NvmfHost to fill in
+ * @options: runtime options; the NVMF_URI entry is absorbed
+ * @flags: open flags (unused)
+ * @errp: set on failure
+ *
+ * Returns 0 on success, -EINVAL on any failure.
+ */
+static int nvmf_file_open(BlockDriverState *bs, QDict *options, int flags,
+                          Error **errp)
+{
+    NvmfHost *host = bs->opaque;
+    QemuOpts *qopts;
+    nvmf_ctrl_t ctrl;
+    nvmf_options_t opts;
+    const char *uri;
+    NvmfQueue *queue;
+    unsigned int nsid;
+    int i;
+
+    qopts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(qopts, options, &error_abort);
+
+    /*
+     * The returned string is owned by qopts: it must not be freed here and
+     * stays valid until qemu_opts_del().
+     */
+    uri = qemu_opt_get(qopts, NVMF_URI);
+    if (!uri) {
+        error_setg(errp, "nvmf: missing URI");
+        goto free_qopts;
+    }
+
+    host->nr_ioqueues = DEF_IO_QUEUES;
+    opts = nvmf_default_options(uri, NULL);
+    if (!opts) {
+        error_setg(errp, "nvmf: parse uri failed");
+        goto free_qopts;
+    }
+
+    nvmf_options_set_io_queues(opts, host->nr_ioqueues);
+    nvmf_options_set_kato(opts, NVMF_KATO);
+    ctrl = nvmf_ctrl_create(opts);
+    if (!ctrl) {
+        error_setg(errp, "nvmf: create ctrl failed");
+        goto free_nvmf_opts;
+    }
+
+    if (nvmf_ns_count(ctrl) < 1) {
+        error_setg(errp, "nvmf: no available namespace");
+        goto release_ctrl;
+    }
+
+    /* nsid should be specified by command line */
+    nsid = nvmf_ns_id(ctrl);
+    if (!nvmf_ns_lbads(ctrl, nsid)) {
+        error_setg(errp, "nvmf: invalid LBADS");
+        goto release_ctrl;
+    }
+
+    if (!nvmf_ns_nsze(ctrl, nsid)) {
+        error_setg(errp, "nvmf: invalid size");
+        goto release_ctrl;
+    }
+
+    /* one entry per queue id, 0..nr_ioqueues; g_new0() already zeroes it */
+    host->ioqueues = g_new0(NvmfQueue, host->nr_ioqueues + 1);
+    for (i = 0; i < host->nr_ioqueues + 1; i++) {
+        queue = host->ioqueues + i;
+        queue->qid = i;
+        qemu_co_mutex_init(&queue->wait_lock);
+        qemu_co_queue_init(&queue->wait_queue);
+    }
+
+    host->ctrl = ctrl;
+    host->opts = opts;
+    qemu_spin_init(&host->lock);
+    nvmf_attach_aio_context(bs, bdrv_get_aio_context(bs));
+
+    /* uri is not used past this point, so the options can go */
+    qemu_opts_del(qopts);
+
+    return 0;
+
+release_ctrl:
+    nvmf_ctrl_release(ctrl);
+
+free_nvmf_opts:
+    nvmf_options_free(opts);
+
+free_qopts:
+    qemu_opts_del(qopts);
+
+    return -EINVAL;
+}
+
+/*
+ * Tear down the driver instance: unregister the controller fd, then release
+ * the controller, the libnvmf options and the per-queue state.
+ *
+ * @bs: block node being closed
+ */
+static void nvmf_close(BlockDriverState *bs)
+{
+    NvmfHost *host = bs->opaque;
+
+    /*
+     * Remove the fd handler first, so nvmf_process() can no longer be
+     * invoked on a controller that has been released.
+     */
+    qemu_spin_lock(&host->lock);
+    aio_set_fd_handler(host->aio_context, nvmf_ctrl_fd(host->ctrl),
+                       false, NULL, NULL, NULL, NULL);
+    qemu_spin_unlock(&host->lock);
+
+    nvmf_ctrl_release(host->ctrl);
+    host->ctrl = NULL;
+    nvmf_options_free(host->opts);
+    host->opts = NULL;
+    g_free(host->ioqueues);
+    host->ioqueues = NULL;
+}
+
+/*
+ * Check that both @offset and @length are multiples of the namespace's
+ * logical block size (1 << LBADS).
+ *
+ * @ctrl: controller whose namespace supplies LBADS
+ * @offset: byte offset of the request
+ * @length: byte length of the request
+ *
+ * Returns true when both values are block aligned.
+ */
+static inline bool nvmf_is_lba_aligned(nvmf_ctrl_t ctrl, size_t offset,
+                                       size_t length)
+{
+    unsigned int nsid = nvmf_ns_id(ctrl);
+    unsigned char lbads = nvmf_ns_lbads(ctrl, nsid);
+    /* build the mask in size_t so the shift cannot overflow an int */
+    size_t mask = ((size_t)1 << lbads) - 1;
+
+    return !((offset | length) & mask);
+}
+
+/*
+ * Prepare @req for a request issued from the current coroutine.
+ *
+ * Every field is given a defined value: no libnvmf request is attached yet
+ * (req->req is NULL until an nvmf_*_async() call returns a handle), the
+ * queue id and return value start at 0, and the request is not done.
+ *
+ * @host: driver instance the request belongs to
+ * @req: request state to initialise
+ */
+static inline void nvmf_req_co_init(NvmfHost *host, NvmfReq *req)
+{
+    req->host = host;
+    req->req = NULL;
+    req->co = qemu_coroutine_self();
+    req->qid = 0;
+    req->retval = 0;
+    req->done = false;
+}
+
+/*
+ * Completion callback passed to the nvmf_*_async() calls.
+ *
+ * Translates the NVMe status into an errno-style value, marks the request
+ * done and wakes the coroutine waiting in nvmf_co_io().
+ *
+ * @status: NVMe status code of the completed command
+ * @opaque: the NvmfReq that was passed at submission time
+ */
+static void nvmf_req_cb(unsigned short status, void *opaque)
+{
+    NvmfReq *r = opaque;
+    int ret;
+
+    if (status == NVME_SUCCESS) {
+        ret = 0;
+    } else if (status == NVME_INVALID_OPCODE) {
+        ret = -ENOSYS;
+    } else if (status == NVME_INVALID_FIELD) {
+        ret = -EINVAL;
+    } else {
+        /* every other status is reported as a generic I/O error */
+        ret = -EIO;
+    }
+
+    r->retval = ret;
+    r->done = true;
+    aio_co_wake(r->co);
+}
+
+/*
+ * Submit one request to the controller and wait for it to complete.
+ *
+ * The I/O queue is chosen round-robin. If submission fails because the
+ * queue is full, the coroutine sleeps on the queue's wait queue and retries
+ * after another request on that queue has completed. Any other submission
+ * failure is reported as -EIO.
+ *
+ * @bs: block node whose opaque is the NvmfHost
+ * @offset: byte offset, must be logical-block aligned
+ * @bytes: byte length, must be logical-block aligned
+ * @qiov: data vector for reads and writes; not used for discard and
+ *        write-zeroes
+ * @flags: request flags (unused)
+ * @op: operation to perform
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static coroutine_fn int nvmf_co_io(BlockDriverState *bs, uint64_t offset,
+                                   uint64_t bytes, QEMUIOVector *qiov,
+                                   int flags, NvmfOp op)
+{
+    NvmfHost *host = bs->opaque;
+    NvmfQueue *queue;
+    NvmfReq req;
+    /* discard and write-zeroes carry no data, only a length */
+    struct iovec iov = {.iov_base = NULL, .iov_len = bytes};
+    int qid;
+
+    if (!nvmf_is_lba_aligned(host->ctrl, offset, bytes)) {
+        return -EINVAL;
+    }
+
+    /* RR policy: I/O queue ids run from 1 to nr_ioqueues */
+    qid = (host->requests++ % host->nr_ioqueues) + 1;
+    queue = host->ioqueues + qid;
+    nvmf_req_co_init(host, &req);
+    req.qid = qid;
+
+retry:
+    switch (op) {
+    case NvmfOpRead:
+        req.req = nvmf_read_async(host->ctrl, qid, qiov->iov, qiov->niov,
+                                  offset, 0, nvmf_req_cb, &req);
+        break;
+    case NvmfOpWrite:
+        req.req = nvmf_write_async(host->ctrl, qid, qiov->iov, qiov->niov,
+                                   offset, 0, nvmf_req_cb, &req);
+        break;
+    case NvmfOpDiscard:
+        req.req = nvmf_discard_async(host->ctrl, qid, &iov, 1, offset, 0,
+                                     nvmf_req_cb, &req);
+        break;
+    case NvmfOpWritezeroes:
+        req.req = nvmf_writezeroes_async(host->ctrl, qid, &iov, 1, offset, 0,
+                                         nvmf_req_cb, &req);
+        break;
+    default:
+        return -EOPNOTSUPP;
+    }
+
+    if (!req.req) {
+        if (nvmf_queue_nr_inflight(host->ctrl, qid) !=
+            nvmf_queue_depth(host->ctrl, qid)) {
+            /* not a full queue: fail the request instead of aborting */
+            return -EIO;
+        }
+
+        /* queue is full: sleep until a request on it completes */
+        qemu_co_mutex_lock(&queue->wait_lock);
+        qemu_co_queue_wait(&queue->wait_queue, &queue->wait_lock);
+        qemu_co_mutex_unlock(&queue->wait_lock);
+        goto retry;
+    }
+
+    /* nvmf_req_cb() sets done and re-enters this coroutine */
+    while (!req.done) {
+        qemu_coroutine_yield();
+    }
+
+    nvmf_req_free(req.req);
+
+    /* a slot was just freed: let the coroutines waiting on this queue retry */
+    qemu_co_mutex_lock(&queue->wait_lock);
+    if (!qemu_co_queue_empty(&queue->wait_queue)) {
+        qemu_co_queue_restart_all(&queue->wait_queue);
+    }
+    qemu_co_mutex_unlock(&queue->wait_lock);
+
+    return req.retval;
+}
+
+/*
+ * .bdrv_co_preadv callback: read @bytes at @offset into @qiov.
+ * Forwards to nvmf_co_io() with NvmfOpRead and returns its result.
+ */
+static coroutine_fn int nvmf_co_preadv(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov,
+ int flags)
+{
+ return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpRead);
+}
+
+/*
+ * .bdrv_co_pwritev callback: write @bytes from @qiov at @offset.
+ * Forwards to nvmf_co_io() with NvmfOpWrite and returns its result.
+ */
+static coroutine_fn int nvmf_co_pwritev(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov,
+ int flags)
+{
+ return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpWrite);
+}
+
+/*
+ * .bdrv_co_pwrite_zeroes callback: zero @bytes at @offset.
+ * Forwards to nvmf_co_io() with NvmfOpWritezeroes and no data vector.
+ * Note that @flags is not forwarded; 0 is passed instead.
+ */
+static coroutine_fn int nvmf_co_pwrite_zeroes(BlockDriverState *bs,
+ int64_t offset, int bytes,
+ BdrvRequestFlags flags)
+{
+ return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpWritezeroes);
+}
+
+/*
+ * .bdrv_co_pdiscard callback: discard @bytes at @offset.
+ * Forwards to nvmf_co_io() with NvmfOpDiscard and no data vector.
+ */
+static coroutine_fn int nvmf_co_pdiscard(BlockDriverState *bs, int64_t offset,
+ int bytes)
+{
+ return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpDiscard);
+}
+
+/*
+ * .bdrv_refresh_limits callback: fill bs->bl from the controller.
+ *
+ * The request alignment is the larger of BDRV_SECTOR_SIZE and the
+ * namespace's logical block size; max_transfer and max_iov come from
+ * nvmf_ctrl_mdts() and nvmf_max_iov().
+ *
+ * @bs: block node whose limits are refreshed
+ * @errp: unused
+ */
+static void nvmf_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    NvmfHost *h = bs->opaque;
+    nvmf_ctrl_t ctrl = h->ctrl;
+    unsigned char shift = nvmf_ns_lbads(ctrl, nvmf_ns_id(ctrl));
+
+    bs->bl.request_alignment = MAX(BDRV_SECTOR_SIZE, (1 << shift));
+    bs->bl.max_transfer = nvmf_ctrl_mdts(ctrl);
+    bs->bl.max_iov = nvmf_max_iov(ctrl);
+}
+
+/*
+ * .bdrv_getlength callback.
+ *
+ * Returns the namespace size in bytes: NSZE logical blocks of
+ * (1 << LBADS) bytes each.
+ */
+static int64_t nvmf_getlength(BlockDriverState *bs)
+{
+    NvmfHost *host = bs->opaque;
+    unsigned int nsid = nvmf_ns_id(host->ctrl);
+    unsigned char lbads = nvmf_ns_lbads(host->ctrl, nsid);
+    uint64_t nsze = nvmf_ns_nsze(host->ctrl, nsid);
+
+    /* shift in 64 bits so the byte count survives where long is 32 bits */
+    return (int64_t)(nsze << lbads);
+}
+
+/*
+ * .bdrv_detach_aio_context callback: remove the controller fd handler from
+ * the AioContext it is currently registered on.
+ *
+ * @bs: block node whose opaque is the NvmfHost
+ */
+static void nvmf_detach_aio_context(BlockDriverState *bs)
+{
+    NvmfHost *h = bs->opaque;
+    int fd;
+
+    qemu_spin_lock(&h->lock);
+    fd = nvmf_ctrl_fd(h->ctrl);
+    /* all-NULL handlers unregister the fd */
+    aio_set_fd_handler(h->aio_context, fd, false, NULL, NULL, NULL, NULL);
+    qemu_spin_unlock(&h->lock);
+}
+
+/*
+ * Driver registration for the "nvmf-tcp" protocol. Every callback is shared
+ * with bdrv_nvmf_rdma; only the format and protocol names differ.
+ */
+static BlockDriver bdrv_nvmf_tcp = {
+ .format_name = "nvmf-tcp",
+ .protocol_name = "nvmf-tcp",
+ .instance_size = sizeof(NvmfHost),
+ .bdrv_parse_filename = nvmf_parse_filename,
+ .bdrv_file_open = nvmf_file_open,
+ .bdrv_close = nvmf_close,
+ .bdrv_getlength = nvmf_getlength,
+ .bdrv_refresh_limits = nvmf_refresh_limits,
+ .bdrv_co_pdiscard = nvmf_co_pdiscard,
+ .bdrv_co_preadv = nvmf_co_preadv,
+ .bdrv_co_pwritev = nvmf_co_pwritev,
+ .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes,
+ .bdrv_detach_aio_context = nvmf_detach_aio_context,
+ .bdrv_attach_aio_context = nvmf_attach_aio_context,
+};
+
+/*
+ * Driver registration for the "nvmf-rdma" protocol. Every callback is shared
+ * with bdrv_nvmf_tcp; only the format and protocol names differ.
+ */
+static BlockDriver bdrv_nvmf_rdma = {
+ .format_name = "nvmf-rdma",
+ .protocol_name = "nvmf-rdma",
+ .instance_size = sizeof(NvmfHost),
+ .bdrv_parse_filename = nvmf_parse_filename,
+ .bdrv_file_open = nvmf_file_open,
+ .bdrv_close = nvmf_close,
+ .bdrv_getlength = nvmf_getlength,
+ .bdrv_refresh_limits = nvmf_refresh_limits,
+ .bdrv_co_pdiscard = nvmf_co_pdiscard,
+ .bdrv_co_preadv = nvmf_co_preadv,
+ .bdrv_co_pwritev = nvmf_co_pwritev,
+ .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes,
+ .bdrv_detach_aio_context = nvmf_detach_aio_context,
+ .bdrv_attach_aio_context = nvmf_attach_aio_context,
+};
+
+/* Register both protocol drivers (TCP first, then RDMA). */
+static void nvmf_block_init(void)
+{
+ bdrv_register(&bdrv_nvmf_tcp);
+ bdrv_register(&bdrv_nvmf_rdma);
+}
+
+/* Hook nvmf_block_init() into QEMU's block-module initialisation. */
+block_init(nvmf_block_init);
diff --git a/configure b/configure
index 8dcb9965b2..608b108062 100755
--- a/configure
+++ b/configure
@@ -393,6 +393,7 @@ vss_win32_sdk="$default_feature"
win_sdk="no"
want_tools="$default_feature"
libiscsi="auto"
+libnvmf="auto"
libnfs="auto"
coroutine=""
coroutine_pool="$default_feature"
@@ -1127,6 +1128,10 @@ for opt do
;;
--enable-libiscsi) libiscsi="enabled"
;;
+ --disable-libnvmf) libnvmf="disabled"
+ ;;
+ --enable-libnvmf) libnvmf="enabled"
+ ;;
--disable-libnfs) libnfs="disabled"
;;
--enable-libnfs) libnfs="enabled"
@@ -1889,6 +1894,7 @@ disabled with --disable-FEATURE, default is enabled if
available
spice-protocol spice-protocol
rbd rados block device (rbd)
libiscsi iscsi support
+ libnvmf NVMe over Fabric support
libnfs nfs support
smartcard smartcard support (libcacard)
u2f U2F support (u2f-emu)
@@ -6448,7 +6454,7 @@ if test "$skip_meson" = no; then
-Dvhost_user_blk_server=$vhost_user_blk_server
-Dmultiprocess=$multiprocess \
-Dfuse=$fuse -Dfuse_lseek=$fuse_lseek
-Dguest_agent_msi=$guest_agent_msi -Dbpf=$bpf\
$(if test "$default_features" = no; then echo
"-Dauto_features=disabled"; fi) \
- -Dtcg_interpreter=$tcg_interpreter \
+ -Dtcg_interpreter=$tcg_interpreter -Dlibnvmf=$libnvmf\
$cross_arg \
"$PWD" "$source_path"
diff --git a/meson.build b/meson.build
index 626cf932c1..ccdc36266a 100644
--- a/meson.build
+++ b/meson.build
@@ -478,6 +478,12 @@ if not get_option('libiscsi').auto() or have_block
required: get_option('libiscsi'),
method: 'pkg-config', kwargs: static_kwargs)
endif
+libnvmf = not_found
+if not get_option('libnvmf').auto() or have_block
+ libnvmf = dependency('libnvmf', version: '>=0.1',
+ required: get_option('libnvmf'),
+ method: 'pkg-config', kwargs: static_kwargs)
+endif
zstd = not_found
if not get_option('zstd').auto() or have_block
zstd = dependency('libzstd', version: '>=1.4.0',
@@ -1150,6 +1156,7 @@ config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
+config_host_data.set('CONFIG_LIBNVMF', libnvmf.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
config_host_data.set('CONFIG_RBD', rbd.found())
config_host_data.set('CONFIG_SDL', sdl.found())
@@ -2736,6 +2743,7 @@ summary_info += {'usb net redir':
config_host.has_key('CONFIG_USB_REDIR')}
summary_info += {'OpenGL support': config_host.has_key('CONFIG_OPENGL')}
summary_info += {'GBM': config_host.has_key('CONFIG_GBM')}
summary_info += {'libiscsi support': libiscsi.found()}
+summary_info += {'libnvmf support': libnvmf.found()}
summary_info += {'libnfs support': libnfs.found()}
if targetos == 'windows'
if config_host.has_key('CONFIG_GUEST_AGENT')
diff --git a/meson_options.txt b/meson_options.txt
index 3d304cac96..102705824f 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -68,6 +68,8 @@ option('glusterfs', type : 'feature', value : 'auto',
description: 'Glusterfs block device driver')
option('libiscsi', type : 'feature', value : 'auto',
description: 'libiscsi userspace initiator')
+option('libnvmf', type : 'feature', value : 'auto',
+ description: 'NVMe over Fabric userspace initiator')
option('libnfs', type : 'feature', value : 'auto',
description: 'libnfs block device driver')
option('mpath', type : 'feature', value : 'auto',
--
2.25.1
- [PATCH 0/1] introduce nvmf block driver, zhenwei pi, 2021/06/07
- [PATCH 1/1] nvmf: introduce a new block driver,
zhenwei pi <=
- Re: [PATCH 0/1] introduce nvmf block driver, Stefan Hajnoczi, 2021/06/07
- Re: Re: [PATCH 0/1] introduce nvmf block driver, zhenwei pi, 2021/06/07
- Re: Re: [PATCH 0/1] introduce nvmf block driver, Stefan Hajnoczi, 2021/06/08
- Re: Re: Re: [PATCH 0/1] introduce nvmf block driver, zhenwei pi, 2021/06/08
- Re: Re: Re: [PATCH 0/1] introduce nvmf block driver, Stefan Hajnoczi, 2021/06/08
- Re: Re: Re: Re: [PATCH 0/1] introduce nvmf block driver, zhenwei pi, 2021/06/08
- Re: Re: Re: Re: [PATCH 0/1] introduce nvmf block driver, Stefan Hajnoczi, 2021/06/08