[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support
From: |
Klaus Jensen |
Subject: |
Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support |
Date: |
Wed, 8 Jun 2022 22:55:30 +0200 |
On Jun 8 09:36, Jinhao Fan wrote:
> Implement Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
> and Shadow Doorbell buffer & EventIdx buffer handling logic (Section 7.13
> in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config
> command, the nvme_dbbuf_config function tries to associate each existing
> SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer address.
> Queues created after the Doorbell Buffer Config command will have the
> doorbell buffers associated with them when they are initialized.
>
> In nvme_process_sq and nvme_post_cqes, proactively check for Shadow
> Doorbell buffer changes instead of waiting for doorbell register changes.
> This reduces the number of MMIOs.
>
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
> hw/nvme/ctrl.c | 95 ++++++++++++++++++++++++++++++++++++++++++--
> hw/nvme/nvme.h | 8 ++++
> include/block/nvme.h | 2 +
> 3 files changed, 102 insertions(+), 3 deletions(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 03760ddeae..d3f6c432df 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -223,6 +223,7 @@ static const uint32_t nvme_cse_acs[256] = {
> [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
> [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
> [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
> + [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
> [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
> };
>
> @@ -1304,6 +1305,12 @@ static inline void nvme_blk_write(BlockBackend *blk,
> int64_t offset,
> }
> }
>
> +static void nvme_update_cq_head(NvmeCQueue *cq)
> +{
> + pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
> + sizeof(cq->head));
> +}
> +
> static void nvme_post_cqes(void *opaque)
> {
> NvmeCQueue *cq = opaque;
> @@ -1316,6 +1323,10 @@ static void nvme_post_cqes(void *opaque)
> NvmeSQueue *sq;
> hwaddr addr;
>
> + if (cq->cqid && n->dbbuf_enabled) {
> + nvme_update_cq_head(cq);
Shouldn't we update the cq head eventidx here (prior to reading the
doorbell buffer)? Like we do for the sq tail?
> + }
> +
> if (nvme_cq_full(cq)) {
> break;
> }
> @@ -4237,6 +4248,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest
> *req)
> static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> uint16_t sqid, uint16_t cqid, uint16_t size)
> {
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> int i;
> NvmeCQueue *cq;
>
> @@ -4256,6 +4268,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n,
> uint64_t dma_addr,
> }
> sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
>
> + if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
> + sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
> + sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
> + }
> +
> assert(n->cq[cqid]);
> cq = n->cq[cqid];
> QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
> @@ -4599,6 +4616,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n,
> uint64_t dma_addr,
> uint16_t cqid, uint16_t vector, uint16_t size,
> uint16_t irq_enabled)
> {
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> int ret;
>
> if (msix_enabled(&n->parent_obj)) {
> @@ -4615,6 +4633,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n,
> uint64_t dma_addr,
> cq->head = cq->tail = 0;
> QTAILQ_INIT(&cq->req_list);
> QTAILQ_INIT(&cq->sq_list);
> + if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
> + cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
> + cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
> + }
> n->cq[cqid] = cq;
> cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> }
> @@ -5767,6 +5789,43 @@ out:
> return status;
> }
>
> +static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
> +{
> + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
> + uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
> + uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
> + int i;
> +
> + /* Address should be page aligned */
> + if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
> + return NVME_INVALID_FIELD | NVME_DNR;
> + }
> +
> + /* Save shadow buffer base addr for use during queue creation */
> + n->dbbuf_dbs = dbs_addr;
> + n->dbbuf_eis = eis_addr;
> + n->dbbuf_enabled = true;
> +
> + for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
> + NvmeSQueue *sq = n->sq[i];
> + NvmeCQueue *cq = n->cq[i];
> +
> + if (sq) {
> + /* Submission queue tail pointer location, 2 * QID * stride */
> + sq->db_addr = dbs_addr + 2 * i * stride;
> + sq->ei_addr = eis_addr + 2 * i * stride;
> + }
> +
> + if (cq) {
> + /* Completion queue head pointer location, (2 * QID + 1) *
> stride */
> + cq->db_addr = dbs_addr + (2 * i + 1) * stride;
> + cq->ei_addr = eis_addr + (2 * i + 1) * stride;
> + }
> + }
Why no love for the admin queue? :)
You are special-casing the admin queue below in process_sq() and
process_db(), as well as above in post_cqes(). As I'm reading the spec,
I do not see why the Admin queue should be treated differently wrt.
doorbell buffer configuration. Could this be a left-over from the
behavior in the original Google extension (prior to going into NVMe)?
I peeked into the kernel and it looks like it doesn't enable doorbell
buffers for the admin queue, only for subsequently created I/O queues.
Keith, is this a bug in the kernel? If the code here would expect the
doorbell buffer to be updated for the admin queue as well, would we
break?
> +
> + return NVME_SUCCESS;
> +}
> +
> static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
> {
> trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
> @@ -5809,6 +5868,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest
> *req)
> return nvme_aer(n, req);
> case NVME_ADM_CMD_NS_ATTACHMENT:
> return nvme_ns_attachment(n, req);
> + case NVME_ADM_CMD_DBBUF_CONFIG:
> + return nvme_dbbuf_config(n, req);
> case NVME_ADM_CMD_FORMAT_NVM:
> return nvme_format(n, req);
> default:
> @@ -5818,6 +5879,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n,
> NvmeRequest *req)
> return NVME_INVALID_OPCODE | NVME_DNR;
> }
>
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> +{
> + pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> + sizeof(sq->tail));
> +}
> +
> +static void nvme_update_sq_tail(NvmeSQueue *sq)
> +{
> + pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
> + sizeof(sq->tail));
> +}
> +
> static void nvme_process_sq(void *opaque)
> {
> NvmeSQueue *sq = opaque;
> @@ -5829,6 +5902,10 @@ static void nvme_process_sq(void *opaque)
> NvmeCmd cmd;
> NvmeRequest *req;
>
> + if (sq->sqid && n->dbbuf_enabled) {
> + nvme_update_sq_tail(sq);
> + }
> +
> while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
> addr = sq->dma_addr + sq->head * n->sqe_size;
> if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
> @@ -5852,6 +5929,11 @@ static void nvme_process_sq(void *opaque)
> req->status = status;
> nvme_enqueue_req_completion(cq, req);
> }
> +
> + if (sq->sqid && n->dbbuf_enabled) {
> + nvme_update_sq_eventidx(sq);
> + nvme_update_sq_tail(sq);
> + }
> }
> }
>
> @@ -5889,6 +5971,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
> n->aer_queued = 0;
> n->outstanding_aers = 0;
> n->qs_created = false;
> + n->dbbuf_dbs = 0;
> + n->dbbuf_eis = 0;
> + n->dbbuf_enabled = false;
> }
>
> static void nvme_ctrl_shutdown(NvmeCtrl *n)
> @@ -6397,7 +6482,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr,
> int val)
> trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> - cq->head = new_head;
> + if (!cq->cqid || !n->dbbuf_enabled) {
> + cq->head = new_head;
> + }
> if (start_sqs) {
> NvmeSQueue *sq;
> QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> @@ -6454,7 +6541,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr,
> int val)
>
> trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
>
> - sq->tail = new_tail;
> + if (!sq->sqid || !n->dbbuf_enabled) {
> + sq->tail = new_tail;
> + }
> timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
> }
> @@ -6733,7 +6822,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice
> *pci_dev)
>
> id->mdts = n->params.mdts;
> id->ver = cpu_to_le32(NVME_SPEC_VER);
> - id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
> + id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT |
> NVME_OACS_DBBUF);
> id->cntrltype = 0x1;
>
> /*
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 6773819325..4452e4b1bf 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -334,6 +334,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
> case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
> case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
> case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
> + case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
> case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
> default: return "NVME_ADM_CMD_UNKNOWN";
> }
> @@ -365,6 +366,8 @@ typedef struct NvmeSQueue {
> uint32_t tail;
> uint32_t size;
> uint64_t dma_addr;
> + uint64_t db_addr;
> + uint64_t ei_addr;
> QEMUTimer *timer;
> NvmeRequest *io_req;
> QTAILQ_HEAD(, NvmeRequest) req_list;
> @@ -382,6 +385,8 @@ typedef struct NvmeCQueue {
> uint32_t vector;
> uint32_t size;
> uint64_t dma_addr;
> + uint64_t db_addr;
> + uint64_t ei_addr;
> QEMUTimer *timer;
> QTAILQ_HEAD(, NvmeSQueue) sq_list;
> QTAILQ_HEAD(, NvmeRequest) req_list;
> @@ -432,6 +437,9 @@ typedef struct NvmeCtrl {
> uint64_t starttime_ms;
> uint16_t temperature;
> uint8_t smart_critical_warning;
> + uint64_t dbbuf_dbs;
> + uint64_t dbbuf_eis;
> + bool dbbuf_enabled;
>
> struct {
> MemoryRegion mem;
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 3737351cc8..5b522d7b0e 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -595,6 +595,7 @@ enum NvmeAdminCommands {
> NVME_ADM_CMD_ACTIVATE_FW = 0x10,
> NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
> NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
> + NVME_ADM_CMD_DBBUF_CONFIG = 0x7c,
> NVME_ADM_CMD_FORMAT_NVM = 0x80,
> NVME_ADM_CMD_SECURITY_SEND = 0x81,
> NVME_ADM_CMD_SECURITY_RECV = 0x82,
> @@ -1134,6 +1135,7 @@ enum NvmeIdCtrlOacs {
> NVME_OACS_FORMAT = 1 << 1,
> NVME_OACS_FW = 1 << 2,
> NVME_OACS_NS_MGMT = 1 << 3,
> + NVME_OACS_DBBUF = 1 << 8,
> };
>
> enum NvmeIdCtrlOncs {
> --
> 2.25.1
>
--
One of us - No more doubt, silence or taboo about mental illness.
signature.asc
Description: PGP signature
- [PATCH 0/2] hw/nvme: Add shadow doorbell buffer support, Jinhao Fan, 2022/06/07
- [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Jinhao Fan, 2022/06/07
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support,
Klaus Jensen <=
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Jinhao Fan, 2022/06/08
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Keith Busch, 2022/06/09
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, John Levon, 2022/06/09
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Klaus Jensen, 2022/06/09
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, John Levon, 2022/06/09
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Jinhao Fan, 2022/06/12
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Keith Busch, 2022/06/13
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Jinhao Fan, 2022/06/14
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Keith Busch, 2022/06/14
- Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support, Jinhao Fan, 2022/06/14