From: Klaus Jensen <k.jensen@samsung.com>
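Add support for Scatter Gather Lists (SGLs) as an alternative to PRPs for
describing the data transfer of a command. For now, only the Data Block,
Segment and Last Segment descriptor types are supported.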
See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
include/block/nvme.h | 6 +-
hw/block/nvme.c | 329 ++++++++++++++++++++++++++++++++++--------
hw/block/trace-events | 4 +
3 files changed, 279 insertions(+), 60 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 65e68a82c897..58647bcdad0b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -412,9 +412,9 @@ typedef union NvmeCmdDptr {
} NvmeCmdDptr;
enum NvmePsdt {
- PSDT_PRP = 0x0,
- PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
- PSDT_SGL_MPTR_SGL = 0x2,
+ NVME_PSDT_PRP = 0x0, /* nvme_map_dptr() maps dptr.prp1/prp2 as PRPs */
+ NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1, /* nvme_map_dptr() maps dptr.sgl as an SGL */
+ NVME_PSDT_SGL_MPTR_SGL = 0x2, /* data is an SGL too; NOTE(review): MPTR_* presumably names the metadata pointer format - confirm */
};
typedef struct QEMU_PACKED NvmeCmd {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3b901efd1ec0..c5d09ff1edf5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1,
uint64_t prp2,
return NVME_SUCCESS;
}
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
- uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map up to 'nsgld' Data Block descriptors from 'segment' into 'qsg'/'iov'
+ * using nvme_map_addr().
+ *
+ * 'len' points at the number of bytes of the transfer that are still
+ * unmapped; the number of bytes mapped here is subtracted from it.
+ * Descriptors with a zero length are skipped.
+ *
+ * Returns NVME_SUCCESS, or one of the following with NVME_DNR set:
+ *   - NVME_INVALID_NUM_SGL_DESCRS if a Segment or Last Segment descriptor is
+ *     found among the descriptors,
+ *   - NVME_SGL_DESCR_TYPE_INVALID for any other non Data Block descriptor,
+ *   - NVME_DATA_SGL_LEN_INVALID if descriptors remain after all data has been
+ *     mapped and the controller does not report
+ *     NVME_CTRL_SGLS_EXCESS_LENGTH, or if address + length would wrap around,
+ * or whatever status nvme_map_addr() fails with.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+                                  QEMUIOVector *iov,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t dlen;
+    uint16_t status;
+
+    /* 'nsgld' is unsigned 64-bit; index with the same type */
+    for (uint64_t i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        switch (type) {
+        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+            break;
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+        default:
+            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+        }
+
+        dlen = le32_to_cpu(segment[i].len);
+        if (!dlen) {
+            continue;
+        }
+
+        if (*len == 0) {
+            /*
+             * All data has been mapped, but the SGL contains additional
+             * segments and/or descriptors. The controller may advertise that
+             * it accepts such SGLs, in which case the rest is ignored.
+             */
+            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                /* leaves the for loop; the remaining descriptors are ignored */
+                break;
+            }
+
+            trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        /* never map more than what is left of the transfer */
+        trans_len = MIN(*len, dlen);
+        addr = le64_to_cpu(segment[i].addr);
+
+        /* reject descriptors whose address range wraps the address space */
+        if (UINT64_MAX - addr < dlen) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        if (status) {
+            return status;
+        }
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Map the 'len' byte transfer described by the SGL descriptor 'sgl' into
+ * 'qsg'/'iov'.
+ *
+ * 'sgl' is either a Data Block descriptor, which is mapped directly, or a
+ * Segment/Last Segment descriptor pointing at a segment of further
+ * descriptors. Segments are followed until one ends in a Data Block
+ * descriptor.
+ *
+ * Returns NVME_SUCCESS when all 'len' bytes have been mapped, otherwise an
+ * NVMe status code. Every failure after the initial CMB check goes through
+ * the 'unmap' label, which destroys 'iov' and 'qsg' if they have been set up.
+ */
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                             NvmeSglDescriptor sgl, size_t len,
                              NvmeRequest *req)
+{
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially huge SGL. The spec allows the SGL
+     * to be larger (as in number of bytes required to describe the SGL
+     * descriptors and segment chain) than the command transfer size, so it is
+     * not bounded by MDTS.
+     *
+     * This is an enumeration constant rather than a 'const int' so that
+     * 'segment' is a fixed-size array and not a variable-length array.
+     */
+    enum { SEG_CHUNK_SIZE = 256 };
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    uint8_t seg_type;
+    bool sgl_in_cmb = false;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    /*
+     * If the segment is located in the CMB, the submission queue of the
+     * request must also reside there.
+     */
+    if (nvme_addr_is_cmb(n, addr)) {
+        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        sgl_in_cmb = true;
+    }
+
+    for (;;) {
+        /*
+         * Latch the descriptor type now. From the second iteration on,
+         * 'sgld' points into 'segment', which is overwritten by the reads
+         * below, so 'sgld->type' must not be consulted after them.
+         */
+        seg_type = NVME_SGL_TYPE(sgld->type);
+
+        switch (seg_type) {
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            break;
+        default:
+            /* earlier iterations may have mapped data; clean up */
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        seg_len = le32_to_cpu(sgld->len);
+
+        /* check the length of the (Last) Segment descriptor */
+        if (!seg_len || seg_len & 0xf) {
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        /* reject segments whose address range wraps the address space */
+        if (UINT64_MAX - addr < seg_len) {
+            status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+            goto unmap;
+        }
+
+        nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+        /*
+         * Consume full chunks while more than one chunk is left; the final
+         * (possibly full) chunk is handled below because its last descriptor
+         * needs special treatment.
+         */
+        while (nsgld > SEG_CHUNK_SIZE) {
+            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+                trace_pci_nvme_err_addr_read(addr);
+                status = NVME_DATA_TRAS_ERROR;
+                goto unmap;
+            }
+
+            status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
+                                       &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            nsgld -= SEG_CHUNK_SIZE;
+            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
+        }
+
+        ret = nvme_addr_read(n, addr, segment, nsgld *
+                             sizeof(NvmeSglDescriptor));
+        if (ret) {
+            trace_pci_nvme_err_addr_read(addr);
+            status = NVME_DATA_TRAS_ERROR;
+            goto unmap;
+        }
+
+        last_sgld = &segment[nsgld - 1];
+
+        /* if the segment ends with a Data Block, then we are done */
+        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            goto out;
+        }
+
+        /*
+         * If the last descriptor was not a Data Block, then the current
+         * segment must not be a Last Segment.
+         */
+        if (seg_type == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        sgld = last_sgld;
+        addr = le64_to_cpu(sgld->addr);
+
+        /*
+         * Do not map the last descriptor; it will be a Segment or Last Segment
+         * descriptor and is handled by the next iteration.
+         */
+        status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        /*
+         * If the next segment is in the CMB, make sure that the sgl was
+         * already located there.
+         */
+        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
+            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
+            goto unmap;
+        }
+    }
+
+out:
+    /* if there is any residual left in len, the SGL was too short */
+    if (len) {
+        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        goto unmap;
+    }
+
+    return NVME_SUCCESS;
+
+unmap:
+    if (iov->iov) {
+        qemu_iovec_destroy(iov);
+    }
+
+    if (qsg->sg) {
+        qemu_sglist_destroy(qsg);
+    }
+
+    return status;
+}
+
+/*
+ * Map the 'len' byte data transfer of 'req' according to the PSDT field of
+ * the command flags: as PRPs through nvme_map_prp(), or as an SGL through
+ * nvme_map_sgl() into req->qsg/req->iov.
+ *
+ * Returns the status of the selected mapping function, or
+ * NVME_INVALID_FIELD | NVME_DNR if an SGL is used on the admin queue
+ * (sqid 0) or the PSDT value is not one of the NvmePsdt values.
+ */
+static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
+{
+    uint64_t prp1, prp2;
+
+    switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
+    case NVME_PSDT_PRP:
+        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+        return nvme_map_prp(n, prp1, prp2, len, req);
+    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
+    case NVME_PSDT_SGL_MPTR_SGL:
+        /* SGLs shall not be used for Admin commands in NVMe over PCIe */
+        if (!req->sq->sqid) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
+                            req);
+    default:
+        /*
+         * Unknown PSDT encoding: resubmitting the same command cannot
+         * succeed, so flag Do Not Retry like the other invalid-field path.
+         */
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
+
+static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+ DMADirection dir, NvmeRequest *req)
{
uint16_t status = NVME_SUCCESS;
- status = nvme_map_prp(n, prp1, prp2, len, req);
+ status = nvme_map_dptr(n, len, req);
if (status) {
return status;
}
@@ -458,15 +707,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr,
uint32_t len,
return status;
}
-static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
-{
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
- return nvme_map_prp(n, prp1, prp2, len, req);
-}