[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH] block: introduce zone append write for zoned devices
From: |
Sam Li |
Subject: |
Re: [PATCH] block: introduce zone append write for zoned devices |
Date: |
Sun, 11 Sep 2022 16:00:36 +0800 |
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年9月11日周日 14:06写道:
>
> On 2022/09/10 15:38, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of the write is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been placed in the zone.
>
> s/placed/written
>
> You need to explain more about what this patch does:
>
> Since Linux does not provide a user API to issue zone append operations to
> zoned
> devices from user space, the file-posix driver is modified to add zone append
> emulation using regular write operations. To do this, the file-posix driver
> tracks the wp location of all zones of the device.... Blah.
>
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> > block/block-backend.c | 65 +++++++++++
> > block/file-posix.c | 169 ++++++++++++++++++++++++++++-
> > block/io.c | 21 ++++
> > block/raw-format.c | 7 ++
> > include/block/block-common.h | 2 +
> > include/block/block-io.h | 3 +
> > include/block/block_int-common.h | 9 ++
> > include/block/raw-aio.h | 4 +-
> > include/sysemu/block-backend-io.h | 9 ++
> > qemu-io-cmds.c | 62 +++++++++++
> > tests/qemu-iotests/tests/zoned.out | 7 ++
> > tests/qemu-iotests/tests/zoned.sh | 9 ++
> > 12 files changed, 360 insertions(+), 7 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index ebe8d7bdf3..b77a1cb24b 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> > struct {
> > BlockZoneOp op;
> > } zone_mgmt;
> > + struct {
> > + int64_t *append_sector;
> > + } zone_append;
> > };
> > } BlkRwCo;
> >
> > @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> > BlockZoneOp op,
> > return &acb->common;
> > }
> >
> > +static void blk_aio_zone_append_entry(void *opaque) {
> > + BlkAioEmAIOCB *acb = opaque;
> > + BlkRwCo *rwco = &acb->rwco;
> > +
> > + rwco->ret = blk_co_zone_append(rwco->blk,
> > rwco->zone_append.append_sector,
> > + rwco->iobuf, rwco->flags);
> > + blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags,
> > + BlockCompletionFunc *cb, void *opaque) {
> > + BlkAioEmAIOCB *acb;
> > + Coroutine *co;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk);
> > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > + acb->rwco = (BlkRwCo) {
> > + .blk = blk,
> > + .ret = NOT_DONE,
> > + .flags = flags,
> > + .iobuf = qiov,
> > + .zone_append = {
> > + .append_sector = offset,
> > + },
> > + };
> > + acb->has_returned = false;
> > +
> > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > + bdrv_coroutine_enter(blk_bs(blk), co);
> > +
> > + acb->has_returned = true;
> > + if (acb->rwco.ret != NOT_DONE) {
> > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > + blk_aio_complete_bh, acb);
> > + }
> > +
> > + return &acb->common;
> > +}
> > +
> > /*
> > * Send a zone_report command.
> > * offset is a byte offset from the start of the device. No alignment
> > @@ -1920,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> > BlockZoneOp op,
> > return ret;
> > }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > + int ret;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk);
> > + blk_wait_while_drained(blk);
> > + if (!blk_is_available(blk)) {
> > + blk_dec_in_flight(blk);
> > + return -ENOMEDIUM;
> > + }
> > +
> > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > + blk_dec_in_flight(blk);
> > + return ret;
> > +}
> > +
> > void blk_drain(BlockBackend *blk)
> > {
> > BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 354de22860..65500e43f4 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -173,6 +173,7 @@ typedef struct BDRVRawState {
> > } stats;
> >
> > PRManager *pr_mgr;
> > + CoRwlock zones_lock;
> > } BDRVRawState;
> >
> > typedef struct BDRVRawReopenState {
> > @@ -206,6 +207,8 @@ typedef struct RawPosixAIOData {
> > struct {
> > struct iovec *iov;
> > int niov;
> > + int64_t *append_sector;
> > + BlockZoneDescriptor *zone;
> > } io;
> > struct {
> > uint64_t cmd;
> > @@ -1333,6 +1336,9 @@ static int hdev_get_max_segments(int fd, struct stat
> > *st) {
> > #endif
> > }
> >
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > + const struct blk_zone *blkz);
> > +static int do_zone_report(int64_t offset, int fd, struct
> > BlockZoneDescriptor *zones, unsigned int nrz);
> > static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> > {
> > BDRVRawState *s = bs->opaque;
> > @@ -1402,6 +1408,19 @@ static void raw_refresh_limits(BlockDriverState *bs,
> > Error **errp)
> > if (ret >= 0) {
> > bs->bl.max_active_zones = ret;
> > }
> > +
> > + ret = get_sysfs_long_val(&st, "logical_block_size");
> > + if (ret >= 0) {
> > + bs->bl.logical_block_size = ret;
> > + }
> > +
> > + ret = get_sysfs_long_val(&st, "nr_zones");
> > + if (ret > 0) {
> > + bs->bl.nr_zones = ret;
> > + int64_t zones_size = ret * sizeof(BlockZoneDescriptor);
> > + bs->bl.zones = g_malloc(zones_size);
> > + do_zone_report(0, s->fd, bs->bl.zones, ret);
>
> What happens if this fails ?
There are two cases:
1) ioctl fails: error reports in do_zone_report
2) the number of zones that reported are less than expected
I was wondering if 2) happens. If that happens, then do_zone_report
again or simply report an error.
>
> > + }
> > }
> > }
> >
> > @@ -1569,18 +1588,24 @@ static ssize_t
> > handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> > ssize_t len;
> >
> > do {
> > - if (aiocb->aio_type & QEMU_AIO_WRITE)
> > + if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > len = qemu_pwritev(aiocb->aio_fildes,
> > aiocb->io.iov,
> > aiocb->io.niov,
> > aiocb->aio_offset);
> > - else
> > + } else if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > + uint64_t wp = aiocb->aio_offset;
> > + len = qemu_pwritev(aiocb->aio_fildes,
> > + aiocb->io.iov,
> > + aiocb->io.niov,
> > + wp);
>
> That wp variable is not necessary, which makes this "else if" also
> unnecessary.
> What about simply:
>
> if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> len = qemu_pwritev(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,
> aiocb->aio_offset);
> else
> ...
>
> > len = qemu_preadv(aiocb->aio_fildes,
> > aiocb->io.iov,
> > aiocb->io.niov,
> > aiocb->aio_offset);
> > + }
> > } while (len == -1 && errno == EINTR);
> > -
>
> whiteline change.
>
> > if (len == -1) {
> > return -errno;
> > }
> > @@ -1604,6 +1629,12 @@ static ssize_t
> > handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> > (const char *)buf + offset,
> > aiocb->aio_nbytes - offset,
> > aiocb->aio_offset + offset);
> > + } else if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
> > + uint64_t wp = aiocb->aio_offset;
>
> This variable is not necessary.
>
> > + len = pwrite(aiocb->aio_fildes,
> > + (const char *)buf + offset,
> > + aiocb->aio_nbytes - offset,
> > + wp + offset);
> > } else {
> > len = pread(aiocb->aio_fildes,
> > buf + offset,
> > @@ -1638,7 +1669,6 @@ static int handle_aiocb_rw(void *opaque)
> > RawPosixAIOData *aiocb = opaque;
> > ssize_t nbytes;
> > char *buf;
> > -
>
> whiteline change.
>
> > if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> > /*
> > * If there is just a single buffer, and it is properly aligned
> > @@ -1692,7 +1722,7 @@ static int handle_aiocb_rw(void *opaque)
> > }
> >
> > nbytes = handle_aiocb_rw_linear(aiocb, buf);
> > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> > char *p = buf;
> > size_t count = aiocb->aio_nbytes, copy;
> > int i;
> > @@ -1713,6 +1743,25 @@ static int handle_aiocb_rw(void *opaque)
> >
> > out:
> > if (nbytes == aiocb->aio_nbytes) {
> > +#if defined(CONFIG_BLKZONED)
> > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> > + BlockZoneDescriptor *zone = aiocb->io.zone;
> > + int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > + if (zone) {
> > + qemu_mutex_init(&zone->lock);
> > + if (zone->type == BLK_ZT_SWR) {
> > + qemu_mutex_lock(&zone->lock);
> > + zone->wp += nr_sectors;
> > + qemu_mutex_unlock(&zone->lock);
> > + }
> > + qemu_mutex_destroy(&zone->lock);
>
> This is weird. you init the mutex, lock/unlock it and destroy it. So it has
> absolutely no meaning at all.
I was thinking that init every lock for all the zones or init the lock
for the zone that needed it. The confusion I have here is the cost of
initializing/destroying the lock.
>
> > +
> > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > + *aiocb->io.append_sector = zone->wp;
>
> This needs to be done with the zone mutex locked. Otherwise, when reaching
> this
> code, the zone wp may have changed already and so you would be returning the
> wrong position.
I see.
>
> > + }
> > + }
> > + }
> > +#endif
> > return 0;
> > } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> > if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > @@ -1724,6 +1773,13 @@ out:
> > }
> > } else {
> > assert(nbytes < 0);
> > + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) {
> > + BDRVRawState *s = aiocb->bs->opaque;
> > + qemu_co_rwlock_init(&s->zones_lock);
> > + qemu_co_rwlock_rdlock(&s->zones_lock);
> > + do_zone_report(0, aiocb->aio_fildes, aiocb->bs->bl.zones,
> > aiocb->bs->bl.nr_zones);
> > + qemu_co_rwlock_unlock(&s->zones_lock);
> > + }
> > return nbytes;
> > }
> > }
> > @@ -2012,17 +2068,23 @@ static int handle_aiocb_zone_report(void *opaque) {
> > static int handle_aiocb_zone_mgmt(void *opaque) {
> > #if defined(CONFIG_BLKZONED)
> > RawPosixAIOData *aiocb = opaque;
> > + BlockDriverState *bs = aiocb->bs;
> > int fd = aiocb->aio_fildes;
> > int64_t sector = aiocb->aio_offset / 512;
> > int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > struct blk_zone_range range;
> > + unsigned long zone_op = aiocb->zone_mgmt.zone_op;
> > int ret;
> >
> > + BlockZoneDescriptor *zones;
> > + zones = bs->bl.zones;
> > + int index = sector / bs->bl.zone_sectors;
> > +
> > /* Execute the operation */
> > range.sector = sector;
> > range.nr_sectors = nr_sectors;
> > do {
> > - ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> > + ret = ioctl(fd, zone_op, &range);
> > } while (ret != 0 && errno == EINTR);
> >
> > if (ret != 0) {
> > @@ -2030,6 +2092,28 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> > errno);
> > return -errno;
> > }
> > +
> > + if (aiocb->zone_mgmt.all) {
> > + for (int i = 0; i < bs->bl.nr_zones; ++i) {
> > + qemu_mutex_init(&zones[i].lock);
> > + qemu_mutex_lock(&zones[i].lock);
> > + zones[i].wp = zones[i].start;
> > + qemu_mutex_unlock(&zones[i].lock);
> > + qemu_mutex_destroy(&zones[i].lock);
>
> Same comment here. The mutex init/destroy calls should not be here. The zone
> mutex should be initialized when the zone array is first allocated and
> destroyed
> when the zone array is freed. In between these events, the zone mutex should
> NEVER be reinitialized.
>
> > + }
> > + } else if (zone_op == BLKRESETZONE) {
> > + qemu_mutex_init(&zones[index].lock);
> > + qemu_mutex_lock(&zones[index].lock);
> > + zones[index].wp = zones[index].start;
> > + qemu_mutex_unlock(&zones[index].lock);
> > + qemu_mutex_destroy(&zones[index].lock);
> > + } else if (zone_op == BLKFINISHZONE) {
> > + qemu_mutex_init(&zones[index].lock);
> > + qemu_mutex_lock(&zones[index].lock);
> > + zones[index].wp = zones[index].start + zones[index].length;
> > + qemu_mutex_unlock(&zones[index].lock);
> > + qemu_mutex_destroy(&zones[index].lock);
> > + }
> > return ret;
> > #else
> > return -ENOTSUP;
> > @@ -2340,6 +2424,17 @@ static int coroutine_fn raw_co_prw(BlockDriverState
> > *bs, uint64_t offset,
> > },
> > };
> >
> > + int64_t sector = offset / 512;
> > + struct stat st;
> > + if (fstat(s->fd, &st)) {
> > + return -1;
> > + }
> > + int64_t zone_sector = get_sysfs_long_val(&st, "chunk_sectors");
> > + if (zone_sector > 0) {
> > + int index = sector / bs->bl.zone_sectors;
> > + BlockZoneDescriptor *zone = &bs->bl.zones[index];
> > + acb.io.zone = zone;
>
> The zone variable is not necessary. You may also want to add a small inline
> helper function to get a zone pointer from a sector value.
>
> > + }
> > assert(qiov->size == bytes);
> > return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > }
> > @@ -2448,6 +2543,9 @@ static void
> > raw_aio_attach_aio_context(BlockDriverState *bs,
> > static void raw_close(BlockDriverState *bs)
> > {
> > BDRVRawState *s = bs->opaque;
> > +#if defined(CONFIG_BLKZONED)
> > + g_free(bs->bl.zones);
> > +#endif
> >
> > if (s->fd >= 0) {
> > qemu_close(s->fd);
> > @@ -3283,6 +3381,11 @@ static int coroutine_fn
> > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > zone_op_name = "BLKRESETZONE";
> > zone_op = BLKRESETZONE;
> > break;
> > + case BLK_ZO_RESET_ALL:
> > + zone_op_name = "BLKRESETZONE";
> > + zone_op = BLKRESETZONE;
> > + is_all = true;
> > + break;
> > default:
> > g_assert_not_reached();
> > }
> > @@ -3306,6 +3409,59 @@ static int coroutine_fn
> > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > #endif
> > }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > + int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags) {
> > +#if defined(CONFIG_BLKZONED)
> > + BDRVRawState *s = bs->opaque;
> > + int64_t zone_sector = bs->bl.zone_sectors;
> > + int64_t zone_sector_mask = zone_sector - 1;
> > + int64_t iov_len = 0;
> > + int64_t len = 0;
> > + RawPosixAIOData acb;
> > +
> > + if (*offset & zone_sector_mask) {
> > + error_report("offset %" PRId64 " is not aligned to zone size "
> > + "%" PRId64 "", *offset, zone_sector);
> > + return -EINVAL;
> > + }
> > +
> > + int64_t lbsz = bs->bl.logical_block_size;> + int64_t lbsz_mask =
> > lbsz - 1;
> > + for (int i = 0; i < qiov->niov; i++) {
> > + iov_len = qiov->iov[i].iov_len;
> > + if (iov_len & lbsz_mask) {
> > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to
> > block "
> > + "size %" PRId64 "", i, iov_len, lbsz);
> > + return -EINVAL;
> > + }
>
> This alignment check should be against the device write granularity, not the
> logical block size. The write granularity will always be equal to the device
> physical block size, which may or may not be equal to the device logical block
> size. E.g. a 512e SMR disk has a 512B logical block size but a 4096B physical
> block size. And the ZBC & ZAC specifications mandate that all write be aligned
> to the physical block size.
I see. I'll change it to physical block size.
>
> > + len += iov_len;
> > + }
> > +
> > + int64_t sector = *offset / 512; //??? did not multiply before
> > + BlockZoneDescriptor *zone = &bs->bl.zones[sector/zone_sector];
> > +
> > + acb = (RawPosixAIOData) {
> > + .bs = bs,
> > + .aio_fildes = s->fd,
> > + .aio_type = QEMU_AIO_ZONE_APPEND,
> > + .aio_offset = zone->wp * 512,
> > + .aio_nbytes = len,
> > + .io = {
> > + .iov = qiov->iov,
> > + .niov = qiov->niov,
> > + .zone = zone,
> > + .append_sector = offset,
> > + },
> > + };
> > +
> > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > +#else
> > + return -ENOTSUP;
> > +#endif
> > +}
> > +
> > static coroutine_fn int
> > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> > bool blkdev)
> > @@ -4081,6 +4237,7 @@ static BlockDriver bdrv_zoned_host_device = {
> > /* zone management operations */
> > .bdrv_co_zone_report = raw_co_zone_report,
> > .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > + .bdrv_co_zone_append = raw_co_zone_append,
> > };
> > #endif
> >
> > diff --git a/block/io.c b/block/io.c
> > index de9ec1d740..3ade9361e4 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3239,6 +3239,27 @@ out:
> > return co.ret;
> > }
> >
> > +int bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags)
> > +{
> > + BlockDriver *drv = bs->drv;
> > + CoroutineIOCompletion co = {
> > + .coroutine = qemu_coroutine_self(),
> > + };
> > + IO_CODE();
> > +
> > + bdrv_inc_in_flight(bs);
> > + if (!drv || !drv->bdrv_co_zone_append) {
> > + co.ret = -ENOTSUP;
> > + goto out;
> > + }
> > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> > +out:
> > + bdrv_dec_in_flight(bs);
> > + return co.ret;
> > +}
> > +
> > void *qemu_blockalign(BlockDriverState *bs, size_t size)
> > {
> > IO_CODE();
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index 9441536819..df8cc33467 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -325,6 +325,12 @@ static int coroutine_fn
> > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> > }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, int64_t
> > *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags) {
> > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> > +}
> > +
> > static int64_t raw_getlength(BlockDriverState *bs)
> > {
> > int64_t len;
> > @@ -628,6 +634,7 @@ BlockDriver bdrv_raw = {
> > .bdrv_co_pdiscard = &raw_co_pdiscard,
> > .bdrv_co_zone_report = &raw_co_zone_report,
> > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
> > + .bdrv_co_zone_append = &raw_co_zone_append,
> > .bdrv_co_block_status = &raw_co_block_status,
> > .bdrv_co_copy_range_from = &raw_co_copy_range_from,
> > .bdrv_co_copy_range_to = &raw_co_copy_range_to,
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index 36bd0e480e..a55e9b390e 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -54,6 +54,7 @@ typedef enum BlockZoneOp {
> > BLK_ZO_CLOSE,
> > BLK_ZO_FINISH,
> > BLK_ZO_RESET,
> > + BLK_ZO_RESET_ALL,
> > } BlockZoneOp;
> >
> > typedef enum BlockZoneModel {
> > @@ -84,6 +85,7 @@ typedef enum BlockZoneType {
> > * Provides information on a zone with all position and size values in
> > bytes.
> > */
> > typedef struct BlockZoneDescriptor {
> > + QemuMutex lock;
> > uint64_t start;
> > uint64_t length;
> > uint64_t cap;
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index 65463b88d9..a792164018 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState
> > *bs, int64_t offset,
> > BlockZoneDescriptor *zones);
> > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > int64_t offset, int64_t len);
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> > diff --git a/include/block/block_int-common.h
> > b/include/block/block_int-common.h
> > index 043aa161a0..ebda5953dc 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -701,6 +701,9 @@ struct BlockDriver {
> > BlockZoneDescriptor *zones);
> > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs,
> > BlockZoneOp op,
> > int64_t offset, int64_t len);
> > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> > + int64_t *offset, QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > /* removable device specific */
> > bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > @@ -854,6 +857,12 @@ typedef struct BlockLimits {
> >
> > /* maximum number of active zones */
> > int64_t max_active_zones;
> > +
> > + /* array of zones in the zoned block device. Only tracks write
> > pointer's
> > + * location of each zone as a helper for zone_append API */
> > + BlockZoneDescriptor *zones;
>
> This is a lot of memory for only tracking the wp... Why not reduce this to an
> array of int64 values, for the wp only ? As you may need the zone type too
> (conventional vs sequential), you can use the most significant bit (a zone wp
> value will never use all 64 bits !).
>
> Or device another zone structure with zone type, zone wp and mutex only, so
> smaller than the regular zone report structure.
I was just trying to reuse do_zone_report. It is better to only track
the wp only. Then a new struct and smaller zone_report should be
introduced for it.
>
> > +
> > + int64_t logical_block_size;
> > } BlockLimits;
> >
> > typedef struct BdrvOpBlocker BdrvOpBlocker;
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 3d26929cdd..f13cc1887b 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -31,6 +31,7 @@
> > #define QEMU_AIO_TRUNCATE 0x0080
> > #define QEMU_AIO_ZONE_REPORT 0x0100
> > #define QEMU_AIO_ZONE_MGMT 0x0200
> > +#define QEMU_AIO_ZONE_APPEND 0x0400
> > #define QEMU_AIO_TYPE_MASK \
> > (QEMU_AIO_READ | \
> > QEMU_AIO_WRITE | \
> > @@ -41,7 +42,8 @@
> > QEMU_AIO_COPY_RANGE | \
> > QEMU_AIO_TRUNCATE | \
> > QEMU_AIO_ZONE_REPORT | \
> > - QEMU_AIO_ZONE_MGMT)
> > + QEMU_AIO_ZONE_MGMT | \
> > + QEMU_AIO_ZONE_APPEND)
> >
> > /* AIO flags */
> > #define QEMU_AIO_MISALIGNED 0x1000
> > diff --git a/include/sysemu/block-backend-io.h
> > b/include/sysemu/block-backend-io.h
> > index 6835525582..33e35ae5d7 100644
> > --- a/include/sysemu/block-backend-io.h
> > +++ b/include/sysemu/block-backend-io.h
> > @@ -51,6 +51,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk,
> > int64_t offset,
> > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > int64_t offset, int64_t len,
> > BlockCompletionFunc *cb, void *opaque);
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags,
> > + BlockCompletionFunc *cb, void *opaque);
> > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t
> > bytes,
> > BlockCompletionFunc *cb, void *opaque);
> > void blk_aio_cancel_async(BlockAIOCB *acb);
> > @@ -172,6 +175,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> > BlockZoneOp op,
> > int64_t offset, int64_t len);
> > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > int64_t offset, int64_t len);
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t
> > *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
> > int64_t bytes);
> > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> > index 446a059603..8868c88290 100644
> > --- a/qemu-io-cmds.c
> > +++ b/qemu-io-cmds.c
> > @@ -1850,6 +1850,67 @@ static const cmdinfo_t zone_reset_cmd = {
> > .oneline = "reset a zone write pointer in zone block device",
> > };
> >
> > +static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
> > + int64_t *offset, int flags, int *total)
> > +{
> > + int async_ret = NOT_DONE;
> > +
> > + blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
> > + while (async_ret == NOT_DONE) {
> > + main_loop_wait(false);
> > + }
> > +
> > + *total = qiov->size;
> > + return async_ret < 0 ? async_ret : 1;
> > +}
> > +static int zone_append_f(BlockBackend *blk, int argc, char **argv) {
> > + int ret;
> > +// struct timespec t1, t2;
> > + int flags = 0;
> > + int total = 0;
> > + int64_t offset;
> > + char *buf;
> > + int nr_iov;
> > + int pattern = 0xcd;
> > + QEMUIOVector qiov;
> > +
> > + if (optind > argc - 2) {
> > + return -EINVAL;
> > + }
> > + optind++;
> > + offset = cvtnum(argv[optind]);
> > + if (offset < 0) {
> > + print_cvtnum_err(offset, argv[optind]);
> > + return offset;
> > + }
> > + optind++;
> > + nr_iov = argc - optind;
> > + buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
> > + if (buf == NULL) {
> > + return -EINVAL;
> > + }
> > + ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
> > + if (ret < 0) {
> > + printf("zone append failed: %s\n", strerror(-ret));
> > + goto out;
> > + }
> > +
> > + out:
> > + qemu_iovec_destroy(&qiov);
> > + qemu_io_free(buf);
> > + return ret;
> > +}
> > +
> > +static const cmdinfo_t zone_append_cmd = {
> > + .name = "zone_append",
> > + .altname = "zap",
> > + .cfunc = zone_append_f,
> > + .argmin = 3,
> > + .argmax = 3,
> > + .args = "offset len [len..]",
> > + .oneline = "append write a number of bytes at a specified offset",
> > +};
> > +
> > static int truncate_f(BlockBackend *blk, int argc, char **argv);
> > static const cmdinfo_t truncate_cmd = {
> > .name = "truncate",
> > @@ -2647,6 +2708,7 @@ static void __attribute((constructor))
> > init_qemuio_commands(void)
> > qemuio_add_command(&zone_close_cmd);
> > qemuio_add_command(&zone_finish_cmd);
> > qemuio_add_command(&zone_reset_cmd);
> > + qemuio_add_command(&zone_append_cmd);
> > qemuio_add_command(&truncate_cmd);
> > qemuio_add_command(&length_cmd);
> > qemuio_add_command(&info_cmd);
> > diff --git a/tests/qemu-iotests/tests/zoned.out
> > b/tests/qemu-iotests/tests/zoned.out
> > index 0c8f96deb9..b3b139b4ec 100644
> > --- a/tests/qemu-iotests/tests/zoned.out
> > +++ b/tests/qemu-iotests/tests/zoned.out
> > @@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000,
> > zcond:14, [type: 2]
> > (5) resetting the second zone
> > After resetting a zone:
> > start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
> > +
> > +
> > +(6) append write
> > +After appending the first zone:
> > +start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
> > +After appending the second zone:
> > +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
> > *** done
> > diff --git a/tests/qemu-iotests/tests/zoned.sh
> > b/tests/qemu-iotests/tests/zoned.sh
> > index 871f47efed..b4dc89aaac 100755
> > --- a/tests/qemu-iotests/tests/zoned.sh
> > +++ b/tests/qemu-iotests/tests/zoned.sh
> > @@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
> > sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
> > echo "After resetting a zone:"
> > sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> > +echo
> > +echo
> > +echo "(6) append write" # assuming block size of device is 4096
> > +sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
> > +echo "After appending the first zone:"
> > +sudo $QEMU_IO $IMG -c "zrp 0 1"
> > +sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
> > +echo "After appending the second zone:"
> > +sudo $QEMU_IO $IMG -c "zrp 268435456 1"
> > # success, all done
> > echo "*** done"
> > rm -f $seq.full
>
> --
> Damien Le Moal
> Western Digital Research
>