[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v11 3/7] block: add block layer APIs resembling Linux ZonedBl
From: |
Damien Le Moal |
Subject: |
Re: [PATCH v11 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls |
Date: |
Thu, 13 Oct 2022 14:45:32 +0900 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Thunderbird/102.3.1 |
On 10/13/22 14:33, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 12:41写道:
>>
>> On 10/10/22 11:21, Sam Li wrote:
>>> Add a new zoned_host_device BlockDriver. The zoned_host_device option
>>> accepts only zoned host block devices. By adding zone management
>>> operations in this new BlockDriver, users can use the new block
>>> layer APIs including Report Zone and four zone management operations
>>> (open, close, finish, reset, reset_all).
>>>
>>> Qemu-io uses the new APIs to perform zoned storage commands of the device:
>>> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
>>> zone_finish(zf).
>>>
>>> For example, to test zone_report, use following command:
>>> $ ./build/qemu-io --image-opts -n driver=zoned_host_device,
>>> filename=/dev/nullb0
>>> -c "zrp offset nr_zones"
>>>
>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>>> ---
>>> block/block-backend.c | 146 +++++++++++++
>>> block/file-posix.c | 329 ++++++++++++++++++++++++++++++
>>> block/io.c | 41 ++++
>>> include/block/block-common.h | 1 +
>>> include/block/block-io.h | 7 +
>>> include/block/block_int-common.h | 24 +++
>>> include/block/raw-aio.h | 6 +-
>>> include/sysemu/block-backend-io.h | 17 ++
>>> meson.build | 4 +
>>> qapi/block-core.json | 8 +-
>>> qemu-io-cmds.c | 148 ++++++++++++++
>>> 11 files changed, 728 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index d4a5df2ac2..ddc569e3ac 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>>> void *iobuf;
>>> int ret;
>>> BdrvRequestFlags flags;
>>> + union {
>>> + struct {
>>> + unsigned int *nr_zones;
>>> + BlockZoneDescriptor *zones;
>>> + } zone_report;
>>> + struct {
>>> + BlockZoneOp op;
>>> + } zone_mgmt;
>>> + };
>>> } BlkRwCo;
>>>
>>> int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
>>> @@ -1775,6 +1784,143 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>>> return ret;
>>> }
>>>
>>> +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) {
>>> + BlkAioEmAIOCB *acb = opaque;
>>> + BlkRwCo *rwco = &acb->rwco;
>>> +
>>> + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
>>> + rwco->zone_report.nr_zones,
>>> + rwco->zone_report.zones);
>>> + blk_aio_complete(acb);
>>> +}
>>> +
>>> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones,
>>> + BlockCompletionFunc *cb, void *opaque)
>>> +{
>>> + BlkAioEmAIOCB *acb;
>>> + Coroutine *co;
>>> + IO_CODE();
>>> +
>>> + blk_inc_in_flight(blk);
>>> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>> + acb->rwco = (BlkRwCo) {
>>> + .blk = blk,
>>> + .offset = offset,
>>> + .ret = NOT_DONE,
>>> + .zone_report = {
>>> + .zones = zones,
>>> + .nr_zones = nr_zones,
>>> + },
>>> + };
>>> + acb->has_returned = false;
>>> +
>>> + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
>>> + bdrv_coroutine_enter(blk_bs(blk), co);
>>> +
>>> + acb->has_returned = true;
>>> + if (acb->rwco.ret != NOT_DONE) {
>>> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>> + blk_aio_complete_bh, acb);
>>> + }
>>> +
>>> + return &acb->common;
>>> +}
>>> +
>>> +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) {
>>> + BlkAioEmAIOCB *acb = opaque;
>>> + BlkRwCo *rwco = &acb->rwco;
>>> +
>>> + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
>>> + rwco->offset, acb->bytes);
>>> + blk_aio_complete(acb);
>>> +}
>>> +
>>> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> + int64_t offset, int64_t len,
>>> + BlockCompletionFunc *cb, void *opaque) {
>>> + BlkAioEmAIOCB *acb;
>>> + Coroutine *co;
>>> + IO_CODE();
>>> +
>>> + blk_inc_in_flight(blk);
>>> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>> + acb->rwco = (BlkRwCo) {
>>> + .blk = blk,
>>> + .offset = offset,
>>> + .ret = NOT_DONE,
>>> + .zone_mgmt = {
>>> + .op = op,
>>> + },
>>> + };
>>> + acb->bytes = len;
>>> + acb->has_returned = false;
>>> +
>>> + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
>>> + bdrv_coroutine_enter(blk_bs(blk), co);
>>> +
>>> + acb->has_returned = true;
>>> + if (acb->rwco.ret != NOT_DONE) {
>>> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>> + blk_aio_complete_bh, acb);
>>> + }
>>> +
>>> + return &acb->common;
>>> +}
>>> +
>>> +/*
>>> + * Send a zone_report command.
>>> + * offset is a byte offset from the start of the device. No alignment
>>> + * required for offset.
>>> + * nr_zones represents IN maximum and OUT actual.
>>> + */
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones)
>>> +{
>>> + int ret;
>>> + IO_CODE();
>>> +
>>> + blk_inc_in_flight(blk); /* increase before waiting */
>>> + blk_wait_while_drained(blk);
>>> + if (!blk_is_available(blk)) {
>>> + blk_dec_in_flight(blk);
>>> + return -ENOMEDIUM;
>>> + }
>>> + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
>>> + blk_dec_in_flight(blk);
>>> + return ret;
>>> +}
>>> +
>>> +/*
>>> + * Send a zone_management command.
>>> + * op is the zone operation;
>>> + * offset is the byte offset from the start of the zoned device;
>>> + * len is the maximum number of bytes the command should operate on. It
>>> + * should be aligned with the device zone size.
>>> + */
>>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> + int64_t offset, int64_t len)
>>> +{
>>> + int ret;
>>> + IO_CODE();
>>> +
>>> +
>>> + blk_inc_in_flight(blk);
>>> + blk_wait_while_drained(blk);
>>> +
>>> + ret = blk_check_byte_request(blk, offset, len);
>>> + if (ret < 0) {
>>> + blk_dec_in_flight(blk);
>>> + return ret;
>>> + }
>>> +
>>> + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
>>> + blk_dec_in_flight(blk);
>>> + return ret;
>>> +}
>>> +
>>> void blk_drain(BlockBackend *blk)
>>> {
>>> BlockDriverState *bs = blk_bs(blk);
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 0db4b04e8a..226f5d48f5 100644
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -67,6 +67,9 @@
>>> #include <sys/param.h>
>>> #include <sys/syscall.h>
>>> #include <sys/vfs.h>
>>> +#if defined(CONFIG_BLKZONED)
>>> +#include <linux/blkzoned.h>
>>> +#endif
>>> #include <linux/cdrom.h>
>>> #include <linux/fd.h>
>>> #include <linux/fs.h>
>>> @@ -216,6 +219,14 @@ typedef struct RawPosixAIOData {
>>> PreallocMode prealloc;
>>> Error **errp;
>>> } truncate;
>>> + struct {
>>> + unsigned int *nr_zones;
>>> + BlockZoneDescriptor *zones;
>>> + } zone_report;
>>> + struct {
>>> + unsigned long zone_op;
>>
>> Why use unsigned long type here ? Shouldn't this be a BlockZoneOp,
>> similarly to struct BlkRwCo ?
>
> zone_op represents Linux's macros BLK*ZONE. Maybe I'll change op's
> type to unsigned long instead.
>
>>
>>> + const char *zone_op_name;
>>> + } zone_mgmt;
>>> };
>>> } RawPosixAIOData;
>>>
>>> @@ -1356,6 +1367,41 @@ static void raw_refresh_limits(BlockDriverState *bs,
>>> Error **errp)
>>> zoned = BLK_Z_NONE;
>>> }
>>> bs->bl.zoned = zoned;
>>> + if (zoned != BLK_Z_NONE) {
>>> + ret = get_sysfs_long_val(&st, "chunk_sectors");
>>> + if (ret <= 0) {
>>> + error_report("Invalid zone size %" PRId32 " sectors ", ret);
>>> + bs->bl.zoned = BLK_Z_NONE;
>>> + return;
>>> + }
>>> + bs->bl.zone_size = ret * 512;
>>
>> = ret << BDRV_SECTOR_BITS; ?
>
> For power-of-two zone size, yes. I'll change it.
The properties of the zone size are irrelevant here. This is a
conversion between sectors to bytes, regardless of the zone size value.
The convention in qemu block drivers seems to be to use BDRV_SECTOR_BITS
bit shifts instead of hardcoded operations with 512.
>
>>
>>> +
>>> + ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
>>> + if (ret > 0) {
>>> + bs->bl.max_append_sectors = ret / 512;
>>
>> = ret >> BDRV_SECTOR_BITS; ?
>
> Same as above.
Same comment :)
>
>>
>>> + }
>>> +
>>> + ret = get_sysfs_long_val(&st, "max_open_zones");
>>> + if (ret >= 0) {
>>> + bs->bl.max_open_zones = ret;
>>> + }
>>> +
>>> + ret = get_sysfs_long_val(&st, "max_active_zones");
>>> + if (ret >= 0) {
>>> + bs->bl.max_active_zones = ret;
>>> + }
>>> +
>>> + ret = get_sysfs_long_val(&st, "nr_zones");
>>> + if (ret >= 0) {
>>> + bs->bl.nr_zones = ret;
>>> + }
>>
>> If getting this parameter fails, we should do the same as for
>> chunk_sectors: warn about the error and degrade to BLK_Z_NONE model. You
>> should move this parameter handling right after chunk_sectors handling.
>
> So zoned devices(SWR,SWP) must at least set zone_size and nr_zones fields.
Yes, unless the host kernel has a bug. Eventhough that is unlikely,
better to check !
>
>>
>>> +
>>> + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity);
>>> + if (ret != 0) {
>>> + error_report("Invalid device capacity %" PRId64 " bytes ",
>>> bs->bl.capacity);
>>
>> Hmmm. The message is a little off here. The error means that ioctl()
>> failed to get the capacity. It does not explicitly mean that the device
>> capacity is incorrect. So something like "failed to get device capacity"
>> message would be more correct I think. I would also move this parameter
>> handling up, together with chunk_sectors and nr_zones.
>>
>>> + return;
>>> + }
>>> + }
>>> }
>>>
>>> static int check_for_dasd(int fd)
>>> @@ -1850,6 +1896,141 @@ static off_t copy_file_range(int in_fd, off_t
>>> *in_off, int out_fd,
>>> }
>>> #endif
>>>
>>> +/*
>>> + * parse_zone - Fill a zone descriptor
>>> + */
>>> +#if defined(CONFIG_BLKZONED)
>>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>>> + const struct blk_zone *blkz) {
>>> + zone->start = blkz->start << BDRV_SECTOR_BITS;
>>> + zone->length = blkz->len << BDRV_SECTOR_BITS;
>>> + zone->wp = blkz->wp << BDRV_SECTOR_BITS;
>>> +
>>> + zone->cap = blkz->len << BDRV_SECTOR_BITS;
>>
>> This line should go into a #else below.
>>
>>> +#ifdef HAVE_BLK_ZONE_REP_CAPACITY
>>> + zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>>
>> Indentation is off.
>>
>>> +#endif
>>> +
>>> + switch (blkz->type) {
>>> + case BLK_ZONE_TYPE_SEQWRITE_REQ:
>>> + zone->type = BLK_ZT_SWR;
>>> + break;
>>> + case BLK_ZONE_TYPE_SEQWRITE_PREF:
>>> + zone->type = BLK_ZT_SWP;
>>> + break;
>>> + case BLK_ZONE_TYPE_CONVENTIONAL:
>>> + zone->type = BLK_ZT_CONV;
>>> + break;
>>> + default:
>>> + g_assert_not_reached();
>>
>> I really do not like this... If the kernel is changed and a new zone
>> type introduced, then this will fail instead of warning and returning an
>> error for an unknown zone type. So I would really prefer an error
>> message and error return here.
>
> Under the assumption that no other zone type should happen,
> g_assert_not_reached() will indicate the process to abort and
> terminate with a coredump file for debugging. Since the assumption is
> broken, I'll change it and following zone state check back to
> error_report() like:
> + error_report("Invalid zone type: 0x%x", blkz->type);
Yes. That will avoid problems if the host kernel changes.
>
>>
>>> + }
>>> +
>>> + switch (blkz->cond) {
>>> + case BLK_ZONE_COND_NOT_WP:
>>> + zone->cond = BLK_ZS_NOT_WP;
>>> + break;
>>> + case BLK_ZONE_COND_EMPTY:
>>> + zone->cond = BLK_ZS_EMPTY;
>>> + break;
>>> + case BLK_ZONE_COND_IMP_OPEN:
>>> + zone->cond = BLK_ZS_IOPEN;
>>> + break;
>>> + case BLK_ZONE_COND_EXP_OPEN:
>>> + zone->cond = BLK_ZS_EOPEN;
>>> + break;
>>> + case BLK_ZONE_COND_CLOSED:
>>> + zone->cond = BLK_ZS_CLOSED;
>>> + break;
>>> + case BLK_ZONE_COND_READONLY:
>>> + zone->cond = BLK_ZS_RDONLY;
>>> + break;
>>> + case BLK_ZONE_COND_FULL:
>>> + zone->cond = BLK_ZS_FULL;
>>> + break;
>>> + case BLK_ZONE_COND_OFFLINE:
>>> + zone->cond = BLK_ZS_OFFLINE;
>>> + break;
>>> + default:
>>> + g_assert_not_reached();
>>
>> Same comment here.
>>
>>> + }
>>> +}
>>> +#endif
>>> +
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int handle_aiocb_zone_report(void *opaque) {
>>> + RawPosixAIOData *aiocb = opaque;
>>> + int fd = aiocb->aio_fildes;
>>> + unsigned int *nr_zones = aiocb->zone_report.nr_zones;
>>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>> + /* zoned block devices use 512-byte sectors */
>>> + int64_t sector = aiocb->aio_offset / 512;
>>> +
>>> + struct blk_zone *blkz;
>>> + int64_t rep_size;
>>> + unsigned int nrz;
>>> + int ret, n = 0, i = 0;
>>> +
>>> + nrz = *nr_zones;
>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct
>>> blk_zone);
>>> + g_autofree struct blk_zone_report *rep = NULL;
>>> + rep = g_malloc(rep_size);
>>> +
>>> + blkz = (struct blk_zone *)(rep + 1);
>>> + while (n < nrz) {
>>> + memset(rep, 0, rep_size);
>>> + rep->sector = sector;
>>> + rep->nr_zones = nrz - n;
>>> +
>>> + do {
>>> + ret = ioctl(fd, BLKREPORTZONE, rep);
>>> + } while (ret != 0 && errno == EINTR);
>>> + if (ret != 0) {
>>> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed
>>> %d",
>>> + fd, sector, errno);
>>> + return -errno;
>>> + }
>>> +
>>> + if (!rep->nr_zones) {
>>> + break;
>>> + }
>>> +
>>> + for (i = 0; i < rep->nr_zones; i++, n++) {
>>> + parse_zone(&zones[n], &blkz[i]);
>>> + /* The next report should start after the last zone reported */
>>> + sector = blkz[i].start + blkz[i].len;
>>> + }
>>> + }
>>> +
>>> + *nr_zones = n;
>>> + return 0;
>>> +}
>>> +#endif
>>> +
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>> + RawPosixAIOData *aiocb = opaque;
>>> + int fd = aiocb->aio_fildes;
>>> + int64_t sector = aiocb->aio_offset / 512;
>>> + int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>> + struct blk_zone_range range;
>>> + int ret;
>>> +
>>> + /* Execute the operation */
>>> + range.sector = sector;
>>> + range.nr_sectors = nr_sectors;
>>> + do {
>>> + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>>> + } while (ret != 0 && errno == EINTR);
>>> + if (ret != 0) {
>>> + ret = -errno;
>>> + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>>> + ret);
>>> + return ret;
>>> + }
>>> + return ret;
>>
>> This can be "return 0". Or remove the "return ret" inside the "if" above.
>
> Will change to return 0.
>
>>
>>> +}
>>> +#endif
>>> +
>>> static int handle_aiocb_copy_range(void *opaque)
>>> {
>>> RawPosixAIOData *aiocb = opaque;
>>> @@ -3044,6 +3225,103 @@ static void raw_account_discard(BDRVRawState *s,
>>> uint64_t nbytes, int ret)
>>> }
>>> }
>>>
>>> +/*
>>> + * zone report - Get a zone block device's information in the form
>>> + * of an array of zone descriptors.
>>> + * zones is an array of zone descriptors to hold zone information on reply;
>>> + * offset can be any byte within the entire size of the device;
>>> + * nr_zones is the maxium number of sectors the command should operate on.
>>> + */
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
>>> offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones) {
>>> + BDRVRawState *s = bs->opaque;
>>> + RawPosixAIOData acb;
>>> +
>>> + acb = (RawPosixAIOData) {
>>> + .bs = bs,
>>> + .aio_fildes = s->fd,
>>> + .aio_type = QEMU_AIO_ZONE_REPORT,
>>> + .aio_offset = offset,
>>> + .zone_report = {
>>> + .nr_zones = nr_zones,
>>> + .zones = zones,
>>> + },
>>> + };
>>> +
>>> + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
>>> +}
>>> +#endif
>>> +
>>> +/*
>>> + * zone management operations - Execute an operation on a zone
>>> + */
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp
>>> op,
>>> + int64_t offset, int64_t len) {
>>> + BDRVRawState *s = bs->opaque;
>>> + RawPosixAIOData acb;
>>> + int64_t zone_size, zone_size_mask;
>>> + const char *zone_op_name;
>>> + unsigned long zone_op;
>>> +
>>> + zone_size = bs->bl.zone_size;
>>> + zone_size_mask = zone_size - 1;
>>> + if (offset & zone_size_mask) {
>>> + error_report("sector offset %" PRId64 " is not aligned to zone
>>> size "
>>> + "%" PRId64 "", offset / 512, zone_size / 512);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) ||
>>> + offset + len > bs->bl.capacity) {
>>> + error_report("number of sectors %" PRId64 " is not aligned to zone
>>> size"
>>> + " %" PRId64 "", len / 512, zone_size / 512);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + switch (op) {
>>> + case BLK_ZO_OPEN:
>>> + zone_op_name = "BLKOPENZONE";
>>> + zone_op = BLKOPENZONE;
>>> + break;
>>> + case BLK_ZO_CLOSE:
>>> + zone_op_name = "BLKCLOSEZONE";
>>> + zone_op = BLKCLOSEZONE;
>>> + break;
>>> + case BLK_ZO_FINISH:
>>> + zone_op_name = "BLKFINISHZONE";
>>> + zone_op = BLKFINISHZONE;
>>> + break;
>>> + case BLK_ZO_RESET:
>>> + zone_op_name = "BLKRESETZONE";
>>> + zone_op = BLKRESETZONE;
>>> + break;
>>> + case BLK_ZO_RESET_ALL:
>>> + zone_op_name = "BLKRESETZONE";
>>> + zone_op = BLKRESETZONE;
>>> + break;
>>> + default:
>>> + g_assert_not_reached();
>>> + }
>>> +
>>> + acb = (RawPosixAIOData) {
>>> + .bs = bs,
>>> + .aio_fildes = s->fd,
>>> + .aio_type = QEMU_AIO_ZONE_MGMT,
>>> + .aio_offset = offset,
>>> + .aio_nbytes = len,
>>> + .zone_mgmt = {
>>> + .zone_op = zone_op,
>>> + .zone_op_name = zone_op_name,
>>> + },
>>> + };
>>> +
>>> + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>>> +}
>>> +#endif
>>> +
>>> static coroutine_fn int
>>> raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>> bool blkdev)
>>> @@ -3774,6 +4052,54 @@ static BlockDriver bdrv_host_device = {
>>> #endif
>>> };
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static BlockDriver bdrv_zoned_host_device = {
>>> + .format_name = "zoned_host_device",
>>> + .protocol_name = "zoned_host_device",
>>> + .instance_size = sizeof(BDRVRawState),
>>> + .bdrv_needs_filename = true,
>>> + .bdrv_probe_device = hdev_probe_device,
>>> + .bdrv_file_open = hdev_open,
>>> + .bdrv_close = raw_close,
>>> + .bdrv_reopen_prepare = raw_reopen_prepare,
>>> + .bdrv_reopen_commit = raw_reopen_commit,
>>> + .bdrv_reopen_abort = raw_reopen_abort,
>>> + .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>>> + .create_opts = &bdrv_create_opts_simple,
>>> + .mutable_opts = mutable_opts,
>>> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>>> + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>>> +
>>> + .bdrv_co_preadv = raw_co_preadv,
>>> + .bdrv_co_pwritev = raw_co_pwritev,
>>> + .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
>>> + .bdrv_co_pdiscard = hdev_co_pdiscard,
>>> + .bdrv_co_copy_range_from = raw_co_copy_range_from,
>>> + .bdrv_co_copy_range_to = raw_co_copy_range_to,
>>> + .bdrv_refresh_limits = raw_refresh_limits,
>>> + .bdrv_io_plug = raw_aio_plug,
>>> + .bdrv_io_unplug = raw_aio_unplug,
>>> + .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>>> +
>>> + .bdrv_co_truncate = raw_co_truncate,
>>> + .bdrv_getlength = raw_getlength,
>>> + .bdrv_get_info = raw_get_info,
>>> + .bdrv_get_allocated_file_size
>>> + = raw_get_allocated_file_size,
>>> + .bdrv_get_specific_stats = hdev_get_specific_stats,
>>> + .bdrv_check_perm = raw_check_perm,
>>> + .bdrv_set_perm = raw_set_perm,
>>> + .bdrv_abort_perm_update = raw_abort_perm_update,
>>> + .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>>> + .bdrv_probe_geometry = hdev_probe_geometry,
>>> + .bdrv_co_ioctl = hdev_co_ioctl,
>>> +
>>> + /* zone management operations */
>>> + .bdrv_co_zone_report = raw_co_zone_report,
>>> + .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>> +};
>>> +#endif
>>> +
>>> #if defined(__linux__) || defined(__FreeBSD__) ||
>>> defined(__FreeBSD_kernel__)
>>> static void cdrom_parse_filename(const char *filename, QDict *options,
>>> Error **errp)
>>> @@ -4034,6 +4360,9 @@ static void bdrv_file_init(void)
>>> bdrv_register(&bdrv_file);
>>> #if defined(HAVE_HOST_BLOCK_DEVICE)
>>> bdrv_register(&bdrv_host_device);
>>> +#if defined(CONFIG_BLKZONED)
>>> + bdrv_register(&bdrv_zoned_host_device);
>>> +#endif
>>> #ifdef __linux__
>>> bdrv_register(&bdrv_host_cdrom);
>>> #endif
>>> diff --git a/block/io.c b/block/io.c
>>> index c3200bcdff..e5aaa64e17 100644
>>> --- a/block/io.c
>>> +++ b/block/io.c
>>> @@ -3189,6 +3189,47 @@ out:
>>> return co.ret;
>>> }
>>>
>>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones)
>>> +{
>>> + BlockDriver *drv = bs->drv;
>>> + CoroutineIOCompletion co = {
>>> + .coroutine = qemu_coroutine_self(),
>>> + };
>>> + IO_CODE();
>>> +
>>> + bdrv_inc_in_flight(bs);
>>> + if (!drv || !drv->bdrv_co_zone_report) {
>>> + co.ret = -ENOTSUP;
>>> + goto out;
>>> + }
>>> + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
>>> +out:
>>> + bdrv_dec_in_flight(bs);
>>> + return co.ret;
>>> +}
>>> +
>>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>> + int64_t offset, int64_t len)
>>> +{
>>> + BlockDriver *drv = bs->drv;
>>> + CoroutineIOCompletion co = {
>>> + .coroutine = qemu_coroutine_self(),
>>> + };
>>> + IO_CODE();
>>> +
>>> + bdrv_inc_in_flight(bs);
>>> + if (!drv || !drv->bdrv_co_zone_mgmt) {
>>> + co.ret = -ENOTSUP;
>>> + goto out;
>>> + }
>>> + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
>>> +out:
>>> + bdrv_dec_in_flight(bs);
>>> + return co.ret;
>>> +}
>>> +
>>> void *qemu_blockalign(BlockDriverState *bs, size_t size)
>>> {
>>> IO_CODE();
>>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>>> index 36bd0e480e..882de6825e 100644
>>> --- a/include/block/block-common.h
>>> +++ b/include/block/block-common.h
>>> @@ -54,6 +54,7 @@ typedef enum BlockZoneOp {
>>> BLK_ZO_CLOSE,
>>> BLK_ZO_FINISH,
>>> BLK_ZO_RESET,
>>> + BLK_ZO_RESET_ALL,
>>
>> Shouldn't this be done in patch 1 ?
>
> Yes, I'll move it.
>
>>
>>> } BlockZoneOp;
>>>
>>> typedef enum BlockZoneModel {
>>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>>> index 492f95fc05..f0cdf67d33 100644
>>> --- a/include/block/block-io.h
>>> +++ b/include/block/block-io.h
>>> @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void
>>> *buf);
>>> /* Ensure contents are flushed to disk. */
>>> int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>>>
>>> +/* Report zone information of zone block device. */
>>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones);
>>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>> + int64_t offset, int64_t len);
>>> +
>>> int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>>> bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>>> int bdrv_block_status(BlockDriverState *bs, int64_t offset,
>>> diff --git a/include/block/block_int-common.h
>>> b/include/block/block_int-common.h
>>> index 7f7863cc9e..cdc06e77a6 100644
>>> --- a/include/block/block_int-common.h
>>> +++ b/include/block/block_int-common.h
>>> @@ -691,6 +691,12 @@ struct BlockDriver {
>>> QEMUIOVector *qiov,
>>> int64_t pos);
>>>
>>> + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
>>> + int64_t offset, unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones);
>>> + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs,
>>> BlockZoneOp op,
>>> + int64_t offset, int64_t len);
>>> +
>>> /* removable device specific */
>>> bool (*bdrv_is_inserted)(BlockDriverState *bs);
>>> void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>>> @@ -828,6 +834,24 @@ typedef struct BlockLimits {
>>>
>>> /* device zone model */
>>> BlockZoneModel zoned;
>>> +
>>> + /* zone size expressed in bytes */
>>> + uint32_t zone_size;
>>> +
>>> + /* total number of zones */
>>> + unsigned int nr_zones;
>>> +
>>> + /* maximum sectors of a zone append write operation */
>>> + int64_t max_append_sectors;
>>> +
>>> + /* maximum number of open zones */
>>> + int64_t max_open_zones;
>>> +
>>> + /* maximum number of active zones */
>>> + int64_t max_active_zones;
>>> +
>>> + /* device capacity expressed in bytes */
>>> + int64_t capacity;
>>> } BlockLimits;
>>>
>>> typedef struct BdrvOpBlocker BdrvOpBlocker;
>>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>>> index 21fc10c4c9..3d26929cdd 100644
>>> --- a/include/block/raw-aio.h
>>> +++ b/include/block/raw-aio.h
>>> @@ -29,6 +29,8 @@
>>> #define QEMU_AIO_WRITE_ZEROES 0x0020
>>> #define QEMU_AIO_COPY_RANGE 0x0040
>>> #define QEMU_AIO_TRUNCATE 0x0080
>>> +#define QEMU_AIO_ZONE_REPORT 0x0100
>>> +#define QEMU_AIO_ZONE_MGMT 0x0200
>>> #define QEMU_AIO_TYPE_MASK \
>>> (QEMU_AIO_READ | \
>>> QEMU_AIO_WRITE | \
>>> @@ -37,7 +39,9 @@
>>> QEMU_AIO_DISCARD | \
>>> QEMU_AIO_WRITE_ZEROES | \
>>> QEMU_AIO_COPY_RANGE | \
>>> - QEMU_AIO_TRUNCATE)
>>> + QEMU_AIO_TRUNCATE | \
>>
>> extra space before "|"
>
> Sorry :-(
>
>>
>>> + QEMU_AIO_ZONE_REPORT | \
>>> + QEMU_AIO_ZONE_MGMT)
>>>
>>> /* AIO flags */
>>> #define QEMU_AIO_MISALIGNED 0x1000
>>> diff --git a/include/sysemu/block-backend-io.h
>>> b/include/sysemu/block-backend-io.h
>>> index 50f5aa2e07..6835525582 100644
>>> --- a/include/sysemu/block-backend-io.h
>>> +++ b/include/sysemu/block-backend-io.h
>>> @@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t
>>> offset,
>>> BlockCompletionFunc *cb, void *opaque);
>>> BlockAIOCB *blk_aio_flush(BlockBackend *blk,
>>> BlockCompletionFunc *cb, void *opaque);
>>> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>>> + unsigned int *nr_zones,
>>> BlockZoneDescriptor *zones,
>>> + BlockCompletionFunc *cb, void *opaque);
>>> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> + int64_t offset, int64_t len,
>>> + BlockCompletionFunc *cb, void *opaque);
>>> BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t
>>> bytes,
>>> BlockCompletionFunc *cb, void *opaque);
>>> void blk_aio_cancel_async(BlockAIOCB *acb);
>>> @@ -156,6 +162,17 @@ int generated_co_wrapper
>>> blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>>> int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>>> int64_t bytes, BdrvRequestFlags
>>> flags);
>>>
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones);
>>> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
>>> + unsigned int *nr_zones,
>>> + BlockZoneDescriptor *zones);
>>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> + int64_t offset, int64_t len);
>>> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> + int64_t offset, int64_t len);
>>> +
>>> int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>>> int64_t bytes);
>>> int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
>>> diff --git a/meson.build b/meson.build
>>> index 44c1f92697..0aa99b64a0 100644
>>> --- a/meson.build
>>> +++ b/meson.build
>>> @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION',
>>> get_option('replication').allowed())
>>> # has_header
>>> config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
>>> config_host_data.set('CONFIG_LINUX_MAGIC_H',
>>> cc.has_header('linux/magic.h'))
>>> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
>>> config_host_data.set('CONFIG_VALGRIND_H',
>>> cc.has_header('valgrind/valgrind.h'))
>>> config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
>>> config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
>>> @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
>>> config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
>>> cc.has_member('struct stat', 'st_atim',
>>> prefix: '#include <sys/stat.h>'))
>>> +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
>>> + cc.has_member('struct blk_zone', 'capacity',
>>> + prefix: '#include <linux/blkzoned.h>'))
>>>
>>> # has_type
>>> config_host_data.set('CONFIG_IOVEC',
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index f21fa235f2..ee87c1df8a 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -2942,6 +2942,7 @@
>>> # @compress: Since 5.0
>>> # @copy-before-write: Since 6.2
>>> # @snapshot-access: Since 7.0
>>> +# @zoned_host_device: Since 7.2
>>> #
>>> # Since: 2.9
>>> ##
>>> @@ -2955,7 +2956,8 @@
>>> 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme',
>>> 'parallels',
>>> 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>>> { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
>>> - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
>>> + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
>>> + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>>>
>>> ##
>>> # @BlockdevOptionsFile:
>>> @@ -4329,7 +4331,9 @@
>>> 'vhdx': 'BlockdevOptionsGenericFormat',
>>> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
>>> 'vpc': 'BlockdevOptionsGenericFormat',
>>> - 'vvfat': 'BlockdevOptionsVVFAT'
>>> + 'vvfat': 'BlockdevOptionsVVFAT',
>>> + 'zoned_host_device': { 'type': 'BlockdevOptionsFile',
>>> + 'if': 'CONFIG_BLKZONED' }
>>> } }
>>>
>>> ##
>>> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
>>> index 952dc940f1..e56c8d1c30 100644
>>> --- a/qemu-io-cmds.c
>>> +++ b/qemu-io-cmds.c
>>> @@ -1712,6 +1712,149 @@ static const cmdinfo_t flush_cmd = {
>>> .oneline = "flush all in-core file state to disk",
>>> };
>>>
>>> +static inline int64_t tosector(int64_t bytes) {
>>> + return bytes >> BDRV_SECTOR_BITS;
>>> +}
>>> +
>>> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> + int ret;
>>> + int64_t offset;
>>> + unsigned int nr_zones;
>>> +
>>> + ++optind;
>>> + offset = cvtnum(argv[optind]);
>>> + ++optind;
>>> + nr_zones = cvtnum(argv[optind]);
>>> +
>>> + g_autofree BlockZoneDescriptor *zones = NULL;
>>> + zones = g_new(BlockZoneDescriptor, nr_zones);
>>> + ret = blk_zone_report(blk, offset, &nr_zones, zones);
>>> + if (ret < 0) {
>>> + printf("zone report failed: %s\n", strerror(-ret));
>>> + } else {
>>> + for (int i = 0; i < nr_zones; ++i) {
>>> + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
>>> + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
>>> + "zcond:%u, [type: %u]\n",
>>> + tosector(zones[i].start), tosector(zones[i].length),
>>> + tosector(zones[i].cap), tosector(zones[i].wp),
>>> + zones[i].cond, zones[i].type);
>>> + }
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_report_cmd = {
>>> + .name = "zone_report",
>>> + .altname = "zrp",
>>> + .cfunc = zone_report_f,
>>> + .argmin = 2,
>>> + .argmax = 2,
>>> + .args = "offset number",
>>> + .oneline = "report zone information",
>>> +};
>>> +
>>> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> + int ret;
>>> + int64_t offset, len;
>>> + ++optind;
>>> + offset = cvtnum(argv[optind]);
>>> + ++optind;
>>> + len = cvtnum(argv[optind]);
>>> + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
>>> + if (ret < 0) {
>>> + printf("zone open failed: %s\n", strerror(-ret));
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_open_cmd = {
>>> + .name = "zone_open",
>>> + .altname = "zo",
>>> + .cfunc = zone_open_f,
>>> + .argmin = 2,
>>> + .argmax = 2,
>>> + .args = "offset len",
>>> + .oneline = "explicit open a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> + int ret;
>>> + int64_t offset, len;
>>> + ++optind;
>>> + offset = cvtnum(argv[optind]);
>>> + ++optind;
>>> + len = cvtnum(argv[optind]);
>>> + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
>>> + if (ret < 0) {
>>> + printf("zone close failed: %s\n", strerror(-ret));
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_close_cmd = {
>>> + .name = "zone_close",
>>> + .altname = "zc",
>>> + .cfunc = zone_close_f,
>>> + .argmin = 2,
>>> + .argmax = 2,
>>> + .args = "offset len",
>>> + .oneline = "close a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> + int ret;
>>> + int64_t offset, len;
>>> + ++optind;
>>> + offset = cvtnum(argv[optind]);
>>> + ++optind;
>>> + len = cvtnum(argv[optind]);
>>> + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
>>> + if (ret < 0) {
>>> + printf("zone finish failed: %s\n", strerror(-ret));
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_finish_cmd = {
>>> + .name = "zone_finish",
>>> + .altname = "zf",
>>> + .cfunc = zone_finish_f,
>>> + .argmin = 2,
>>> + .argmax = 2,
>>> + .args = "offset len",
>>> + .oneline = "finish a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> + int ret;
>>> + int64_t offset, len;
>>> + ++optind;
>>> + offset = cvtnum(argv[optind]);
>>> + ++optind;
>>> + len = cvtnum(argv[optind]);
>>> + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
>>> + if (ret < 0) {
>>> + printf("zone reset failed: %s\n", strerror(-ret));
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_reset_cmd = {
>>> + .name = "zone_reset",
>>> + .altname = "zrs",
>>> + .cfunc = zone_reset_f,
>>> + .argmin = 2,
>>> + .argmax = 2,
>>> + .args = "offset len",
>>> + .oneline = "reset a zone write pointer in zone block device",
>>> +};
>>> +
>>> static int truncate_f(BlockBackend *blk, int argc, char **argv);
>>> static const cmdinfo_t truncate_cmd = {
>>> .name = "truncate",
>>> @@ -2504,6 +2647,11 @@ static void __attribute((constructor))
>>> init_qemuio_commands(void)
>>> qemuio_add_command(&aio_write_cmd);
>>> qemuio_add_command(&aio_flush_cmd);
>>> qemuio_add_command(&flush_cmd);
>>> + qemuio_add_command(&zone_report_cmd);
>>> + qemuio_add_command(&zone_open_cmd);
>>> + qemuio_add_command(&zone_close_cmd);
>>> + qemuio_add_command(&zone_finish_cmd);
>>> + qemuio_add_command(&zone_reset_cmd);
>>> qemuio_add_command(&truncate_cmd);
>>> qemuio_add_command(&length_cmd);
>>> qemuio_add_command(&info_cmd);
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>
--
Damien Le Moal
Western Digital Research
[PATCH v11 4/7] raw-format: add zone operations to pass through requests, Sam Li, 2022/10/09
[PATCH v11 5/7] config: add check to block layer, Sam Li, 2022/10/09
[PATCH v11 6/7] qemu-iotests: test new zone operations, Sam Li, 2022/10/09
[PATCH v11 7/7] docs/zoned-storage: add zoned device documentation, Sam Li, 2022/10/09