qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v10 3/7] block: add block layer APIs resembling Linux ZonedBl


From: Damien Le Moal
Subject: Re: [PATCH v10 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
Date: Fri, 7 Oct 2022 08:03:54 +0900
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Thunderbird/102.3.1

On 10/7/22 00:17, Stefan Hajnoczi wrote:
> On Tue, Oct 04, 2022 at 08:23:15AM +0900, Damien Le Moal wrote:
>> On 2022/10/04 2:47, Stefan Hajnoczi wrote:
>>> On Thu, Sep 29, 2022 at 04:36:27PM +0800, Sam Li wrote:
>>>> Add a new zoned_host_device BlockDriver. The zoned_host_device option
>>>> accepts only zoned host block devices. By adding zone management
>>>> operations in this new BlockDriver, users can use the new block
>>>> layer APIs including Report Zone and four zone management operations
>>>> (open, close, finish, reset).
>>>>
>>>> Qemu-io uses the new APIs to perform zoned storage commands of the device:
>>>> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
>>>> zone_finish(zf).
>>>>
>>>> For example, to test zone_report, use following command:
>>>> $ ./build/qemu-io --image-opts -n driver=zoned_host_device, 
>>>> filename=/dev/nullb0
>>>> -c "zrp offset nr_zones"
>>>>
>>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>>>> ---
>>>>  block/block-backend.c             | 146 +++++++++++++
>>>>  block/file-posix.c                | 340 +++++++++++++++++++++++++++++-
>>>>  block/io.c                        |  41 ++++
>>>>  include/block/block-common.h      |   4 +
>>>>  include/block/block-io.h          |   7 +
>>>>  include/block/block_int-common.h  |  24 +++
>>>>  include/block/raw-aio.h           |   6 +-
>>>>  include/sysemu/block-backend-io.h |  17 ++
>>>>  meson.build                       |   4 +
>>>>  qapi/block-core.json              |   8 +-
>>>>  qemu-io-cmds.c                    | 148 +++++++++++++
>>>>  11 files changed, 741 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>>> index d4a5df2ac2..f7f7acd6f4 100644
>>>> --- a/block/block-backend.c
>>>> +++ b/block/block-backend.c
>>>> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>>>>      void *iobuf;
>>>>      int ret;
>>>>      BdrvRequestFlags flags;
>>>> +    union {
>>>> +        struct {
>>>> +            unsigned int *nr_zones;
>>>> +            BlockZoneDescriptor *zones;
>>>> +        } zone_report;
>>>> +        struct {
>>>> +            BlockZoneOp op;
>>>> +        } zone_mgmt;
>>>> +    };
>>>>  } BlkRwCo;
>>>>  
>>>>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
>>>> @@ -1775,6 +1784,143 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>>>>      return ret;
>>>>  }
>>>>  
>>>> +static void blk_aio_zone_report_entry(void *opaque) {
>>>
>>>
>>> The coroutine_fn annotation is missing:
>>>
>>>   static void coroutine_fn blk_aio_zone_report_entry(void *opaque) {
>>>
>>>> +    BlkAioEmAIOCB *acb = opaque;
>>>> +    BlkRwCo *rwco = &acb->rwco;
>>>> +
>>>> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
>>>> +                                   rwco->zone_report.nr_zones,
>>>> +                                   rwco->zone_report.zones);
>>>> +    blk_aio_complete(acb);
>>>> +}
>>>> +
>>>> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>>>> +                                unsigned int *nr_zones,
>>>> +                                BlockZoneDescriptor  *zones,
>>>> +                                BlockCompletionFunc *cb, void *opaque)
>>>> +{
>>>> +    BlkAioEmAIOCB *acb;
>>>> +    Coroutine *co;
>>>> +    IO_CODE();
>>>> +
>>>> +    blk_inc_in_flight(blk);
>>>> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>>> +    acb->rwco = (BlkRwCo) {
>>>> +        .blk    = blk,
>>>> +        .offset = offset,
>>>> +        .ret    = NOT_DONE,
>>>> +        .zone_report = {
>>>> +            .zones = zones,
>>>> +            .nr_zones = nr_zones,
>>>> +        },
>>>> +    };
>>>> +    acb->has_returned = false;
>>>> +
>>>> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
>>>> +    bdrv_coroutine_enter(blk_bs(blk), co);
>>>> +
>>>> +    acb->has_returned = true;
>>>> +    if (acb->rwco.ret != NOT_DONE) {
>>>> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>>> +                                         blk_aio_complete_bh, acb);
>>>> +    }
>>>> +
>>>> +    return &acb->common;
>>>> +}
>>>> +
>>>> +static void blk_aio_zone_mgmt_entry(void *opaque) {
>>>
>>> coroutine_fn is missing here.
>>>
>>>> +    BlkAioEmAIOCB *acb = opaque;
>>>> +    BlkRwCo *rwco = &acb->rwco;
>>>> +
>>>> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
>>>> +                                 rwco->offset, acb->bytes);
>>>> +    blk_aio_complete(acb);
>>>> +}
>>>> +
>>>> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>> +                              int64_t offset, int64_t len,
>>>> +                              BlockCompletionFunc *cb, void *opaque) {
>>>> +    BlkAioEmAIOCB *acb;
>>>> +    Coroutine *co;
>>>> +    IO_CODE();
>>>> +
>>>> +    blk_inc_in_flight(blk);
>>>> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>>>> +    acb->rwco = (BlkRwCo) {
>>>> +        .blk    = blk,
>>>> +        .offset = offset,
>>>> +        .ret    = NOT_DONE,
>>>> +        .zone_mgmt = {
>>>> +            .op = op,
>>>> +        },
>>>> +    };
>>>> +    acb->bytes = len;
>>>> +    acb->has_returned = false;
>>>> +
>>>> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
>>>> +    bdrv_coroutine_enter(blk_bs(blk), co);
>>>> +
>>>> +    acb->has_returned = true;
>>>> +    if (acb->rwco.ret != NOT_DONE) {
>>>> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>>>> +                                         blk_aio_complete_bh, acb);
>>>> +    }
>>>> +
>>>> +    return &acb->common;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Send a zone_report command.
>>>> + * offset is a byte offset from the start of the device. No alignment
>>>> + * required for offset.
>>>> + * nr_zones represents IN maximum and OUT actual.
>>>> + */
>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>>> +                                    unsigned int *nr_zones,
>>>> +                                    BlockZoneDescriptor *zones)
>>>> +{
>>>> +    int ret;
>>>> +    IO_CODE();
>>>> +
>>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>>> +    blk_wait_while_drained(blk);
>>>> +    if (!blk_is_available(blk)) {
>>>> +        blk_dec_in_flight(blk);
>>>> +        return -ENOMEDIUM;
>>>> +    }
>>>> +    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
>>>> +    blk_dec_in_flight(blk);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Send a zone_management command.
>>>> + * op is the zone operation;
>>>> + * offset is the byte offset from the start of the zoned device;
>>>> + * len is the maximum number of bytes the command should operate on. It
>>>> + * should be aligned with the device zone size.
>>>> + */
>>>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>>> +        int64_t offset, int64_t len)
>>>> +{
>>>> +    int ret;
>>>> +    IO_CODE();
>>>> +
>>>> +
>>>> +    blk_inc_in_flight(blk);
>>>> +    blk_wait_while_drained(blk);
>>>> +
>>>> +    ret = blk_check_byte_request(blk, offset, len);
>>>> +    if (ret < 0) {
>>>> +        blk_dec_in_flight(blk);
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
>>>> +    blk_dec_in_flight(blk);
>>>> +    return ret;
>>>> +}
>>>> +
>>>>  void blk_drain(BlockBackend *blk)
>>>>  {
>>>>      BlockDriverState *bs = blk_bs(blk);
>>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>>> index 0a8b4b426e..0a6c781201 100644
>>>> --- a/block/file-posix.c
>>>> +++ b/block/file-posix.c
>>>> @@ -67,6 +67,9 @@
>>>>  #include <sys/param.h>
>>>>  #include <sys/syscall.h>
>>>>  #include <sys/vfs.h>
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +#include <linux/blkzoned.h>
>>>> +#endif
>>>>  #include <linux/cdrom.h>
>>>>  #include <linux/fd.h>
>>>>  #include <linux/fs.h>
>>>> @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData {
>>>>              PreallocMode prealloc;
>>>>              Error **errp;
>>>>          } truncate;
>>>> +        struct {
>>>> +            unsigned int *nr_zones;
>>>> +            BlockZoneDescriptor *zones;
>>>> +        } zone_report;
>>>> +        struct {
>>>> +            unsigned long zone_op;
>>>> +            const char *zone_op_name;
>>>> +            bool all;
>>>
>>> Please remove this field if it is unused.
>>>
>>>> +        } zone_mgmt;
>>>>      };
>>>>  } RawPosixAIOData;
>>>>  
>>>> @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
>>>> Error **errp)
>>>>  #endif
>>>>  
>>>>      if (bs->sg || S_ISBLK(st.st_mode)) {
>>>> -        int ret = hdev_get_max_hw_transfer(s->fd, &st);
>>>> +        ret = hdev_get_max_hw_transfer(s->fd, &st);
>>>>  
>>>>          if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>>>>              bs->bl.max_hw_transfer = ret;
>>>> @@ -1356,6 +1368,41 @@ static void raw_refresh_limits(BlockDriverState 
>>>> *bs, Error **errp)
>>>>          zoned = BLK_Z_NONE;
>>>>      }
>>>>      bs->bl.zoned = zoned;
>>>> +    if (zoned != BLK_Z_NONE) {
>>>> +        ret = get_sysfs_long_val(&st, "chunk_sectors");
>>>> +        if (ret <= 0) {
>>>> +            error_report("Invalid zone size %" PRId32 " sectors ", ret);
>>>> +            bs->bl.zoned = BLK_Z_NONE;
>>>> +            return;
>>>> +        }
>>>> +        bs->bl.zone_size = ret * 512;
>>>> +
>>>> +        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
>>>> +        if (ret > 0) {
>>>> +            bs->bl.max_append_sectors = ret / 512;
>>>> +        }
>>>> +
>>>> +        ret = get_sysfs_long_val(&st, "max_open_zones");
>>>> +        if (ret >= 0) {
>>>> +            bs->bl.max_open_zones = ret;
>>>> +        }
>>>> +
>>>> +        ret = get_sysfs_long_val(&st, "max_active_zones");
>>>> +        if (ret >= 0) {
>>>> +            bs->bl.max_active_zones = ret;
>>>> +        }
>>>> +        
>>>> +        ret = get_sysfs_long_val(&st, "nr_zones");
>>>> +        if (ret >= 0) {
>>>> +            bs->bl.nr_zones = ret;
>>>> +        }
>>>> +
>>>> +        ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity);
>>>> +        if (ret != 0) {
>>>> +            error_report("Invalid device capacity %" PRId64 " bytes ", 
>>>> bs->bl.capacity);
>>>> +            return;
>>>> +        }
>>>
>>> The QEMU block layer already knows the capacity of the device. Can
>>> bdrv_getlength() be used instead of introducing a new
>>> BlockLimits.capacity field?
>>>
>>>> +    }
>>>>  }
>>>>  
>>>>  static int check_for_dasd(int fd)
>>>> @@ -1850,6 +1897,147 @@ static off_t copy_file_range(int in_fd, off_t 
>>>> *in_off, int out_fd,
>>>>  }
>>>>  #endif
>>>>  
>>>> +/*
>>>> + * parse_zone - Fill a zone descriptor
>>>> + */
>>>> +#if defined(CONFIG_BLKZONED)
>>>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>>>> +                              const struct blk_zone *blkz,
>>>> +                              const struct blk_zone_report *rep) {
>>>> +    zone->start = blkz->start << BDRV_SECTOR_BITS;
>>>> +    zone->length = blkz->len << BDRV_SECTOR_BITS;
>>>> +    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
>>>> +    
>>>> +    if (rep->flags & BLK_ZONE_REP_CAPACITY) {
>>>> +        zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>>>
>>> #ifdef HAVE_BLK_ZONE_REP_CAPACITY is needed since the rep->flags and
>>> blkz->capacity fields are missing and would cause a compilation error
>>> when HAVE_BLK_ZONE_REP_CAPACITY is not defined:
>>>
>>>   zone->cap = blkz->len << BDRV_SECTOR_BITS;
>>>   #ifdef HAVE_BLK_ZONE_REP_CAPACITY
>>>   /* Replace with the dedicated field on newer kernels */
>>>   if (rep->flags & BLK_ZONE_REP_CAPACITY) {
>>>       zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>>>   }
>>>   #endif
>>
>> It would be a lot cleaner to do something like this:
>>
>> in the block common header file, add:
>>
>> #ifdef HAVE_BLK_ZONE_REP_CAPACITY
>>
>> #define BLK_ZONE_REP_CAPACITY   (1 << 0)
>>
>> struct blk_zone_v2 {
>>         __u64   start;          /* Zone start sector */
>>         __u64   len;            /* Zone length in number of sectors */
>>         __u64   wp;             /* Zone write pointer position */
>>         __u8    type;           /* Zone type */
>>         __u8    cond;           /* Zone condition */
>>         __u8    non_seq;        /* Non-sequential write resources active */
>>         __u8    reset;          /* Reset write pointer recommended */
>>         __u8    resv[4];
>>         __u64   capacity;       /* Zone capacity in number of sectors */
>>         __u8    reserved[24];
>> };
>> #define blk_zone blk_zone_v2
>>
>> struct blk_zone_report_v2 {
>>         __u64   sector;
>>         __u32   nr_zones;
>>         __u32   flags;
>>      struct blk_zone zones[0];
>> };
>> #define blk_zone_report blk_zone_report_v2
>>
>> #endif
>>
>> Then the above code becomes:
>>
>> if (rep->flags & BLK_ZONE_REP_CAPACITY) {
>>     zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>> } else {
>>     zone->cap = blkz->len << BDRV_SECTOR_BITS;
>> }
>>
>> No #ifdef in the C code, only in the header and that compiles and works for 
>> all
>> host kernel versions.
> 
> The ->flags field doesn't exist in old versions of the header. How will
> "if (rep->flags ..." compile on old systems?

The "#define blk_zone_report blk_zone_report_v2" overloads the kernel API
defined struct that does not have the flag with the v2 struct definition
that has the flags field. So code compiled against older kernels can use
the flags and will always see that field as 0. This is how we have libzbd
coded and this compiles & works on all kernels.

> 
> Stefan

-- 
Damien Le Moal
Western Digital Research




reply via email to

[Prev in Thread] Current Thread [Next in Thread]