qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC v4 1/9] block: add block layer APIs resembling Linux ZonedBlock


From: Sam Li
Subject: Re: [RFC v4 1/9] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
Date: Wed, 13 Jul 2022 08:51:45 +0800

Stefan Hajnoczi <stefanha@redhat.com> 于2022年7月12日周二 23:49写道:
>
> On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c            |  41 ++++++
> >  block/coroutines.h               |   5 +
> >  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
> >  include/block/block-common.h     |  43 +++++-
> >  include/block/block_int-common.h |  20 +++
> >  5 files changed, 344 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index f425b00793..0a05247ae4 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_report command.
> > + * offset can be any number within the zone size. No alignment for offset.
>
> I think offset is a byte offset from the start of the device and its
> range is [0, total_sectors * BDRV_SECTOR_SIZE)?
>
> "any number within the zone size" gives the impression that the value
> must be [0, zone_size_in_bytes), which is not right.
>
> I suggest changing the text to "offset can be any number of bytes from
> the start of the device" or similar.
>
> > + * nr_zones represents IN maximum and OUT actual.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
>
> The !blk_is_available(blk) case needs to return -ENOMEDIUM before we can
> safely dereference blk->root->bs (which can also be written as
> blk_bs(blk)).
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Send a zone_management command.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
>
> Please define typedef enum { ... } BlockZoneOp instead of enum { ... }
> zone_op and then use a BlockZoneOp op argument instead of enum zone_op.
> QEMU coding style uses typedefs instead of struct foo or enum foo when
> possible.

Yes, it must be a type.

> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..19aa96cc56 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +                                  int64_t offset, int64_t len);
> >
> >
> >  /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..e7523ae2ed 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >  #include <sys/param.h>
> >  #include <sys/syscall.h>
> >  #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> >  #include <linux/cdrom.h>
> >  #include <linux/fd.h>
> >  #include <linux/fs.h>
> > @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
> >              PreallocMode prealloc;
> >              Error **errp;
> >          } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        struct {
> > +            zone_op op;
> > +        } zone_mgmt;
> >      };
> >  } RawPosixAIOData;
> >
> > @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t 
> > *in_off, int out_fd,
> >  }
> >  #endif
> >
>
> Are the functions below within #ifdef __linux__?

Maybe add them later?

> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +                              struct blk_zone *blkz) {
> > +    zone->start = blkz->start;
> > +    zone->length = blkz->len;
> > +    zone->cap = blkz->capacity;
> > +    zone->wp = blkz->wp - blkz->start;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
>
> The code is easier to read if it's clear this value is in sector units:
>
>   int64_t sector = aiocb->aio_offset / 512; /* BLKREPORTZONE uses 512B 
> sectors */
>
> Then there's no confusion about whether 'offset' is bytes or sectors.

I see. It is more readable.

> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> > blk_zone);
> > +    g_autofree struct blk_zone_report *rep = NULL;
> > +    rep = g_malloc(rep_size);
> > +    offset = offset / 512; /* get the unit of the start sector: sector 
> > size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
>
> After the first time around the while loop zones[] no longer has space
> for nrz elements. This must be taken into account to avoid overflowing
> zones[]:
>
>   rep->nr_zones = nrz - n;
>
> > +
> > +        ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        if (ret != 0) {
>
> Does this need to retry when ret != 0 && errno == EINTR is encountered?
> Damien/Hannes/Dmitry might know the answer. See handle_aiocb_rw_vector()
> for an example of retrying when EINTR is returned from a blocking
> syscall.
>
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         fd, offset, errno);
>
> Please use "... at %" PRId64 " failed ..." instead of %ld for int64_t
> values. %ld is not portable because sizeof(long) varies on different
> machines.

Right! I've missed other part than qemuio-cmds to use PRI64.

> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            parse_zone(&zones[n], &blkz[i]);
> > +            /* The next report should start after the last zone reported */
> > +            offset = blkz[i].start + blkz[i].len;
> > +        }
> > +    }
> > +
> > +    *nr_zones = n;
> > +    return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->zone_mgmt.op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
> > +    g_autofree struct stat *file = NULL;
> > +    file = g_new(struct stat, 1);
> > +    stat(s->filename, file);
>
> Heap allocation is not needed. It's easier to put the struct stat
> variable on the stack:
>
>   struct stat st;
>   if (fstat(fd, &st) < 0) {
>       return -errno;
>   }
>
> fstat(2) is preferred over stat(2) when a file descriptor is available
> because stat(2) suffers from race conditions when file system paths have
> changed.

Yes, it was mentioned before. I have included this change in patch 3.
I'll move them into this patch for better reviewing later.

> > +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
>
> The name "zone_sectors" would be clearer since size doesn't indicate the
> units (bytes vs sectors).
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
>
> Bytes and sectors units are being mixed here:
>
>   int64_t offset = aiocb->aio_offset; <-- bytes
>
> I suggest changing it to:
>
>   int64_t sector = aiocb->aio_offset / 512; /* BLK*ZONE ioctls use 512B 
> sectors */

I see. I missed considering the difference between sector and byte.

> > +        error_report("offset is not the start of a zone");
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (len & zone_size_mask) {
>
> Bytes and sectors are mixed here too. I think len is in bytes:
>
>   int64_t len = aiocb->aio_nbytes;
>
> Maybe change it to:
>
>   int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > +        error_report("len is not aligned to zones");
> > +        return -EINVAL;
> > +    }
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = len;
> > +    ret = ioctl(fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                     ioctl_name, errno);
> > +        return -errno;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int handle_aiocb_copy_range(void *opaque)
> >  {
> >      RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, 
> > uint64_t nbytes, int ret)
> >      }
> >  }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t 
> > offset,
> > +                                           int64_t *nr_zones,
> > +                                           BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
>
> Please define QEMU_AIO_ZONE_REPORT, QEMU_AIO_ZONE_MGMT, etc values for
> these new operations instead of reusing QEMU_AIO_IOCTL, which is meant
> for generic passthrough ioctls.
> > +        .aio_offset = offset,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs             = bs,
> > +        .aio_fildes     = s->fd,
> > +        .aio_type       = QEMU_AIO_IOCTL,
> > +        .aio_offset     = offset,
> > +        .aio_nbytes     = len,
> > +        .zone_mgmt  = {
> > +                .op = op,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
> >      .bdrv_abort_perm_update = raw_abort_perm_update,
> >      .create_opts = &raw_create_opts,
> >      .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >  };
> >
> >  /***********************************************/
> > @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
> >  #endif
> >  };
> >
>
> #ifdef __linux__
>
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
>
> This function hardcodes "host_device:". zoned_host_device needs a
> separate function that uses "zoned_host_device:".
>
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
>
> #endif /* __linux__ */
>
> > +
> >  #if defined(__linux__) || defined(__FreeBSD__) || 
> > defined(__FreeBSD_kernel__)
> >  static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                   Error **errp)
> > @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
> >  #if defined(HAVE_HOST_BLOCK_DEVICE)
> >      bdrv_register(&bdrv_host_device);
> >  #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >      bdrv_register(&bdrv_host_cdrom);
> >  #endif
> >  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> >   */
> >  #ifndef BLOCK_COMMON_H
> >  #define BLOCK_COMMON_H
> > -
> >  #include "block/aio.h"
> >  #include "block/aio-wait.h"
> >  #include "qemu/iov.h"
>
> Please avoid making whitespace changes that are unrelated to the patch.

Sorry, I will be more careful.

> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> >  typedef struct BdrvChild BdrvChild;
> >  typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +} zone_op;
>
> QEMU coding style:
>
>   typedef enum {
>       BLK_ZO_OPEN,
>       BLK_ZO_CLOSE,
>       BLK_ZO_FINISH,
>       BLK_ZO_RESET,
>   } BlockZoneOp;
>
> Please also reformat the enums below.
>
> > +
> > +typedef enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > +    BLK_ZS_NOT_WP = 0x0,
> > +    BLK_ZS_EMPTY = 0x1,
> > +    BLK_ZS_IOPEN = 0x2,
> > +    BLK_ZS_EOPEN = 0x3,
> > +    BLK_ZS_CLOSED = 0x4,
> > +    BLK_ZS_RDONLY = 0xD,
> > +    BLK_ZS_FULL = 0xE,
> > +    BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > +    BLK_ZT_CONV = 0x1,
> > +    BLK_ZT_SWR = 0x2,
> > +    BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in 
> > bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    BlkZoneType type;
> > +    BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> >      int cluster_size;
> > diff --git a/include/block/block_int-common.h 
> > b/include/block/block_int-common.h
> > index 8947abab76..6037871089 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> >      struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    uint32_t zone_size;
> > +    zone_model model;
> > +    uint32_t block_size;
> > +    uint32_t write_granularity;
> > +    uint32_t nr_zones;
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
> > +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones 
> > */
> > +    uint32_t max_nr_active_zones;
> > +} zbd_dev;
>
> This struct isn't use by this patch. Please move this change into the
> patch that uses struct zbd_dev.
>
> QEMU coding style naming would be BlockZoneDev instead of zbd_dev.
>
> I think this struct contains the block limits fields that could be added
> to QEMU's BlockLimits? A new struct may not be necessary.

Yes! It will in the next patch series.

> >
> >  struct BlockDriver {
> >      /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> >                                            QEMUIOVector *qiov,
> >                                            int64_t pos);
> >
> > +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > +            int64_t offset, int64_t *nr_zones,
> > +            BlockZoneDescriptor *zones);
> > +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum 
> > zone_op op,
> > +            int64_t offset, int64_t len);
> > +
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> >      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
> > --
> > 2.36.1
> >



reply via email to

[Prev in Thread] Current Thread [Next in Thread]