Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm


From: Zhi Yong Wu
Subject: Re: [Qemu-devel] [PATCH v12 3/5] block: add I/O throttling algorithm
Date: Tue, 8 Nov 2011 12:34:46 +0800

On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <address@hidden> wrote:
> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>> Signed-off-by: Zhi Yong Wu <address@hidden>
>> Signed-off-by: Stefan Hajnoczi <address@hidden>
>> ---
>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  block.h     |    1 +
>>  block_int.h |    1 +
>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 79e7f09..b2af48f 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>                                                 bool is_write);
>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, int64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>  #endif
>>
>>  /* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> +    bs->io_limits_enabled = false;
>> +
>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +        bs->block_timer = NULL;
>> +    }
>> +
>> +    bs->slice_start = 0;
>> +    bs->slice_end   = 0;
>> +    bs->slice_time  = 0;
>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>> +}
>> +
>>  static void bdrv_block_timer(void *opaque)
>>  {
>>      BlockDriverState *bs = opaque;
>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>  }
>>
>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>> +                                     bool is_write, int nb_sectors)
>> +{
>> +    int64_t wait_time = -1;
>> +
>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>> +    }
>> +
>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>> +     * throttled requests will not be dequeued until the current request is
>> +     * allowed to be serviced. So if the current request still exceeds the
>> +     * limits, it will be inserted to the head. All requests followed it will
>> +     * be still in throttled_reqs queue.
>> +     */
>> +
>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>> +        qemu_mod_timer(bs->block_timer,
>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>> +    }
>> +
>> +    qemu_co_queue_next(&bs->throttled_reqs);
>> +}
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>          bdrv_dev_change_media_cb(bs, true);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>
>>          bdrv_dev_change_media_cb(bs, false);
>>      }
>> +
>> +    /*throttling disk I/O limits*/
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_disable(bs);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>          return -EIO;
>>      }
>>
>> +    /* throttling disk read I/O */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>> +    }
>> +
>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>  }
>>
>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>          return -EIO;
>>      }
>>
>> +    /* throttling disk write I/O */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>> +    }
>> +
>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>
>>      if (bs->dirty_bitmap) {
>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>      acb->pool->cancel(acb);
>>  }
>>
>> +/* block I/O throttling */
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_base, bytes_res;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end - bs->slice_start;
>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>> +    bytes_limit = bps_limit * slice_time;
>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    if (bytes_base + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>> +
>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>
> I'm not quite sure what bs->slice_end really is and what these
> calculations do exactly. Looks like magic. Can you add some comments
> that explain why slice_end is increased?
As you know, when the I/O rate at runtime exceeds the limits,
bs->slice_end needs to be extended so that the current statistics are
kept until the timer fires; the amount by which it is increased was
tuned based on the results of experiments.
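
Just to make the arithmetic concrete (illustration only; this assumes
BLOCK_IO_SLICE_TIME is 100 ms expressed in nanoseconds, as this series
defines it, and the numbers below are made up):

    wait_time is in seconds at this point, so
        bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10
                       = wait_time * 1,000,000,000 ns
    is simply wait_time converted to nanoseconds, and
        bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME
    moves slice_end forward by that converted wait time minus a fixed
    300 ms. For example, with wait_time = 0.5 s, slice_end is extended
    by 500 ms - 300 ms = 200 ms.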

> and how you estimate *wait?
The wait time is calculated based on the bps and iops history info
accumulated in the current slice.

bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

1.) bytes_base is the number of bytes that have already been read/written
in the current slice; it comes from the accumulated statistics.
2.) bytes_res is the number of bytes the current request still needs to
read/write.
3.) (bytes_base + bytes_res) / bps_limit is the total time needed to
transfer all of that data at the configured limit; subtracting
elapsed_time gives the remaining time to wait (see the worked example
below).
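
A quick worked example with made-up numbers (assuming the usual
512-byte BDRV_SECTOR_SIZE):

    bps_limit    = 1,000,000 bytes/s
    elapsed_time = 0.1 s            (time since slice_start)
    bytes_base   = 80,000 bytes     (already accounted in this slice)
    nb_sectors   = 100  ->  bytes_res = 100 * 512 = 51,200 bytes

    wait_time = (80,000 + 51,200) / 1,000,000 - 0.1
              = 0.1312 - 0.1
              = 0.0312 s

    so this request has to wait roughly 31 ms before it fits under the
    1 MB/s limit.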

I'm not sure whether this explanation is clear enough; please let me
know if it isn't.

>
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>
> Coding style requires the brace on its own line.
>
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_base;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end - bs->slice_start;
>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>> +    ios_limit  = iops_limit * slice_time;
>> +    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
>> +    }
>> +
>> +    if (ios_base + 1 <= ios_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_base + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +        wait_time = wait_time - elapsed_time;
>> +    } else {
>> +        wait_time = 0;
>> +    }
>> +
>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +                           bool is_write, int64_t *wait) {
>
> Same here.
>
> Kevin
>



-- 
Regards,

Zhi Yong Wu


