[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriv
From: |
Benoît Canet |
Subject: |
Re: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState |
Date: |
Mon, 1 Sep 2014 10:57:43 +0200 |
User-agent: |
Mutt/1.5.23 (2014-03-12) |
The Monday 01 Sep 2014 à 15:43:12 (+0800), Liu Yuan wrote :
> This allow VM continues to process even if some devices are broken meanwhile
> with proper configuration.
>
> We mark the device broken when the protocol tier notify back some broken
> state(s) of the device, such as diconnection via driver operations. We could
> also reset the device as sound when the protocol tier is repaired.
>
> Origianlly .threshold controls how we should decide the success of read/write
> and return the failure only if the success count of read/write is less than
> .threshold specified by users. But it doesn't record the states of underlying
> states and will impact performance a bit in some cases.
>
> For example, we have 3 children and .threshold is set 2. If one of the devices
> broken, we should still return success and continue to run VM. But for every
> IO operations, we will blindly send the requests to the broken device.
>
> To store broken state into driver state we can save requests to borken devices
> and resend the requests to the repaired ones by setting broken as false.
>
> This is especially useful for network based protocol such as sheepdog, which
> has a auto-reconnection mechanism and will never report EIO if the connection
> is broken but just store the requests to its local queue and wait for
> resending.
> Without broken state, quorum request will not come back until the connection
> is
> re-established. So we have to skip the broken deivces to allow VM to continue
> running with networked backed child (glusterfs, nfs, sheepdog, etc).
>
> With the combination of read-pattern and threshold, we can easily mimic the
> DRVD
> behavior with following configuration:
>
> read-pattern=fifo,threshold=1 will two children.
>
> Cc: Eric Blake <address@hidden>
> Cc: Benoit Canet <address@hidden>
> Cc: Kevin Wolf <address@hidden>
> Cc: Stefan Hajnoczi <address@hidden>
> Signed-off-by: Liu Yuan <address@hidden>
> ---
> block/quorum.c | 102
> ++++++++++++++++++++++++++++++++++++++--------
> include/block/block_int.h | 3 ++
> 2 files changed, 87 insertions(+), 18 deletions(-)
>
> diff --git a/block/quorum.c b/block/quorum.c
> index b9eeda3..7b07e35 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -120,6 +120,7 @@ struct QuorumAIOCB {
> int rewrite_count; /* number of replica to rewrite: count down
> to
> * zero once writes are fired
> */
> + int issued_count; /* actual read&write issued count */
>
> QuorumVotes votes;
>
> @@ -170,8 +171,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> if (acb->is_read) {
> /* on the quorum case acb->child_iter == s->num_children - 1 */
> for (i = 0; i <= acb->child_iter; i++) {
> - qemu_vfree(acb->qcrs[i].buf);
> - qemu_iovec_destroy(&acb->qcrs[i].qiov);
> + if (acb->qcrs[i].buf) {
> + qemu_vfree(acb->qcrs[i].buf);
> + qemu_iovec_destroy(&acb->qcrs[i].qiov);
> + }
> }
> }
>
> @@ -207,6 +210,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> acb->count = 0;
> acb->success_count = 0;
> acb->rewrite_count = 0;
> + acb->issued_count = 0;
> acb->votes.compare = quorum_sha256_compare;
> QLIST_INIT(&acb->votes.vote_list);
> acb->is_read = false;
> @@ -286,6 +290,22 @@ static void quorum_copy_qiov(QEMUIOVector *dest,
> QEMUIOVector *source)
> }
> }
>
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> + BDRVQuorumState *s = acb->common.bs->opaque;
> + int i;
> +
> + for (i = acb->child_iter; i < s->num_children; i++) {
> + if (!s->bs[i]->broken) {
> + break;
> + }
> + }
> + if (i == s->num_children) {
> + return -1;
> + }
> + return i;
> +}
> +
> static void quorum_aio_cb(void *opaque, int ret)
> {
> QuorumChildRequest *sacb = opaque;
> @@ -293,11 +313,18 @@ static void quorum_aio_cb(void *opaque, int ret)
> BDRVQuorumState *s = acb->common.bs->opaque;
> bool rewrite = false;
>
> + if (ret < 0) {
> + s->bs[acb->child_iter]->broken = true;
> + }
child_iter is fifo mode stuff.
Do we need to write if (s->read_pattern == QUORUM_READ_PATTERN_FIFO && ret < 0)
here ?
> +
> if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
> /* We try to read next child in FIFO order if we fail to read */
> - if (ret < 0 && ++acb->child_iter < s->num_children) {
> - read_fifo_child(acb);
> - return;
> + if (ret < 0) {
> + acb->child_iter = next_fifo_child(acb);
You don't seem to increment child_iter anymore.
> + if (acb->child_iter > 0) {
> + read_fifo_child(acb);
> + return;
> + }
> }
>
> if (ret == 0) {
> @@ -315,9 +342,9 @@ static void quorum_aio_cb(void *opaque, int ret)
> } else {
> quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
> }
> - assert(acb->count <= s->num_children);
> - assert(acb->success_count <= s->num_children);
> - if (acb->count < s->num_children) {
> + assert(acb->count <= acb->issued_count);
> + assert(acb->success_count <= acb->issued_count);
> + if (acb->count < acb->issued_count) {
> return;
> }
>
> @@ -647,22 +674,46 @@ free_exit:
> return rewrite;
> }
>
> +static bool has_enough_children(BDRVQuorumState *s)
> +{
> + int good = 0, i;
> +
> + for (i = 0; i < s->num_children; i++) {
> + if (s->bs[i]->broken) {
> + continue;
> + }
> + good++;
> + }
> +
> + if (good >= s->threshold) {
> + return true;
> + } else {
> + return false;
> + }
> +}
> +
> static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
> {
> BDRVQuorumState *s = acb->common.bs->opaque;
> int i;
>
> + if (!has_enough_children(s)) {
> + quorum_aio_release(acb);
> + return NULL;
> + }
> +
> for (i = 0; i < s->num_children; i++) {
> + if (s->bs[i]->broken) {
> + continue;
> + }
> acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
> qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
> qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
> - }
> -
> - for (i = 0; i < s->num_children; i++) {
> acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
> &acb->qcrs[i].qiov,
> acb->nb_sectors, quorum_aio_cb,
> &acb->qcrs[i]);
> + acb->issued_count++;
> }
>
> return &acb->common;
> @@ -679,7 +730,7 @@ static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
> acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
> &acb->qcrs[i].qiov, acb->nb_sectors,
> quorum_aio_cb, &acb->qcrs[i]);
> -
> + acb->issued_count = 1;
> return &acb->common;
> }
>
> @@ -693,6 +744,7 @@ static BlockDriverAIOCB
> *quorum_aio_readv(BlockDriverState *bs,
> BDRVQuorumState *s = bs->opaque;
> QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
> nb_sectors, cb, opaque);
> +
Spurious carriage return.
> acb->is_read = true;
>
> if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
> @@ -701,6 +753,11 @@ static BlockDriverAIOCB
> *quorum_aio_readv(BlockDriverState *bs,
> }
>
> acb->child_iter = 0;
> + acb->child_iter = next_fifo_child(acb);
> + if (acb->child_iter < 0) {
> + quorum_aio_release(acb);
> + return NULL;
> + }
> return read_fifo_child(acb);
> }
>
> @@ -716,10 +773,19 @@ static BlockDriverAIOCB
> *quorum_aio_writev(BlockDriverState *bs,
> cb, opaque);
> int i;
>
> + if (!has_enough_children(s)) {
> + quorum_aio_release(acb);
> + return NULL;
> + }
> +
> for (i = 0; i < s->num_children; i++) {
> + if (s->bs[i]->broken) {
> + continue;
> + }
> acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
> nb_sectors, &quorum_aio_cb,
> &acb->qcrs[i]);
> + acb->issued_count++;
> }
>
> return &acb->common;
> @@ -926,6 +992,12 @@ static int quorum_open(BlockDriverState *bs, QDict
> *options, int flags,
> }
>
> s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
> + /* and validate it against s->num_children */
> + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
> + if (ret < 0) {
> + goto exit;
> + }
> +
> ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
> if (ret < 0) {
> error_setg(&local_err, "Please set read-pattern as fifo or
> quorum\n");
> @@ -934,12 +1006,6 @@ static int quorum_open(BlockDriverState *bs, QDict
> *options, int flags,
> s->read_pattern = ret;
>
> if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
> - /* and validate it against s->num_children */
> - ret = quorum_valid_threshold(s->threshold, s->num_children,
> &local_err);
> - if (ret < 0) {
> - goto exit;
> - }
> -
> /* is the driver in blkverify mode */
> if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
> s->num_children == 2 && s->threshold == 2) {
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 9fdec7f..599110b 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -402,6 +402,9 @@ struct BlockDriverState {
>
> /* The error object in use for blocking operations on backing_hd */
> Error *backing_blocker;
> +
> + /* True if the associated device is broken */
> + bool broken;
> };
>
> int get_tmp_filename(char *filename, int size);
> --
> 1.9.1
>
>
- Re: [Qemu-devel] [PATCH 3/8] block/sheepdog: propagate disconnect/reconnect events to upper driver, (continued)
[Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState, Liu Yuan, 2014/09/01
- Re: [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState,
Benoît Canet <=
[Qemu-devel] [PATCH 7/8] block: add two helpers, Liu Yuan, 2014/09/01
[Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic, Liu Yuan, 2014/09/01
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/01
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/02
Re: [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver, Benoît Canet, 2014/09/07