qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replic


From: Changlong Xie
Subject: Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication
Date: Tue, 7 Jun 2016 13:36:55 +0800
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.7.0

On 05/20/2016 03:36 PM, Changlong Xie wrote:
+        if (!failover) {
+            /*
+             * This BDS will be closed, and the job should be completed
+             * before the BDS is closed, because we will access hidden
+             * disk, secondary disk in backup_job_completed().
+             */
+            if (s->secondary_disk->bs->job) {
+                block_job_cancel_sync(s->secondary_disk->bs->job);
+            }
+            secondary_do_checkpoint(s, errp);
+            s->replication_state = BLOCK_REPLICATION_DONE;
+            aio_context_release(aio_context);
+            return;
+        }
+
+        s->replication_state = BLOCK_REPLICATION_FAILOVER;
+        if (s->secondary_disk->bs->job) {
+            block_job_cancel(s->secondary_disk->bs->job);
+        }

Since commit b6d2e599 "block: Convert block job core to BlockBackend", blockjob uses BB instead of bdrv_ref(), this introduces unexpected Segmentation fault with COLO.

In the below backtrace, you can see that. During failover, s->target was changed to an illegal value "0x1e1e1e1e1e1e1e1e" in bakup_complete. Then the active commit job what also has a pointer that refer to s->target will use this illegal pointer. To avoid this, we should use "bloc_job_cancel_sync" to ensure backup job complete synchronously.

% MALLOC_PERTURB_=$(($RANDOM % 255 + 1))
% export MALLOC_PERTURB_
% gdb --args ./tests/test-replication
(gdb) b backup_complete
(gdb) r
(gdb) n
(gdb) n
(gdb) watch s->target
(gdb) c
Old value = (BlockBackend *) 0x555555f1d990
New value = (BlockBackend *) 0x1e1e1e1e1e1e1e1e
0x00007ffff5a811eb in memset () from /lib64/libc.so.6
(gdb) bt
#0  0x00007ffff5a811eb in memset () from /lib64/libc.so.6
#1  0x00007ffff5a7500e in _int_free () from /lib64/libc.so.6
#2  0x00007ffff705bf7f in g_free () from /lib64/libglib-2.0.so.0
#3 0x000055555557e924 in block_job_unref (job=0x555555f1d630) at blockjob.c:124 #4 0x000055555557e9da in block_job_completed_single (job=0x555555f1d630) at blockjob.c:143 #5 0x000055555557ecc8 in block_job_completed (job=0x555555f1d630, ret=0) at blockjob.c:215 #6 0x00005555555e6d49 in backup_complete (job=0x555555f1d630, opaque=0x555555f1dd50) at block/backup.c:325 #7 0x000055555557f5f4 in block_job_defer_to_main_loop_bh (opaque=0x5555596e1dc0) at blockjob.c:500
#8  0x00005555555747d7 in aio_bh_call (bh=0x5555596e1c30) at async.c:66
#9  0x0000555555574899 in aio_bh_poll (ctx=0x555555ce2d60) at async.c:94
#10 0x0000555555581d4d in aio_dispatch (ctx=0x555555ce2d60) at aio-posix.c:308 #11 0x00005555555823cb in aio_poll (ctx=0x555555ce2d60, blocking=false) at aio-posix.c:479 #12 0x00005555555d639b in bdrv_drain_poll (bs=0x555555dec210) at block/io.c:190 #13 0x00005555555d6566 in bdrv_drained_begin (bs=0x555555dec210) at block/io.c:240 #14 0x0000555555577261 in bdrv_child_cb_drained_begin (child=0x5555596e1ab0) at block.c:665 #15 0x00005555555d5e81 in bdrv_parent_drained_begin (bs=0x555555f0e130) at block/io.c:54 #16 0x00005555555d652b in bdrv_drained_begin (bs=0x555555f0e130) at block/io.c:232 #17 0x0000555555577261 in bdrv_child_cb_drained_begin (child=0x5555596e1810) at block.c:665 #18 0x00005555555d5e81 in bdrv_parent_drained_begin (bs=0x5555596d7030) at block/io.c:54 #19 0x00005555555d652b in bdrv_drained_begin (bs=0x5555596d7030) at block/io.c:232 #20 0x0000555555577261 in bdrv_child_cb_drained_begin (child=0x555555df8830) at block.c:665 #21 0x00005555555d5e81 in bdrv_parent_drained_begin (bs=0x555555df0bb0) at block/io.c:54
#22 0x00005555555d66ee in bdrv_drain_all () at block/io.c:301
#23 0x0000555555579f9f in bdrv_reopen_multiple (bs_queue=0x555555f1dd30, errp=0x7fffffffd768) at block.c:1953 #24 0x000055555557a169 in bdrv_reopen (bs=0x555555df0bb0, bdrv_flags=24578, errp=0x7fffffffd8e0) at block.c:1994 #25 0x00005555555d54a7 in commit_active_start (bs=0x555555f0e130, base=0x555555df0bb0, speed=0, on_error=BLOCKDEV_ON_ERROR_REPORT, cb=0x5555555e8b97 <replication_done>, opaque=0x555555dec210, errp=0x7fffffffd8e0, auto_complete=true) at block/mirror.c:901 #26 0x00005555555e8d98 in replication_stop (rs=0x55555746f7b0, failover=true, errp=0x7fffffffd8e0) at block/replication.c:623 #27 0x00005555555873d1 in replication_stop_all (failover=true, errp=0x7fffffffd928) at replication.c:98 #28 0x000055555557406d in test_secondary_start () at tests/test-replication.c:403 #29 0x00007ffff707a5e1 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 #30 0x00007ffff707a7a6 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 #31 0x00007ffff707a7a6 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0
#32 0x00007ffff707ab1b in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x00005555555746c8 in main (argc=1, argv=0x7fffffffdda8) at tests/test-replication.c:545
(gdb) d breakpoints
(gdb) c
Program received signal SIGSEGV, Segmentation fault.
0x00005555555c7d6c in blk_bs (blk=0x1e1e1e1e1e1e1e1e) at block/block-backend.c:389
389         return blk->root ? blk->root->bs : NULL;
(gdb) bt
#0 0x00005555555c7d6c in blk_bs (blk=0x1e1e1e1e1e1e1e1e) at block/block-backend.c:389 #1 0x00005555555c79e3 in bdrv_next (it=0x7fffffffd6a0) at block/block-backend.c:279
#2  0x00005555555d674d in bdrv_drain_all () at block/io.c:294
#3 0x0000555555579f9f in bdrv_reopen_multiple (bs_queue=0x555555f1dd30, errp=0x7fffffffd768) at block.c:1953 #4 0x000055555557a169 in bdrv_reopen (bs=0x555555df0bb0, bdrv_flags=24578, errp=0x7fffffffd8e0) at block.c:1994 #5 0x00005555555d54a7 in commit_active_start (bs=0x555555f0e130, base=0x555555df0bb0, speed=0, on_error=BLOCKDEV_ON_ERROR_REPORT, cb=0x5555555e8b97 <replication_done>, opaque=0x555555dec210, errp=0x7fffffffd8e0, auto_complete=true) at block/mirror.c:901 #6 0x00005555555e8d98 in replication_stop (rs=0x55555746f7b0, failover=true, errp=0x7fffffffd8e0) at block/replication.c:623 #7 0x00005555555873d1 in replication_stop_all (failover=true, errp=0x7fffffffd928) at replication.c:98 #8 0x000055555557406d in test_secondary_start () at tests/test-replication.c:403 #9 0x00007ffff707a5e1 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 #10 0x00007ffff707a7a6 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0 #11 0x00007ffff707a7a6 in g_test_run_suite_internal () from /lib64/libglib-2.0.so.0
#12 0x00007ffff707ab1b in g_test_run_suite () from /lib64/libglib-2.0.so.0
#13 0x00005555555746c8 in main (argc=1, argv=0x7fffffffdda8) at tests/test-replication.c:545

+
+        commit_active_start(s->active_disk->bs, s->secondary_disk->bs, 0,
+                            BLOCKDEV_ON_ERROR_REPORT, replication_done,
+                            bs, errp, true);
+        break;





reply via email to

[Prev in Thread] Current Thread [Next in Thread]