>From 2a28241de49d27d7b0968f1d1b137aea153fc3b4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Oct 2020 09:44:32 -0400 Subject: [PATCH 2/2] migration: Postpone the kick of the fault thread after recover The new migrate_send_rp_req_pages_pending() call should greatly improve destination responsiveness because it will resync faulted address after postcopy recovery. However it is also the 1st place to initiate the page request from the main thread. One thing is overlooked on that migrate_send_rp_message_req_pages() is not designed to be thread-safe. So if we wake the fault thread before syncing all the faulted pages in the main thread, it means they can race. Postpone the wake up operation after the sync of faulted addresses. Fixes: 0c26781c09 ("migration: Sync requested pages after postcopy recovery") Signed-off-by: Peter Xu --- migration/savevm.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index e8834991ec..5f937a2762 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2069,12 +2069,9 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) /* * This means source VM is ready to resume the postcopy migration. - * It's time to switch state and release the fault thread to - * continue service page faults. */ migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->postcopy_pause_sem_fault); trace_loadvm_postcopy_handle_resume(); @@ -2095,6 +2092,14 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) */ migrate_send_rp_req_pages_pending(mis); + /* + * It's time to switch state and release the fault thread to continue + * service page faults. Note that this should be explicitly after the + * above call to migrate_send_rp_req_pages_pending(). In short: + * migrate_send_rp_message_req_pages() is not thread safe, yet. + */ + qemu_sem_post(&mis->postcopy_pause_sem_fault); + return 0; } -- 2.26.2