[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigratio
From: |
Isaku Yamahata |
Subject: |
Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration |
Date: |
Wed, 4 Jan 2012 12:34:33 +0900 |
User-agent: |
Mutt/1.5.19 (2009-01-05) |
On Thu, Dec 29, 2011 at 05:51:36PM +0200, Orit Wasserman wrote:
> Hi,
Thank you for review.
> A general comment this patch is a bit too long,which makes it hard to review.
> Can you split it please?
Will do. Maybe split into umem.[hc] part, incoming part and outgoing part.
> On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> > This patch implements postcopy livemigration.
> >
> > Signed-off-by: Isaku Yamahata <address@hidden>
> > ---
> > Makefile.target | 4 +
> > arch_init.c | 26 +-
> > cpu-all.h | 7 +
> > exec.c | 20 +-
> > migration-exec.c | 8 +
> > migration-fd.c | 30 +
> > migration-postcopy-stub.c | 77 ++
> > migration-postcopy.c | 1891
> > +++++++++++++++++++++++++++++++++++++++++++++
> > migration-tcp.c | 37 +-
> > migration-unix.c | 32 +-
> > migration.c | 31 +
> > migration.h | 30 +
> > qemu-common.h | 1 +
> > qemu-options.hx | 5 +-
> > umem.c | 379 +++++++++
> > umem.h | 105 +++
> > vl.c | 14 +-
> > 17 files changed, 2677 insertions(+), 20 deletions(-)
> > create mode 100644 migration-postcopy-stub.c
> > create mode 100644 migration-postcopy.c
> > create mode 100644 umem.c
> > create mode 100644 umem.h
> >
> > diff --git a/Makefile.target b/Makefile.target
> > index 3261383..d94c53f 100644
> > --- a/Makefile.target
> > +++ b/Makefile.target
> > @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h
> > CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y)
> > CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y)
> > CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y)
> > +CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y)
> >
> > include ../config-host.mak
> > include config-devices.mak
> > @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
> > obj-y += memory.o
> > LIBS+=-lz
> >
> > +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o
> > +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o
> > +
> > QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
> > QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
> > QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
> > diff --git a/arch_init.c b/arch_init.c
> > index bc53092..8b3130d 100644
> > --- a/arch_init.c
> > +++ b/arch_init.c
> > @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
> > return 1;
> > }
> >
> > +static bool outgoing_postcopy = false;
> > +
> > +void ram_save_set_params(const MigrationParams *params, void *opaque)
> > +{
> > + outgoing_postcopy = params->postcopy;
> > +}
> > +
> > static RAMBlock *last_block_sent = NULL;
> >
> > int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
> > @@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int
> > stage, void *opaque)
> > uint64_t expected_time = 0;
> > int ret;
> >
> > + if (stage == 1) {
> > + last_block_sent = NULL;
> > +
> > + bytes_transferred = 0;
> > + last_block = NULL;
> > + last_offset = 0;
>
> Changing of line order + new empty line
>
> > + }
> > + if (outgoing_postcopy) {
> > + return postcopy_outgoing_ram_save_live(mon, f, stage, opaque);
> > + }
> > +
>
> I would just do :
>
> unregister_savevm_live and then
> register_savevm_live(...,postcopy_outgoing_ram_save_live,...)
> when starting outgoing postcopy migration.
>
> > if (stage < 0) {
> > cpu_physical_memory_set_dirty_tracking(0);
> > return 0;
> > @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int
> > stage, void *opaque)
> > }
> >
> > if (stage == 1) {
> > - bytes_transferred = 0;
> > - last_block_sent = NULL;
> > - last_block = NULL;
> > - last_offset = 0;
> > sort_ram_list();
> >
> > /* Make sure all dirty bits are set */
> > @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
> > int flags;
> > int error;
> >
> > + if (incoming_postcopy) {
> > + return postcopy_incoming_ram_load(f, opaque, version_id);
> > + }
> > +
> why not call register_savevm_live(...,postcopy_incoming_ram_load,...) when
> starting guest with postcopy_incoming
>
> > if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) {
> > return -EINVAL;
> > }
> > diff --git a/cpu-all.h b/cpu-all.h
> > index 0244f7a..2e9d8a7 100644
> > --- a/cpu-all.h
> > +++ b/cpu-all.h
> > @@ -475,6 +475,9 @@ extern ram_addr_t ram_size;
> > /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
> > #define RAM_PREALLOC_MASK (1 << 0)
> >
> > +/* RAM is allocated via umem for postcopy incoming mode */
> > +#define RAM_POSTCOPY_UMEM_MASK (1 << 1)
> > +
> > typedef struct RAMBlock {
> > uint8_t *host;
> > ram_addr_t offset;
> > @@ -485,6 +488,10 @@ typedef struct RAMBlock {
> > #if defined(__linux__) && !defined(TARGET_S390X)
> > int fd;
> > #endif
> > +
> > +#ifdef CONFIG_POSTCOPY
> > + UMem *umem; /* for incoming postcopy mode */
> > +#endif
> > } RAMBlock;
> >
> > typedef struct RAMList {
> > diff --git a/exec.c b/exec.c
> > index c8c6692..90b0491 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -35,6 +35,7 @@
> > #include "qemu-timer.h"
> > #include "memory.h"
> > #include "exec-memory.h"
> > +#include "migration.h"
> > #if defined(CONFIG_USER_ONLY)
> > #include <qemu.h>
> > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev,
> > const char *name,
> > new_block->host = host;
> > new_block->flags |= RAM_PREALLOC_MASK;
> > } else {
> > +#ifdef CONFIG_POSTCOPY
> > + if (incoming_postcopy) {
> > + postcopy_incoming_ram_alloc(name, size,
> > + &new_block->host,
> > &new_block->umem);
> > + new_block->flags |= RAM_POSTCOPY_UMEM_MASK;
> > + } else
> > +#endif
> > if (mem_path) {
> > #if defined (__linux__) && !defined(TARGET_S390X)
> > new_block->host = file_ram_alloc(new_block, size, mem_path);
> > @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr)
> > QLIST_REMOVE(block, next);
> > if (block->flags & RAM_PREALLOC_MASK) {
> > ;
> > - } else if (mem_path) {
> > + }
> > +#ifdef CONFIG_POSTCOPY
> > + else if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> > + postcopy_incoming_ram_free(block->umem);
> > + }
> > +#endif
> > + else if (mem_path) {
> > #if defined (__linux__) && !defined(TARGET_S390X)
> > if (block->fd) {
> > munmap(block->host, block->length);
> > @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t
> > length)
> > } else {
> > flags = MAP_FIXED;
> > munmap(vaddr, length);
> > + if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> > + postcopy_incoming_qemu_pages_unmapped(addr, length);
> > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> > + }
> > if (mem_path) {
> > #if defined(__linux__) && !defined(TARGET_S390X)
> > if (block->fd) {
> > diff --git a/migration-exec.c b/migration-exec.c
> > index e14552e..2bd0c3b 100644
> > --- a/migration-exec.c
> > +++ b/migration-exec.c
> > @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s,
> > const char *command)
> > {
> > FILE *f;
> >
> > + if (s->params.postcopy) {
> > + return -ENOSYS;
> > + }
> > +
> > f = popen(command, "w");
> > if (f == NULL) {
> > DPRINTF("Unable to popen exec target\n");
> > @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command)
> > {
> > QEMUFile *f;
> >
> > + if (incoming_postcopy) {
> > + return -ENOSYS;
> > + }
> > +
> > DPRINTF("Attempting to start an incoming migration\n");
> > f = qemu_popen_cmd(command, "r");
> > if(f == NULL) {
> > diff --git a/migration-fd.c b/migration-fd.c
> > index 6211124..5a62ab9 100644
> > --- a/migration-fd.c
> > +++ b/migration-fd.c
> > @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const
> > char *fdname)
> > s->write = fd_write;
> > s->close = fd_close;
> >
> > + if (s->params.postcopy) {
> > + int flags = fcntl(s->fd, F_GETFL);
> > + if ((flags & O_ACCMODE) != O_RDWR) {
> > + goto err_after_open;
> > + }
> > +
> > + s->fd_read = dup(s->fd);
> > + if (s->fd_read == -1) {
> > + goto err_after_open;
> > + }
> > + s->file_read = qemu_fdopen(s->fd_read, "r");
> > + if (s->file_read == NULL) {
> > + close(s->fd_read);
> > + goto err_after_open;
> > + }
> > + }
> > +
> > migrate_fd_connect(s);
> > return 0;
> >
> > @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque)
> >
> > process_incoming_migration(f);
> > qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f);
> > + }
> > qemu_fclose(f);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_qemu_ready();
> > + }
> > + return;
> > }
> >
> > int fd_start_incoming_migration(const char *infd)
> > @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd)
> > DPRINTF("Attempting to start an incoming migration via fd\n");
> >
> > fd = strtol(infd, NULL, 0);
> > + if (incoming_postcopy) {
> > + int flags = fcntl(fd, F_GETFL);
> > + if ((flags & O_ACCMODE) != O_RDWR) {
> > + return -EINVAL;
> > + }
> > + }
> > f = qemu_fdopen(fd, "rb");
> > if(f == NULL) {
> > DPRINTF("Unable to apply qemu wrapper to file descriptor\n");
> > diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c
> > new file mode 100644
> > index 0000000..0b78de7
> > --- /dev/null
> > +++ b/migration-postcopy-stub.c
> > @@ -0,0 +1,77 @@
> > +/*
> > + * migration-postcopy-stub.c: postcopy livemigration
> > + * stub functions for non-supported hosts
> > + *
> > + * Copyright (c) 2011
> > + * National Institute of Advanced Industrial Science and Technology
> > + *
> > + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> > + * Author: Isaku Yamahata <yamahata at valinux co jp>
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "sysemu.h"
> > +#include "migration.h"
> > +
> > +int postcopy_outgoing_create_read_socket(MigrationState *s)
> > +{
> > + return -ENOSYS;
> > +}
> > +
> > +int postcopy_outgoing_ram_save_live(Monitor *mon,
> > + QEMUFile *f, int stage, void *opaque)
> > +{
> > + return -ENOSYS;
> > +}
> > +
> > +void *postcopy_outgoing_begin(MigrationState *ms)
> > +{
> > + return NULL;
> > +}
> > +
> > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> > + void *postcopy)
> > +{
> > + return -ENOSYS;
> > +}
> > +
> > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> > +{
> > + return -ENOSYS;
> > +}
> > +
> > +void postcopy_incoming_prepare(void)
> > +{
> > +}
> > +
> > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> > +{
> > + return -ENOSYS;
> > +}
> > +
> > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> > +{
> > +}
> > +
> > +void postcopy_incoming_qemu_ready(void)
> > +{
> > +}
> > +
> > +void postcopy_incoming_qemu_cleanup(void)
> > +{
> > +}
> > +
> > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t
> > size)
> > +{
> > +}
> > diff --git a/migration-postcopy.c b/migration-postcopy.c
> > new file mode 100644
> > index 0000000..ed0d574
> > --- /dev/null
> > +++ b/migration-postcopy.c
> > @@ -0,0 +1,1891 @@
> > +/*
> > + * migration-postcopy.c: postcopy livemigration
> > + *
> > + * Copyright (c) 2011
> > + * National Institute of Advanced Industrial Science and Technology
> > + *
> > + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> > + * Author: Isaku Yamahata <yamahata at valinux co jp>
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "bitmap.h"
> > +#include "sysemu.h"
> > +#include "hw/hw.h"
> > +#include "arch_init.h"
> > +#include "migration.h"
> > +#include "umem.h"
> > +
> > +#include "memory.h"
> > +#define WANT_EXEC_OBSOLETE
> > +#include "exec-obsolete.h"
> > +
> > +//#define DEBUG_POSTCOPY
> > +#ifdef DEBUG_POSTCOPY
> > +#include <sys/syscall.h>
> > +#define DPRINTF(fmt, ...) \
> > + do { \
> > + printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid), \
> > + __func__, __LINE__, ## __VA_ARGS__); \
> > + } while (0)
> > +#else
> > +#define DPRINTF(fmt, ...) do { } while (0)
> > +#endif
> > +
> > +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1))
> > +
> > +static void fd_close(int *fd)
> > +{
> > + if (*fd >= 0) {
> > + close(*fd);
> > + *fd = -1;
> > + }
> > +}
> > +
> > +/***************************************************************************
> > + * QEMUFile for non blocking pipe
> > + */
> > +
> > +/* read only */
> > +struct QEMUFilePipe {
> > + int fd;
> > + QEMUFile *file;
> > +};
>
> Why not use QEMUFileSocket ?
Okay, will rename it to QEMUFile_FD (or whatever) and share the struct.
> > +typedef struct QEMUFilePipe QEMUFilePipe;
> > +
> > +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int
> > size)
> > +{
> > + QEMUFilePipe *s = opaque;
> > + ssize_t len = 0;
> > +
> > + while (size > 0) {
> > + ssize_t ret = read(s->fd, buf, size);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + continue;
> > + }
> > + if (len == 0) {
> > + len = -errno;
> > + }
> > + break;
> > + }
> > +
> > + if (ret == 0) {
> > + /* the write end of the pipe is closed */
> > + break;
> > + }
> > + len += ret;
> > + buf += ret;
> > + size -= ret;
> > + }
> > +
> > + return len;
> > +}
> > +
> > +static int pipe_close(void *opaque)
> > +{
> > + QEMUFilePipe *s = opaque;
> > + g_free(s);
> > + return 0;
> > +}
> > +
> > +static QEMUFile *qemu_fopen_pipe(int fd)
> > +{
> > + QEMUFilePipe *s = g_malloc0(sizeof(*s));
> > +
> > + s->fd = fd;
> > + fcntl_setfl(fd, O_NONBLOCK);
> > + s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close,
> > + NULL, NULL, NULL);
> > + return s->file;
> > +}
> > +
> > +/* write only */
> > +struct QEMUFileNonblock {
> > + int fd;
> > + QEMUFile *file;
> > +
> > + /* for pipe-write nonblocking mode */
> > +#define BUF_SIZE_INC (32 * 1024) /* = IO_BUF_SIZE */
> > + uint8_t *buffer;
> > + size_t buffer_size;
> > + size_t buffer_capacity;
> > + bool freeze_output;
> > +};
> > +typedef struct QEMUFileNonblock QEMUFileNonblock;
> > +
>
> Couldn't you use QEMUFileBuffered ?
QEMUFileBuffered can be built on top of QEMUFileNonblock.
I'll refactor buffered_file.c
> > +static void nonblock_flush_buffer(QEMUFileNonblock *s)
> > +{
> > + size_t offset = 0;
> > + ssize_t ret;
> > +
> > + while (offset < s->buffer_size) {
> > + ret = write(s->fd, s->buffer + offset, s->buffer_size - offset);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + continue;
> > + } else if (errno == EAGAIN) {
> > + s->freeze_output = true;
> > + } else {
> > + qemu_file_set_error(s->file, errno);
> > + }
> > + break;
> > + }
> > +
> > + if (ret == 0) {
> > + DPRINTF("ret == 0\n");
> > + break;
> > + }
> > +
> > + offset += ret;
> > + }
> > +
> > + if (offset > 0) {
> > + assert(s->buffer_size >= offset);
> > + memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
> > + s->buffer_size -= offset;
> > + }
> > + if (s->buffer_size > 0) {
> > + s->freeze_output = true;
> > + }
> > +}
> > +
> > +static int nonblock_put_buffer(void *opaque,
> > + const uint8_t *buf, int64_t pos, int size)
> > +{
> > + QEMUFileNonblock *s = opaque;
> > + int error;
> > + ssize_t len = 0;
> > +
> > + error = qemu_file_get_error(s->file);
> > + if (error) {
> > + return error;
> > + }
> > +
> > + nonblock_flush_buffer(s);
> > + error = qemu_file_get_error(s->file);
> > + if (error) {
> > + return error;
> > + }
> > +
> > + while (!s->freeze_output && size > 0) {
> > + ssize_t ret;
> > + assert(s->buffer_size == 0);
> > +
> > + ret = write(s->fd, buf, size);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + continue;
> > + } else if (errno == EAGAIN) {
> > + s->freeze_output = true;
> > + } else {
> > + qemu_file_set_error(s->file, errno);
> > + }
> > + break;
> > + }
> > +
> > + len += ret;
> > + buf += ret;
> > + size -= ret;
> > + }
> > +
> > + if (size > 0) {
> > + int inc = size - (s->buffer_capacity - s->buffer_size);
> > + if (inc > 0) {
> > + s->buffer_capacity +=
> > + DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC;
> > + s->buffer = g_realloc(s->buffer, s->buffer_capacity);
> > + }
> > + memcpy(s->buffer + s->buffer_size, buf, size);
> > + s->buffer_size += size;
> > +
> > + len += size;
> > + }
> > +
> > + return len;
> > +}
> > +
> > +static int nonblock_pending_size(QEMUFileNonblock *s)
> > +{
> > + return qemu_pending_size(s->file) + s->buffer_size;
> > +}
> > +
> > +static void nonblock_fflush(QEMUFileNonblock *s)
> > +{
> > + s->freeze_output = false;
> > + nonblock_flush_buffer(s);
> > + if (!s->freeze_output) {
> > + qemu_fflush(s->file);
> > + }
> > +}
> > +
> > +static void nonblock_wait_for_flush(QEMUFileNonblock *s)
> > +{
> > + while (nonblock_pending_size(s) > 0) {
> > + fd_set fds;
> > + FD_ZERO(&fds);
> > + FD_SET(s->fd, &fds);
> > + select(s->fd + 1, NULL, &fds, NULL, NULL);
> > +
> > + nonblock_fflush(s);
> > + }
> > +}
> > +
> > +static int nonblock_close(void *opaque)
> > +{
> > + QEMUFileNonblock *s = opaque;
> > + nonblock_wait_for_flush(s);
> > + g_free(s->buffer);
> > + g_free(s);
> > + return 0;
> > +}
> > +
> > +static QEMUFileNonblock *qemu_fopen_nonblock(int fd)
> > +{
> > + QEMUFileNonblock *s = g_malloc0(sizeof(*s));
> > +
> > + s->fd = fd;
> > + fcntl_setfl(fd, O_NONBLOCK);
> > + s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close,
> > + NULL, NULL, NULL);
> > + return s;
> > +}
> > +
> > +/***************************************************************************
> > + * umem daemon on destination <-> qemu on source protocol
> > + */
> > +
> > +#define QEMU_UMEM_REQ_INIT 0x00
> > +#define QEMU_UMEM_REQ_ON_DEMAND 0x01
> > +#define QEMU_UMEM_REQ_ON_DEMAND_CONT 0x02
> > +#define QEMU_UMEM_REQ_BACKGROUND 0x03
> > +#define QEMU_UMEM_REQ_BACKGROUND_CONT 0x04
> > +#define QEMU_UMEM_REQ_REMOVE 0x05
> > +#define QEMU_UMEM_REQ_EOC 0x06
> > +
> > +struct qemu_umem_req {
> > + int8_t cmd;
> > + uint8_t len;
> > + char *idstr; /* ON_DEMAND, BACKGROUND, REMOVE */
> > + uint32_t nr; /* ON_DEMAND, ON_DEMAND_CONT,
> > + BACKGROUND, BACKGROUND_CONT, REMOVE */
> > +
> > + /* in target page size as qemu migration protocol */
> > + uint64_t *pgoffs; /* ON_DEMAND, ON_DEMAND_CONT,
> > + BACKGROUND, BACKGROUND_CONT, REMOVE */
> > +};
> > +
> > +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char*
> > idstr)
> > +{
> > + qemu_put_byte(f, strlen(idstr));
> > + qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr));
> > +}
> > +
> > +static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr,
> > + const uint64_t *pgoffs)
> > +{
> > + uint32_t i;
> > +
> > + qemu_put_be32(f, nr);
> > + for (i = 0; i < nr; i++) {
> > + qemu_put_be64(f, pgoffs[i]);
> > + }
> > +}
> > +
> > +static void postcopy_incoming_send_req_one(QEMUFile *f,
> > + const struct qemu_umem_req *req)
> > +{
> > + DPRINTF("cmd %d\n", req->cmd);
> > + qemu_put_byte(f, req->cmd);
> > + switch (req->cmd) {
> > + case QEMU_UMEM_REQ_INIT:
> > + case QEMU_UMEM_REQ_EOC:
> > + /* nothing */
> > + break;
> > + case QEMU_UMEM_REQ_ON_DEMAND:
> > + case QEMU_UMEM_REQ_BACKGROUND:
> > + case QEMU_UMEM_REQ_REMOVE:
> > + postcopy_incoming_send_req_idstr(f, req->idstr);
> > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> > + break;
> > + case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> > + case QEMU_UMEM_REQ_BACKGROUND_CONT:
> > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> > + break;
> > + default:
> > + abort();
> > + break;
> > + }
> > +}
> > +
> > +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024.
> > + * So one message size must be <= IO_BUF_SIZE
> > + * cmd: 1
> > + * id len: 1
> > + * id: 256
> > + * nr: 2
> > + */
> > +#define MAX_PAGE_NR ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t))
> > +static void postcopy_incoming_send_req(QEMUFile *f,
> > + const struct qemu_umem_req *req)
> > +{
> > + uint32_t nr = req->nr;
> > + struct qemu_umem_req tmp = *req;
> > +
> > + switch (req->cmd) {
> > + case QEMU_UMEM_REQ_INIT:
> > + case QEMU_UMEM_REQ_EOC:
> > + postcopy_incoming_send_req_one(f, &tmp);
> > + break;
> > + case QEMU_UMEM_REQ_ON_DEMAND:
> > + case QEMU_UMEM_REQ_BACKGROUND:
> > + tmp.nr = MIN(nr, MAX_PAGE_NR);
> > + postcopy_incoming_send_req_one(f, &tmp);
> > +
> > + nr -= tmp.nr;
> > + tmp.pgoffs += tmp.nr;
> > + if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) {
> > + tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> > + }else {
> > + tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT;
> > + }
> > + /* fall through */
> > + case QEMU_UMEM_REQ_REMOVE:
> > + case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> > + case QEMU_UMEM_REQ_BACKGROUND_CONT:
> > + while (nr > 0) {
> > + tmp.nr = MIN(nr, MAX_PAGE_NR);
> > + postcopy_incoming_send_req_one(f, &tmp);
> > +
> > + nr -= tmp.nr;
> > + tmp.pgoffs += tmp.nr;
> > + }
> > + break;
> > + default:
> > + abort();
> > + break;
> > + }
> > +}
> > +
> > +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f,
> > + struct qemu_umem_req *req,
> > + size_t *offset)
> > +{
> > + int ret;
> > +
> > + req->len = qemu_peek_byte(f, *offset);
> > + *offset += 1;
> > + if (req->len == 0) {
> > + return -EAGAIN;
> > + }
> > + req->idstr = g_malloc((int)req->len + 1);
> > + ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset);
> > + *offset += ret;
> > + if (ret != req->len) {
> > + g_free(req->idstr);
> > + req->idstr = NULL;
> > + return -EAGAIN;
> > + }
> > + req->idstr[req->len] = 0;
> > + return 0;
> > +}
> > +
> > +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f,
> > + struct qemu_umem_req *req,
> > + size_t *offset)
> > +{
> > + int ret;
> > + uint32_t be32;
> > + uint32_t i;
> > +
> > + ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset);
> > + *offset += sizeof(be32);
> > + if (ret != sizeof(be32)) {
> > + return -EAGAIN;
> > + }
> > +
> > + req->nr = be32_to_cpu(be32);
> > + req->pgoffs = g_new(uint64_t, req->nr);
> > + for (i = 0; i < req->nr; i++) {
> > + uint64_t be64;
> > + ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset);
> > + *offset += sizeof(be64);
> > + if (ret != sizeof(be64)) {
> > + g_free(req->pgoffs);
> > + req->pgoffs = NULL;
> > + return -EAGAIN;
> > + }
> > + req->pgoffs[i] = be64_to_cpu(be64);
> > + }
> > + return 0;
> > +}
> > +
> > +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req
> > *req)
> > +{
> > + int size;
> > + int ret;
> > + size_t offset = 0;
> > +
> > + size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset);
> > + if (size <= 0) {
> > + return -EAGAIN;
> > + }
> > + offset += 1;
> > +
> > + switch (req->cmd) {
> > + case QEMU_UMEM_REQ_INIT:
> > + case QEMU_UMEM_REQ_EOC:
> > + /* nothing */
> > + break;
> > + case QEMU_UMEM_REQ_ON_DEMAND:
> > + case QEMU_UMEM_REQ_BACKGROUND:
> > + case QEMU_UMEM_REQ_REMOVE:
> > + ret = postcopy_outgoing_recv_req_idstr(f, req, &offset);
> > + if (ret < 0) {
> > + return ret;
> > + }
> > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> > + if (ret < 0) {
> > + return ret;
> > + }
> > + break;
> > + case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> > + case QEMU_UMEM_REQ_BACKGROUND_CONT:
> > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> > + if (ret < 0) {
> > + return ret;
> > + }
> > + break;
> > + default:
> > + abort();
> > + break;
> > + }
> > + qemu_file_skip(f, offset);
> > + DPRINTF("cmd %d\n", req->cmd);
> > + return 0;
> > +}
> > +
> > +static void postcopy_outgoing_free_req(struct qemu_umem_req *req)
> > +{
> > + g_free(req->idstr);
> > + g_free(req->pgoffs);
> > +}
> > +
> > +/***************************************************************************
> > + * outgoing part
> > + */
> > +
> > +#define QEMU_SAVE_LIVE_STAGE_START 0x01 /* = QEMU_VM_SECTION_START
> > */
> > +#define QEMU_SAVE_LIVE_STAGE_PART 0x02 /* = QEMU_VM_SECTION_PART
> > */
> > +#define QEMU_SAVE_LIVE_STAGE_END 0x03 /* = QEMU_VM_SECTION_END */
> > +
> > +enum POState {
> > + PO_STATE_ERROR_RECEIVE,
> > + PO_STATE_ACTIVE,
> > + PO_STATE_EOC_RECEIVED,
> > + PO_STATE_ALL_PAGES_SENT,
> > + PO_STATE_COMPLETED,
> > +};
> > +typedef enum POState POState;
> > +
> > +struct PostcopyOutgoingState {
> > + POState state;
> > + QEMUFile *mig_read;
> > + int fd_read;
> > + RAMBlock *last_block_read;
> > +
> > + QEMUFile *mig_buffered_write;
> > + MigrationState *ms;
> > +
> > + /* For nobg mode. Check if all pages are sent */
> > + RAMBlock *block;
> > + ram_addr_t addr;
> > +};
> > +typedef struct PostcopyOutgoingState PostcopyOutgoingState;
> > +
> > +int postcopy_outgoing_create_read_socket(MigrationState *s)
> > +{
> > + if (!s->params.postcopy) {
> > + return 0;
> > + }
> > +
> > + s->fd_read = dup(s->fd);
> > + if (s->fd_read == -1) {
> > + int ret = -errno;
> > + perror("dup");
> > + return ret;
> > + }
> > + s->file_read = qemu_fopen_socket(s->fd_read);
> > + if (s->file_read == NULL) {
> > + return -EINVAL;
> > + }
> > + return 0;
> > +}
> > +
> > +int postcopy_outgoing_ram_save_live(Monitor *mon,
> > + QEMUFile *f, int stage, void *opaque)
> > +{
> > + int ret = 0;
> > + DPRINTF("stage %d\n", stage);
> > + if (stage == QEMU_SAVE_LIVE_STAGE_START) {
> > + sort_ram_list();
> > + ram_save_live_mem_size(f);
> > + }
> > + if (stage == QEMU_SAVE_LIVE_STAGE_PART) {
> > + ret = 1;
> > + }
> > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> > + return ret;
> > +}
> > +
> > +static RAMBlock *postcopy_outgoing_find_block(const char *idstr)
> > +{
> > + RAMBlock *block;
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + if (!strncmp(idstr, block->idstr, strlen(idstr))) {
> > + return block;
> > + }
> > + }
> > + return NULL;
> > +}
> > +
> > +/*
> > + * return value
> > + * 0: continue postcopy mode
> > + * > 0: completed postcopy mode.
> > + * < 0: error
> > + */
> > +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s,
> > + const struct qemu_umem_req *req,
> > + bool *written)
> > +{
> > + int i;
> > + RAMBlock *block;
> > +
> > + DPRINTF("cmd %d state %d\n", req->cmd, s->state);
> > + switch(req->cmd) {
> > + case QEMU_UMEM_REQ_INIT:
> > + /* nothing */
> > + break;
> > + case QEMU_UMEM_REQ_EOC:
> > + /* tell to finish migration. */
> > + if (s->state == PO_STATE_ALL_PAGES_SENT) {
> > + s->state = PO_STATE_COMPLETED;
> > + DPRINTF("-> PO_STATE_COMPLETED\n");
> > + } else {
> > + s->state = PO_STATE_EOC_RECEIVED;
> > + DPRINTF("-> PO_STATE_EOC_RECEIVED\n");
> > + }
> > + return 1;
> > + case QEMU_UMEM_REQ_ON_DEMAND:
> > + case QEMU_UMEM_REQ_BACKGROUND:
> > + DPRINTF("idstr: %s\n", req->idstr);
> > + block = postcopy_outgoing_find_block(req->idstr);
> > + if (block == NULL) {
> > + return -EINVAL;
> > + }
> > + s->last_block_read = block;
> > + /* fall through */
> > + case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> > + case QEMU_UMEM_REQ_BACKGROUND_CONT:
> > + DPRINTF("nr %d\n", req->nr);
> > + for (i = 0; i < req->nr; i++) {
> > + DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]);
> > + int ret = ram_save_page(s->mig_buffered_write,
> > s->last_block_read,
> > + req->pgoffs[i] << TARGET_PAGE_BITS);
> > + if (ret > 0) {
> > + *written = true;
> > + }
> > + }
> > + break;
> > + case QEMU_UMEM_REQ_REMOVE:
> > + block = postcopy_outgoing_find_block(req->idstr);
> > + if (block == NULL) {
> > + return -EINVAL;
> > + }
> > + for (i = 0; i < req->nr; i++) {
> > + ram_addr_t addr = block->offset +
> > + (req->pgoffs[i] << TARGET_PAGE_BITS);
> > + cpu_physical_memory_reset_dirty(addr,
> > + addr + TARGET_PAGE_SIZE,
> > + MIGRATION_DIRTY_FLAG);
> > + }
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > + return 0;
> > +}
> > +
> > +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s)
> > +{
> > + if (s->mig_read != NULL) {
> > + qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL);
> > + qemu_fclose(s->mig_read);
> > + s->mig_read = NULL;
> > + fd_close(&s->fd_read);
> > +
> > + s->ms->file_read = NULL;
> > + s->ms->fd_read = -1;
> > + }
> > +}
> > +
> > +static void postcopy_outgoing_completed(PostcopyOutgoingState *s)
> > +{
> > + postcopy_outgoing_close_mig_read(s);
> > + s->ms->postcopy = NULL;
> > + g_free(s);
> > +}
> > +
> > +static void postcopy_outgoing_recv_handler(void *opaque)
> > +{
> > + PostcopyOutgoingState *s = opaque;
> > + bool written = false;
> > + int ret = 0;
> > +
> > + assert(s->state == PO_STATE_ACTIVE ||
> > + s->state == PO_STATE_ALL_PAGES_SENT);
> > +
> > + do {
> > + struct qemu_umem_req req = {.idstr = NULL,
> > + .pgoffs = NULL};
> > +
> > + ret = postcopy_outgoing_recv_req(s->mig_read, &req);
> > + if (ret < 0) {
> > + if (ret == -EAGAIN) {
> > + ret = 0;
> > + }
> > + break;
> > + }
> > + if (s->state == PO_STATE_ACTIVE) {
> > + ret = postcopy_outgoing_handle_req(s, &req, &written);
> > + }
> > + postcopy_outgoing_free_req(&req);
> > + } while (ret == 0);
> > +
> > + /*
> > + * flush buffered_file.
> > + * Although mig_write is rate-limited buffered file, those written
> > pages
> > + * are requested on demand by the destination. So forcibly push
> > + * those pages ignoring rate limiting
> > + */
> > + if (written) {
> > + qemu_fflush(s->mig_buffered_write);
> > + /* qemu_buffered_file_drain(s->mig_buffered_write); */
> > + }
> > +
> > + if (ret < 0) {
> > + switch (s->state) {
> > + case PO_STATE_ACTIVE:
> > + s->state = PO_STATE_ERROR_RECEIVE;
> > + DPRINTF("-> PO_STATE_ERROR_RECEIVE\n");
> > + break;
> > + case PO_STATE_ALL_PAGES_SENT:
> > + s->state = PO_STATE_COMPLETED;
> > + DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n");
> > + break;
> > + default:
> > + abort();
> > + }
> > + }
> > + if (s->state == PO_STATE_ERROR_RECEIVE || s->state ==
> > PO_STATE_COMPLETED) {
> > + postcopy_outgoing_close_mig_read(s);
> > + }
> > + if (s->state == PO_STATE_COMPLETED) {
> > + DPRINTF("PO_STATE_COMPLETED\n");
> > + MigrationState *ms = s->ms;
> > + postcopy_outgoing_completed(s);
> > + migrate_fd_completed(ms);
> > + }
> > +}
> > +
> > +void *postcopy_outgoing_begin(MigrationState *ms)
> > +{
> > + PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1);
> > + DPRINTF("outgoing begin\n");
> > + qemu_fflush(ms->file);
> > +
> > + s->ms = ms;
> > + s->state = PO_STATE_ACTIVE;
> > + s->fd_read = ms->fd_read;
> > + s->mig_read = ms->file_read;
> > + s->mig_buffered_write = ms->file;
> > + s->block = NULL;
> > + s->addr = 0;
> > +
> > + /* Make sure all dirty bits are set */
> > + ram_save_memory_set_dirty();
> > +
> > + qemu_set_fd_handler(s->fd_read,
> > + &postcopy_outgoing_recv_handler, NULL, s);
> > + return s;
> > +}
> > +
> > +static void postcopy_outgoing_ram_all_sent(QEMUFile *f,
> > + PostcopyOutgoingState *s)
> > +{
> > + assert(s->state == PO_STATE_ACTIVE);
> > +
> > + s->state = PO_STATE_ALL_PAGES_SENT;
> > + /* tell incoming side that all pages are sent */
> > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> > + qemu_fflush(f);
> > + qemu_buffered_file_drain(f);
> > + DPRINTF("sent RAM_SAVE_FLAG_EOS\n");
> > + migrate_fd_cleanup(s->ms);
> > +
> > + /* Later migrate_fd_complete() will be called which calls
> > + * migrate_fd_cleanup() again. So dummy file is created
> > + * for qemu monitor to keep working.
> > + */
> > + s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL,
> > + NULL, NULL);
> > +}
> > +
> > +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s,
> > + RAMBlock *block,
> > + ram_addr_t addr)
> > +{
> > + if (block == NULL) {
> > + block = QLIST_FIRST(&ram_list.blocks);
> > + addr = block->offset;
> > + }
> > +
> > + for (; block != NULL;
> > + s->block = QLIST_NEXT(s->block, next), addr = block->offset) {
> > + for (; addr < block->offset + block->length;
> > + addr += TARGET_PAGE_SIZE) {
> > + if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG))
> > {
> > + s->block = block;
> > + s->addr = addr;
> > + return 0;
> > + }
> > + }
> > + }
> > +
> > + return 1;
> > +}
> > +
> > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> > + void *postcopy)
> > +{
> > + PostcopyOutgoingState *s = postcopy;
> > +
> > + assert(s->state == PO_STATE_ACTIVE ||
> > + s->state == PO_STATE_EOC_RECEIVED ||
> > + s->state == PO_STATE_ERROR_RECEIVE);
> > +
> > + switch (s->state) {
> > + case PO_STATE_ACTIVE:
> > + /* nothing. processed below */
> > + break;
> > + case PO_STATE_EOC_RECEIVED:
> > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> > + s->state = PO_STATE_COMPLETED;
> > + postcopy_outgoing_completed(s);
> > + DPRINTF("PO_STATE_COMPLETED\n");
> > + return 1;
> > + case PO_STATE_ERROR_RECEIVE:
> > + postcopy_outgoing_completed(s);
> > + DPRINTF("PO_STATE_ERROR_RECEIVE\n");
> > + return -1;
> > + default:
> > + abort();
> > + }
> > +
> > + if (s->ms->params.nobg) {
> > + /* See if all pages are sent. */
> > + if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) ==
> > 0) {
> > + return 0;
> > + }
> > + /* ram_list can be reordered. (it doesn't seem so during migration,
> > + though) So the whole list needs to be checked again */
> > + if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) {
> > + return 0;
> > + }
> > +
> > + postcopy_outgoing_ram_all_sent(f, s);
> > + return 0;
> > + }
> > +
> > + DPRINTF("outgoing background state: %d\n", s->state);
> > +
> > + while (qemu_file_rate_limit(f) == 0) {
> > + if (ram_save_block(f) == 0) { /* no more blocks */
> > + assert(s->state == PO_STATE_ACTIVE);
> > + postcopy_outgoing_ram_all_sent(f, s);
> > + return 0;
> > + }
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +/***************************************************************************
> > + * incoming part
> > + */
> > +
> > +/* flags for incoming mode to modify the behavior.
> > + This is for benchmark/debug purpose */
> > +#define INCOMING_FLAGS_FAULT_REQUEST 0x01
> > +
> > +
> > +static void postcopy_incoming_umemd(void);
> > +
> > +#define PIS_STATE_QUIT_RECEIVED 0x01
> > +#define PIS_STATE_QUIT_QUEUED 0x02
> > +#define PIS_STATE_QUIT_SENT 0x04
> > +
> > +#define PIS_STATE_QUIT_MASK (PIS_STATE_QUIT_RECEIVED | \
> > + PIS_STATE_QUIT_QUEUED | \
> > + PIS_STATE_QUIT_SENT)
> > +
> > +struct PostcopyIncomingState {
> > + /* dest qemu state */
> > + uint32_t state;
> > +
> > + UMemDev *dev;
> > + int host_page_size;
> > + int host_page_shift;
> > +
> > + /* qemu side */
> > + int to_umemd_fd;
> > + QEMUFileNonblock *to_umemd;
> > +#define MAX_FAULTED_PAGES 256
> > + struct umem_pages *faulted_pages;
> > +
> > + int from_umemd_fd;
> > + QEMUFile *from_umemd;
> > + int version_id; /* save/load format version id */
> > +};
> > +typedef struct PostcopyIncomingState PostcopyIncomingState;
> > +
> > +
> > +#define UMEM_STATE_EOS_RECEIVED 0x01 /* umem daemon <-> src
> > qemu */
> > +#define UMEM_STATE_EOC_SENT 0x02 /* umem daemon <-> src
> > qemu */
> > +#define UMEM_STATE_QUIT_RECEIVED 0x04 /* umem daemon <-> dst
> > qemu */
> > +#define UMEM_STATE_QUIT_QUEUED 0x08 /* umem daemon <-> dst
> > qemu */
> > +#define UMEM_STATE_QUIT_SENT 0x10 /* umem daemon <-> dst
> > qemu */
> > +
> > +#define UMEM_STATE_QUIT_MASK (UMEM_STATE_QUIT_QUEUED | \
> > + UMEM_STATE_QUIT_SENT | \
> > + UMEM_STATE_QUIT_RECEIVED)
> > +#define UMEM_STATE_END_MASK (UMEM_STATE_EOS_RECEIVED | \
> > + UMEM_STATE_EOC_SENT | \
> > + UMEM_STATE_QUIT_MASK)
> > +
> > +struct PostcopyIncomingUMemDaemon {
> > + /* umem daemon side */
> > + uint32_t state;
> > +
> > + int host_page_size;
> > + int host_page_shift;
> > + int nr_host_pages_per_target_page;
> > + int host_to_target_page_shift;
> > + int nr_target_pages_per_host_page;
> > + int target_to_host_page_shift;
> > + int version_id; /* save/load format version id */
> > +
> > + int to_qemu_fd;
> > + QEMUFileNonblock *to_qemu;
> > + int from_qemu_fd;
> > + QEMUFile *from_qemu;
> > +
> > + int mig_read_fd;
> > + QEMUFile *mig_read; /* qemu on source -> umem daemon */
> > +
> > + int mig_write_fd;
> > + QEMUFileNonblock *mig_write; /* umem daemon -> qemu on source */
> > +
> > + /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */
> > +#define MAX_REQUESTS (512 * (64 + 1))
> > +
> > + struct umem_page_request page_request;
> > + struct umem_page_cached page_cached;
> > +
> > +#define MAX_PRESENT_REQUESTS MAX_FAULTED_PAGES
> > + struct umem_pages *present_request;
> > +
> > + uint64_t *target_pgoffs;
> > +
> > + /* bitmap indexed by target page offset */
> > + unsigned long *phys_requested;
> > +
> > + /* bitmap indexed by target page offset */
> > + unsigned long *phys_received;
> > +
> > + RAMBlock *last_block_read; /* qemu on source -> umem daemon */
> > + RAMBlock *last_block_write; /* umem daemon -> qemu on source */
> > +};
> > +typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon;
> > +
> > +static PostcopyIncomingState state = {
> > + .state = 0,
> > + .dev = NULL,
> > + .to_umemd_fd = -1,
> > + .to_umemd = NULL,
> > + .from_umemd_fd = -1,
> > + .from_umemd = NULL,
> > +};
> > +
> > +static PostcopyIncomingUMemDaemon umemd = {
> > + .state = 0,
> > + .to_qemu_fd = -1,
> > + .to_qemu = NULL,
> > + .from_qemu_fd = -1,
> > + .from_qemu = NULL,
> > + .mig_read_fd = -1,
> > + .mig_read = NULL,
> > + .mig_write_fd = -1,
> > + .mig_write = NULL,
> > +};
> > +
> > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> > +{
> > + /* incoming_postcopy makes sense only when incoming migration mode */
> > + if (!incoming && incoming_postcopy) {
> > + return -EINVAL;
> > + }
> > +
> > + if (!incoming_postcopy) {
> > + return 0;
> > + }
> > +
> > + state.state = 0;
> > + state.dev = umem_dev_new();
> > + state.host_page_size = getpagesize();
> > + state.host_page_shift = ffs(state.host_page_size) - 1;
> > + state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
> > + ram_save_live() */
> > + return 0;
> > +}
> > +
> > +void postcopy_incoming_ram_alloc(const char *name,
> > + size_t size, uint8_t **hostp, UMem
> > **umemp)
> > +{
> > + UMem *umem;
> > + size = ALIGN_UP(size, state.host_page_size);
> > + umem = umem_dev_create(state.dev, size, name);
> > +
> > + *umemp = umem;
> > + *hostp = umem->umem;
> > +}
> > +
> > +void postcopy_incoming_ram_free(UMem *umem)
> > +{
> > + umem_unmap(umem);
> > + umem_close(umem);
> > + umem_destroy(umem);
> > +}
> > +
> > +void postcopy_incoming_prepare(void)
> > +{
> > + RAMBlock *block;
> > +
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + if (block->umem != NULL) {
> > + umem_mmap(block->umem);
> > + }
> > + }
> > +}
> > +
> > +static int postcopy_incoming_ram_load_get64(QEMUFile *f,
> > + ram_addr_t *addr, int *flags)
> > +{
> > + *addr = qemu_get_be64(f);
> > + *flags = *addr & ~TARGET_PAGE_MASK;
> > + *addr &= TARGET_PAGE_MASK;
> > + return qemu_file_get_error(f);
> > +}
> > +
> > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> > +{
> > + ram_addr_t addr;
> > + int flags;
> > + int error;
> > +
> > + DPRINTF("incoming ram load\n");
> > + /*
> > + * RAM_SAVE_FLAGS_EOS or
> > + * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
> > + * see postcopy_outgoing_ram_save_live()
> > + */
> > +
> > + if (version_id != RAM_SAVE_VERSION_ID) {
> > + DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
> > + version_id, RAM_SAVE_VERSION_ID);
> > + return -EINVAL;
> > + }
> > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> > + DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
> > + if (error) {
> > + DPRINTF("error %d\n", error);
> > + return error;
> > + }
> > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> > + DPRINTF("EOS\n");
> > + return 0;
> > + }
> > +
> > + if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
> > + DPRINTF("-EINVAL flags 0x%x\n", flags);
> > + return -EINVAL;
> > + }
> > + error = ram_load_mem_size(f, addr);
> > + if (error) {
> > + DPRINTF("addr 0x%lx error %d\n", addr, error);
> > + return error;
> > + }
> > +
> > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> > + if (error) {
> > + DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
> > + return error;
> > + }
> > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> > + DPRINTF("done\n");
> > + return 0;
> > + }
> > + DPRINTF("-EINVAL\n");
> > + return -EINVAL;
> > +}
> > +
> > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> > +{
> > + int fds[2];
> > + RAMBlock *block;
> > +
> > + DPRINTF("fork\n");
> > +
> > + /* socketpair(AF_UNIX)? */
> > +
> > + if (qemu_pipe(fds) == -1) {
> > + perror("qemu_pipe");
> > + abort();
> > + }
> > + state.from_umemd_fd = fds[0];
> > + umemd.to_qemu_fd = fds[1];
> > +
> > + if (qemu_pipe(fds) == -1) {
> > + perror("qemu_pipe");
> > + abort();
> > + }
> > + umemd.from_qemu_fd = fds[0];
> > + state.to_umemd_fd = fds[1];
> > +
> > + pid_t child = fork();
> > + if (child < 0) {
> > + perror("fork");
> > + abort();
> > + }
> > +
> > + if (child == 0) {
> > + int mig_write_fd;
> > +
> > + fd_close(&state.to_umemd_fd);
> > + fd_close(&state.from_umemd_fd);
> > + umemd.host_page_size = state.host_page_size;
> > + umemd.host_page_shift = state.host_page_shift;
> > +
> > + umemd.nr_host_pages_per_target_page =
> > + TARGET_PAGE_SIZE / umemd.host_page_size;
> > + umemd.nr_target_pages_per_host_page =
> > + umemd.host_page_size / TARGET_PAGE_SIZE;
> > +
> > + umemd.target_to_host_page_shift =
> > + ffs(umemd.nr_host_pages_per_target_page) - 1;
> > + umemd.host_to_target_page_shift =
> > + ffs(umemd.nr_target_pages_per_host_page) - 1;
> > +
> > + umemd.state = 0;
> > + umemd.version_id = state.version_id;
> > + umemd.mig_read_fd = mig_read_fd;
> > + umemd.mig_read = mig_read;
> > +
> > + mig_write_fd = dup(mig_read_fd);
> > + if (mig_write_fd < 0) {
> > + perror("could not dup for writable socket \n");
> > + abort();
> > + }
> > + umemd.mig_write_fd = mig_write_fd;
> > + umemd.mig_write = qemu_fopen_nonblock(mig_write_fd);
> > +
> > + postcopy_incoming_umemd(); /* noreturn */
> > + }
> > +
> > + DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child);
> > + fd_close(&umemd.to_qemu_fd);
> > + fd_close(&umemd.from_qemu_fd);
> > + state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES));
> > + state.faulted_pages->nr = 0;
> > +
> > + /* close all UMem.shmem_fd */
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + umem_close_shmem(block->umem);
> > + }
> > + umem_qemu_wait_for_daemon(state.from_umemd_fd);
> > +}
> > +
> > +static void postcopy_incoming_qemu_recv_quit(void)
> > +{
> > + RAMBlock *block;
> > + if (state.state & PIS_STATE_QUIT_RECEIVED) {
> > + return;
> > + }
> > +
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + if (block->umem != NULL) {
> > + umem_destroy(block->umem);
> > + block->umem = NULL;
> > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> > + }
> > + }
> > +
> > + DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
> > + state.state |= PIS_STATE_QUIT_RECEIVED;
> > + qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
> > + qemu_fclose(state.from_umemd);
> > + state.from_umemd = NULL;
> > + fd_close(&state.from_umemd_fd);
> > +}
> > +
> > +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque)
> > +{
> > + assert(state.to_umemd != NULL);
> > +
> > + nonblock_fflush(state.to_umemd);
> > + if (nonblock_pending_size(state.to_umemd) > 0) {
> > + return;
> > + }
> > +
> > + qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL);
> > + if (state.state & PIS_STATE_QUIT_QUEUED) {
> > + DPRINTF("|= PIS_STATE_QUIT_SENT\n");
> > + state.state |= PIS_STATE_QUIT_SENT;
> > + qemu_fclose(state.to_umemd->file);
> > + state.to_umemd = NULL;
> > + fd_close(&state.to_umemd_fd);
> > + g_free(state.faulted_pages);
> > + state.faulted_pages = NULL;
> > + }
> > +}
> > +
> > +static void postcopy_incoming_qemu_fflush_to_umemd(void)
> > +{
> > + qemu_set_fd_handler(state.to_umemd->fd, NULL,
> > + postcopy_incoming_qemu_fflush_to_umemd_handler,
> > NULL);
> > + postcopy_incoming_qemu_fflush_to_umemd_handler(NULL);
> > +}
> > +
> > +static void postcopy_incoming_qemu_queue_quit(void)
> > +{
> > + if (state.state & PIS_STATE_QUIT_QUEUED) {
> > + return;
> > + }
> > +
> > + DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
> > + umem_qemu_quit(state.to_umemd->file);
> > + state.state |= PIS_STATE_QUIT_QUEUED;
> > +}
> > +
> > +static void postcopy_incoming_qemu_send_pages_present(void)
> > +{
> > + if (state.faulted_pages->nr > 0) {
> > + umem_qemu_send_pages_present(state.to_umemd->file,
> > + state.faulted_pages);
> > + state.faulted_pages->nr = 0;
> > + }
> > +}
> > +
> > +static void postcopy_incoming_qemu_faulted_pages(
> > + const struct umem_pages *pages)
> > +{
> > + assert(pages->nr <= MAX_FAULTED_PAGES);
> > + assert(state.faulted_pages != NULL);
> > +
> > + if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) {
> > + postcopy_incoming_qemu_send_pages_present();
> > + }
> > + memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr],
> > + &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr);
> > + state.faulted_pages->nr += pages->nr;
> > +}
> > +
> > +static void postcopy_incoming_qemu_cleanup_umem(void);
> > +
> > +static int postcopy_incoming_qemu_handle_req_one(void)
> > +{
> > + int offset = 0;
> > + int ret;
> > + uint8_t cmd;
> > +
> > + ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset);
> > + offset += sizeof(cmd);
> > + if (ret != sizeof(cmd)) {
> > + return -EAGAIN;
> > + }
> > + DPRINTF("cmd %c\n", cmd);
> > +
> > + switch (cmd) {
> > + case UMEM_DAEMON_QUIT:
> > + postcopy_incoming_qemu_recv_quit();
> > + postcopy_incoming_qemu_queue_quit();
> > + postcopy_incoming_qemu_cleanup_umem();
> > + break;
> > + case UMEM_DAEMON_TRIGGER_PAGE_FAULT: {
> > + struct umem_pages *pages =
> > + umem_qemu_trigger_page_fault(state.from_umemd, &offset);
> > + if (pages == NULL) {
> > + return -EAGAIN;
> > + }
> > + if (state.to_umemd_fd >= 0 && !(state.state &
> > PIS_STATE_QUIT_QUEUED)) {
> > + postcopy_incoming_qemu_faulted_pages(pages);
> > + g_free(pages);
> > + }
> > + break;
> > + }
> > + case UMEM_DAEMON_ERROR:
> > + /* umem daemon hit troubles, so it warned us to stop vm execution
> > */
> > + vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
> > + break;
> > + default:
> > + abort();
> > + break;
> > + }
> > +
> > + if (state.from_umemd != NULL) {
> > + qemu_file_skip(state.from_umemd, offset);
> > + }
> > + return 0;
> > +}
> > +
> > +static void postcopy_incoming_qemu_handle_req(void *opaque)
> > +{
> > + do {
> > + int ret = postcopy_incoming_qemu_handle_req_one();
> > + if (ret == -EAGAIN) {
> > + break;
> > + }
> > + } while (state.from_umemd != NULL &&
> > + qemu_pending_size(state.from_umemd) > 0);
> > +
> > + if (state.to_umemd != NULL) {
> > + if (state.faulted_pages->nr > 0) {
> > + postcopy_incoming_qemu_send_pages_present();
> > + }
> > + postcopy_incoming_qemu_fflush_to_umemd();
> > + }
> > +}
> > +
> > +void postcopy_incoming_qemu_ready(void)
> > +{
> > + umem_qemu_ready(state.to_umemd_fd);
> > +
> > + state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd);
> > + state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd);
> > + qemu_set_fd_handler(state.from_umemd_fd,
> > + postcopy_incoming_qemu_handle_req, NULL, NULL);
> > +}
> > +
> > +static void postcopy_incoming_qemu_cleanup_umem(void)
> > +{
> > + /* when qemu will quit before completing postcopy, tell umem daemon
> > + to tear down umem device and exit. */
> > + if (state.to_umemd_fd >= 0) {
> > + postcopy_incoming_qemu_queue_quit();
> > + postcopy_incoming_qemu_fflush_to_umemd();
> > + }
> > +
> > + if (state.dev) {
> > + umem_dev_destroy(state.dev);
> > + state.dev = NULL;
> > + }
> > +}
> > +
> > +void postcopy_incoming_qemu_cleanup(void)
> > +{
> > + postcopy_incoming_qemu_cleanup_umem();
> > + if (state.to_umemd != NULL) {
> > + nonblock_wait_for_flush(state.to_umemd);
> > + }
> > +}
> > +
> > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t
> > size)
> > +{
> > + uint64_t nr = DIV_ROUND_UP(size, state.host_page_size);
> > + size_t len = umem_pages_size(nr);
> > + ram_addr_t end = addr + size;
> > + struct umem_pages *pages;
> > + int i;
> > +
> > + if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) {
> > + return;
> > + }
> > + pages = g_malloc(len);
> > + pages->nr = nr;
> > + for (i = 0; addr < end; addr += state.host_page_size, i++) {
> > + pages->pgoffs[i] = addr >> state.host_page_shift;
> > + }
> > + umem_qemu_send_pages_unmapped(state.to_umemd->file, pages);
> > + g_free(pages);
> > + assert(state.to_umemd != NULL);
> > + postcopy_incoming_qemu_fflush_to_umemd();
> > +}
> > +
> > +/**************************************************************************
> > + * incoming umem daemon
> > + */
> > +
> > +static void postcopy_incoming_umem_recv_quit(void)
> > +{
> > + if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
> > + return;
> > + }
> > + DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
> > + umemd.state |= UMEM_STATE_QUIT_RECEIVED;
> > + qemu_fclose(umemd.from_qemu);
> > + umemd.from_qemu = NULL;
> > + fd_close(&umemd.from_qemu_fd);
> > +}
> > +
> > +static void postcopy_incoming_umem_queue_quit(void)
> > +{
> > + if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
> > + return;
> > + }
> > + DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
> > + umem_daemon_quit(umemd.to_qemu->file);
> > + umemd.state |= UMEM_STATE_QUIT_QUEUED;
> > +}
> > +
> > +static void postcopy_incoming_umem_send_eoc_req(void)
> > +{
> > + struct qemu_umem_req req;
> > +
> > + if (umemd.state & UMEM_STATE_EOC_SENT) {
> > + return;
> > + }
> > +
> > + DPRINTF("|= UMEM_STATE_EOC_SENT\n");
> > + req.cmd = QEMU_UMEM_REQ_EOC;
> > + postcopy_incoming_send_req(umemd.mig_write->file, &req);
> > + umemd.state |= UMEM_STATE_EOC_SENT;
> > + qemu_fclose(umemd.mig_write->file);
> > + umemd.mig_write = NULL;
> > + fd_close(&umemd.mig_write_fd);
> > +}
> > +
> > +static void postcopy_incoming_umem_send_page_req(RAMBlock *block)
> > +{
> > + struct qemu_umem_req req;
> > + int bit;
> > + uint64_t target_pgoff;
> > + int i;
> > +
> > + umemd.page_request.nr = MAX_REQUESTS;
> > + umem_get_page_request(block->umem, &umemd.page_request);
> > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
> > + block->idstr, umemd.page_request.nr,
> > + (uint64_t)umemd.page_request.pgoffs[0],
> > + (uint64_t)umemd.page_request.pgoffs[1]);
> > +
> > + if (umemd.last_block_write != block) {
> > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND;
> > + req.idstr = block->idstr;
> > + } else {
> > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> > + }
> > +
> > + req.nr = 0;
> > + req.pgoffs = umemd.target_pgoffs;
> > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> > + for (i = 0; i < umemd.page_request.nr; i++) {
> > + target_pgoff =
> > + umemd.page_request.pgoffs[i] >>
> > umemd.host_to_target_page_shift;
> > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> > +
> > + if (!test_and_set_bit(bit, umemd.phys_requested)) {
> > + req.pgoffs[req.nr] = target_pgoff;
> > + req.nr++;
> > + }
> > + }
> > + } else {
> > + for (i = 0; i < umemd.page_request.nr; i++) {
> > + int j;
> > + target_pgoff =
> > + umemd.page_request.pgoffs[i] <<
> > umemd.host_to_target_page_shift;
> > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> > +
> > + for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
> > + if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
> > + req.pgoffs[req.nr] = target_pgoff + j;
> > + req.nr++;
> > + }
> > + }
> > + }
> > + }
> > +
> > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
> > + block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
> > + if (req.nr > 0 && umemd.mig_write != NULL) {
> > + postcopy_incoming_send_req(umemd.mig_write->file, &req);
> > + umemd.last_block_write = block;
> > + }
> > +}
> > +
> > +static void postcopy_incoming_umem_send_pages_present(void)
> > +{
> > + if (umemd.present_request->nr > 0) {
> > + umem_daemon_send_pages_present(umemd.to_qemu->file,
> > + umemd.present_request);
> > + umemd.present_request->nr = 0;
> > + }
> > +}
> > +
> > +static void postcopy_incoming_umem_pages_present_one(
> > + uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset)
> > +{
> > + uint32_t i;
> > + assert(nr <= MAX_PRESENT_REQUESTS);
> > +
> > + if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) {
> > + postcopy_incoming_umem_send_pages_present();
> > + }
> > +
> > + for (i = 0; i < nr; i++) {
> > + umemd.present_request->pgoffs[umemd.present_request->nr + i] =
> > + pgoffs[i] + ramblock_pgoffset;
> > + }
> > + umemd.present_request->nr += nr;
> > +}
> > +
> > +static void postcopy_incoming_umem_pages_present(
> > + const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset)
> > +{
> > + uint32_t left = page_cached->nr;
> > + uint32_t offset = 0;
> > +
> > + while (left > 0) {
> > + uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS);
> > + postcopy_incoming_umem_pages_present_one(
> > + nr, &page_cached->pgoffs[offset], ramblock_pgoffset);
> > +
> > + left -= nr;
> > + offset += nr;
> > + }
> > +}
> > +
> > +static int postcopy_incoming_umem_ram_load(void)
> > +{
> > + ram_addr_t offset;
> > + int flags;
> > + int error;
> > + void *shmem;
> > + int i;
> > + int bit;
> > +
> > + if (umemd.version_id != RAM_SAVE_VERSION_ID) {
> > + return -EINVAL;
> > + }
> > +
> > + offset = qemu_get_be64(umemd.mig_read);
> > +
> > + flags = offset & ~TARGET_PAGE_MASK;
> > + offset &= TARGET_PAGE_MASK;
> > +
> > + assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));
> > +
> > + if (flags & RAM_SAVE_FLAG_EOS) {
> > + DPRINTF("RAM_SAVE_FLAG_EOS\n");
> > + postcopy_incoming_umem_send_eoc_req();
> > +
> > + qemu_fclose(umemd.mig_read);
> > + umemd.mig_read = NULL;
> > + fd_close(&umemd.mig_read_fd);
> > + umemd.state |= UMEM_STATE_EOS_RECEIVED;
> > +
> > + postcopy_incoming_umem_queue_quit();
> > + DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
> > + return 0;
> > + }
> > +
> > + shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags,
> > + &umemd.last_block_read);
> > + if (!shmem) {
> > + DPRINTF("shmem == NULL\n");
> > + return -EINVAL;
> > + }
> > +
> > + if (flags & RAM_SAVE_FLAG_COMPRESS) {
> > + uint8_t ch = qemu_get_byte(umemd.mig_read);
> > + memset(shmem, ch, TARGET_PAGE_SIZE);
> > + } else if (flags & RAM_SAVE_FLAG_PAGE) {
> > + qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE);
> > + }
> > +
> > + error = qemu_file_get_error(umemd.mig_read);
> > + if (error) {
> > + DPRINTF("error %d\n", error);
> > + return error;
> > + }
> > +
> > + umemd.page_cached.nr = 0;
> > + bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS;
> > + if (!test_and_set_bit(bit, umemd.phys_received)) {
> > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> > + __u64 pgoff = offset >> umemd.host_page_shift;
> > + for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
> > + umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i;
> > + umemd.page_cached.nr++;
> > + }
> > + } else {
> > + bool mark_cache = true;
> > + for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
> > + if (!test_bit(bit + i, umemd.phys_received)) {
> > + mark_cache = false;
> > + break;
> > + }
> > + }
> > + if (mark_cache) {
> > + umemd.page_cached.pgoffs[0] = offset >>
> > umemd.host_page_shift;
> > + umemd.page_cached.nr = 1;
> > + }
> > + }
> > + }
> > +
> > + if (umemd.page_cached.nr > 0) {
> > + umem_mark_page_cached(umemd.last_block_read->umem,
> > &umemd.page_cached);
> > +
> > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd
> > >=0 &&
> > + (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) {
> > + uint64_t ramblock_pgoffset;
> > +
> > + ramblock_pgoffset =
> > + umemd.last_block_read->offset >> umemd.host_page_shift;
> > + postcopy_incoming_umem_pages_present(&umemd.page_cached,
> > + ramblock_pgoffset);
> > + }
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static bool postcopy_incoming_umem_check_umem_done(void)
> > +{
> > + bool all_done = true;
> > + RAMBlock *block;
> > +
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + UMem *umem = block->umem;
> > + if (umem != NULL && umem->nsets == umem->nbits) {
> > + umem_unmap_shmem(umem);
> > + umem_destroy(umem);
> > + block->umem = NULL;
> > + }
> > + if (block->umem != NULL) {
> > + all_done = false;
> > + }
> > + }
> > + return all_done;
> > +}
> > +
> > +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages
> > *pages)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < pages->nr; i++) {
> > + ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift;
> > + RAMBlock *block = qemu_get_ram_block(addr);
> > + addr -= block->offset;
> > + umem_remove_shmem(block->umem, addr, umemd.host_page_size);
> > + }
> > + return postcopy_incoming_umem_check_umem_done();
> > +}
> > +
> > +static bool
> > +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages)
> > +{
> > + RAMBlock *block;
> > + ram_addr_t addr;
> > + int i;
> > +
> > + struct qemu_umem_req req = {
> > + .cmd = QEMU_UMEM_REQ_REMOVE,
> > + .nr = 0,
> > + .pgoffs = (uint64_t*)pages->pgoffs,
> > + };
> > +
> > + addr = pages->pgoffs[0] << umemd.host_page_shift;
> > + block = qemu_get_ram_block(addr);
> > +
> > + for (i = 0; i < pages->nr; i++) {
> > + int pgoff;
> > +
> > + addr = pages->pgoffs[i] << umemd.host_page_shift;
> > + pgoff = addr >> TARGET_PAGE_BITS;
> > + if (!test_bit(pgoff, umemd.phys_received) &&
> > + !test_bit(pgoff, umemd.phys_requested)) {
> > + req.pgoffs[req.nr] = pgoff;
> > + req.nr++;
> > + }
> > + set_bit(pgoff, umemd.phys_received);
> > + set_bit(pgoff, umemd.phys_requested);
> > +
> > + umem_remove_shmem(block->umem,
> > + addr - block->offset, umemd.host_page_size);
> > + }
> > + if (req.nr > 0 && umemd.mig_write != NULL) {
> > + req.idstr = block->idstr;
> > + postcopy_incoming_send_req(umemd.mig_write->file, &req);
> > + }
> > +
> > + return postcopy_incoming_umem_check_umem_done();
> > +}
> > +
> > +static void postcopy_incoming_umem_done(void)
> > +{
> > + postcopy_incoming_umem_send_eoc_req();
> > + postcopy_incoming_umem_queue_quit();
> > +}
> > +
> > +static int postcopy_incoming_umem_handle_qemu(void)
> > +{
> > + int ret;
> > + int offset = 0;
> > + uint8_t cmd;
> > +
> > + ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset);
> > + offset += sizeof(cmd);
> > + if (ret != sizeof(cmd)) {
> > + return -EAGAIN;
> > + }
> > + DPRINTF("cmd %c\n", cmd);
> > + switch (cmd) {
> > + case UMEM_QEMU_QUIT:
> > + postcopy_incoming_umem_recv_quit();
> > + postcopy_incoming_umem_done();
> > + break;
> > + case UMEM_QEMU_PAGE_FAULTED: {
> > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> > + &offset);
> > + if (pages == NULL) {
> > + return -EAGAIN;
> > + }
> > + if (postcopy_incoming_umem_page_faulted(pages)){
> > + postcopy_incoming_umem_done();
> > + }
> > + g_free(pages);
> > + break;
> > + }
> > + case UMEM_QEMU_PAGE_UNMAPPED: {
> > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> > + &offset);
> > + if (pages == NULL) {
> > + return -EAGAIN;
> > + }
> > + if (postcopy_incoming_umem_page_unmapped(pages)){
> > + postcopy_incoming_umem_done();
> > + }
> > + g_free(pages);
> > + break;
> > + }
> > + default:
> > + abort();
> > + break;
> > + }
> > + if (umemd.from_qemu != NULL) {
> > + qemu_file_skip(umemd.from_qemu, offset);
> > + }
> > + return 0;
> > +}
> > +
> > +static void set_fd(int fd, fd_set *fds, int *nfds)
> > +{
> > + FD_SET(fd, fds);
> > + if (fd > *nfds) {
> > + *nfds = fd;
> > + }
> > +}
> > +
> > +static int postcopy_incoming_umemd_main_loop(void)
> > +{
> > + fd_set writefds;
> > + fd_set readfds;
> > + int nfds;
> > + RAMBlock *block;
> > + int ret;
> > +
> > + int pending_size;
> > + bool get_page_request;
> > +
> > + nfds = -1;
> > + FD_ZERO(&writefds);
> > + FD_ZERO(&readfds);
> > +
> > + if (umemd.mig_write != NULL) {
> > + pending_size = nonblock_pending_size(umemd.mig_write);
> > + if (pending_size > 0) {
> > + set_fd(umemd.mig_write_fd, &writefds, &nfds);
> > + }
> > + } else {
> > + pending_size = 0;
> > + }
> > +
> > +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2)
> > + /* If page request to the migration source is accumulated,
> > + suspend getting page fault request. */
> > + get_page_request = (pending_size <= PENDING_SIZE_MAX);
> > +
> > + if (get_page_request) {
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + if (block->umem != NULL) {
> > + set_fd(block->umem->fd, &readfds, &nfds);
> > + }
> > + }
> > + }
> > +
> > + if (umemd.mig_read_fd >= 0) {
> > + set_fd(umemd.mig_read_fd, &readfds, &nfds);
> > + }
> > +
> > + if (umemd.to_qemu != NULL &&
> > + nonblock_pending_size(umemd.to_qemu) > 0) {
> > + set_fd(umemd.to_qemu_fd, &writefds, &nfds);
> > + }
> > + if (umemd.from_qemu_fd >= 0) {
> > + set_fd(umemd.from_qemu_fd, &readfds, &nfds);
> > + }
> > +
> > + ret = select(nfds + 1, &readfds, &writefds, NULL, NULL);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + return 0;
> > + }
> > + return ret;
> > + }
> > +
> > + if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd,
> > &writefds)) {
> > + nonblock_fflush(umemd.mig_write);
> > + }
> > + if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) {
> > + nonblock_fflush(umemd.to_qemu);
> > + }
> > + if (get_page_request) {
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + if (block->umem != NULL && FD_ISSET(block->umem->fd,
> > &readfds)) {
> > + postcopy_incoming_umem_send_page_req(block);
> > + }
> > + }
> > + }
> > + if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) {
> > + do {
> > + ret = postcopy_incoming_umem_ram_load();
> > + if (ret < 0) {
> > + return ret;
> > + }
> > + } while (umemd.mig_read != NULL &&
> > + qemu_pending_size(umemd.mig_read) > 0);
> > + }
> > + if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds))
> > {
> > + do {
> > + ret = postcopy_incoming_umem_handle_qemu();
> > + if (ret == -EAGAIN) {
> > + break;
> > + }
> > + } while (umemd.from_qemu != NULL &&
> > + qemu_pending_size(umemd.from_qemu) > 0);
> > + }
> > +
> > + if (umemd.mig_write != NULL) {
> > + nonblock_fflush(umemd.mig_write);
> > + }
> > + if (umemd.to_qemu != NULL) {
> > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) {
> > + postcopy_incoming_umem_send_pages_present();
> > + }
> > + nonblock_fflush(umemd.to_qemu);
> > + if ((umemd.state & UMEM_STATE_QUIT_QUEUED) &&
> > + nonblock_pending_size(umemd.to_qemu) == 0) {
> > + DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
> > + qemu_fclose(umemd.to_qemu->file);
> > + umemd.to_qemu = NULL;
> > + fd_close(&umemd.to_qemu_fd);
> > + umemd.state |= UMEM_STATE_QUIT_SENT;
> > + }
> > + }
> > +
> > + return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK;
> > +}
> > +
> > +static void postcopy_incoming_umemd(void)
> > +{
> > + ram_addr_t last_ram_offset;
> > + int nbits;
> > + RAMBlock *block;
> > + int ret;
> > +
> > + qemu_daemon(1, 1);
> > + signal(SIGPIPE, SIG_IGN);
> > + DPRINTF("daemon pid: %d\n", getpid());
> > +
> > + umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS);
> > + umemd.page_cached.pgoffs =
> > + g_new(__u64, MAX_REQUESTS *
> > + (TARGET_PAGE_SIZE >= umemd.host_page_size ?
> > + 1: umemd.nr_host_pages_per_target_page));
> > + umemd.target_pgoffs =
> > + g_new(uint64_t, MAX_REQUESTS *
> > + MAX(umemd.nr_host_pages_per_target_page,
> > + umemd.nr_target_pages_per_host_page));
> > + umemd.present_request =
> > g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS));
> > + umemd.present_request->nr = 0;
> > +
> > + last_ram_offset = qemu_last_ram_offset();
> > + nbits = last_ram_offset >> TARGET_PAGE_BITS;
> > + umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> > + umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> > + umemd.last_block_read = NULL;
> > + umemd.last_block_write = NULL;
> > +
> > + QLIST_FOREACH(block, &ram_list.blocks, next) {
> > + UMem *umem = block->umem;
> > + umem->umem = NULL; /* umem mapping area has VM_DONT_COPY flag,
> > + so we lost those mappings by fork */
> > + block->host = umem_map_shmem(umem);
> > + umem_close_shmem(umem);
> > + }
> > + umem_daemon_ready(umemd.to_qemu_fd);
> > + umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd);
> > +
> > + /* wait for qemu to disown migration_fd */
> > + umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
> > + umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd);
> > +
> > + DPRINTF("entering umemd main loop\n");
> > + for (;;) {
> > + ret = postcopy_incoming_umemd_main_loop();
> > + if (ret != 0) {
> > + break;
> > + }
> > + }
> > + DPRINTF("exiting umemd main loop\n");
> > +
> > + /* This daemon forked from qemu and the parent qemu is still running.
> > + * Cleanups of linked libraries like SDL should not be triggered,
> > + * otherwise the parent qemu may use resources which was already freed.
> > + */
> > + fflush(stdout);
> > + fflush(stderr);
> > + _exit(ret < 0? EXIT_FAILURE: 0);
> > +}
> > diff --git a/migration-tcp.c b/migration-tcp.c
> > index cf6a9b8..aa35050 100644
> > --- a/migration-tcp.c
> > +++ b/migration-tcp.c
> > @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque)
> > } while (ret == -1 && (socket_error()) == EINTR);
> >
> > if (ret < 0) {
> > - migrate_fd_error(s);
> > - return;
> > + goto error_out;
> > }
> >
> > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
> >
> > - if (val == 0)
> > + if (val == 0) {
> > + ret = postcopy_outgoing_create_read_socket(s);
> > + if (ret < 0) {
> > + goto error_out;
> > + }
> > migrate_fd_connect(s);
> > - else {
> > + } else {
> > DPRINTF("error connecting %d\n", val);
> > - migrate_fd_error(s);
> > + goto error_out;
> > }
> > + return;
> > +
> > +error_out:
> > + migrate_fd_error(s);
> > }
> >
> > int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
> > @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s,
> > const char *host_port)
> >
> > if (ret < 0) {
> > DPRINTF("connect failed\n");
> > - migrate_fd_error(s);
> > - return ret;
> > + goto error_out;
> > + }
> > +
> > + ret = postcopy_outgoing_create_read_socket(s);
> > + if (ret < 0) {
> > + goto error_out;
> > }
> > migrate_fd_connect(s);
> > return 0;
> > +
> > +error_out:
> > + migrate_fd_error(s);
> > + return ret;
> > }
> >
> > static void tcp_accept_incoming_migration(void *opaque)
> > @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque)
> > }
> >
> > process_incoming_migration(f);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_fork_umemd(c, f);
> > + }
> > qemu_fclose(f);
> > + if (incoming_postcopy) {
> > + /* now socket is disowned.
> > + So tell umem server that it's safe to use it */
> > + postcopy_incoming_qemu_ready();
> > + }
> > out:
> > close(c);
> > out2:
> > diff --git a/migration-unix.c b/migration-unix.c
> > index dfcf203..3707505 100644
> > --- a/migration-unix.c
> > +++ b/migration-unix.c
> > @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque)
> >
> > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
> >
> > - if (val == 0)
> > + if (val == 0) {
> > + ret = postcopy_outgoing_create_read_socket(s);
> > + if (ret < 0) {
> > + goto error_out;
> > + }
> > migrate_fd_connect(s);
> > - else {
> > + } else {
> > DPRINTF("error connecting %d\n", val);
> > - migrate_fd_error(s);
> > + goto error_out;
> > }
> > + return;
> > +
> > +error_out:
> > + migrate_fd_error(s);
> > }
> >
> > int unix_start_outgoing_migration(MigrationState *s, const char *path)
> > @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s,
> > const char *path)
> >
> > if (ret < 0) {
> > DPRINTF("connect failed\n");
> > - migrate_fd_error(s);
> > - return ret;
> > + goto error_out;
> > + }
> > +
> > + ret = postcopy_outgoing_create_read_socket(s);
> > + if (ret < 0) {
> > + goto error_out;
> > }
> > migrate_fd_connect(s);
> > return 0;
> > +
> > +error_out:
> > + migrate_fd_error(s);
> > + return ret;
> > }
> >
> > static void unix_accept_incoming_migration(void *opaque)
> > @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void
> > *opaque)
> > }
> >
> > process_incoming_migration(f);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_fork_umemd(c, f);
> > + }
> > qemu_fclose(f);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_qemu_ready();
> > + }
> > out:
> > close(c);
> > out2:
> > diff --git a/migration.c b/migration.c
> > index 0149ab3..51efe44 100644
> > --- a/migration.c
> > +++ b/migration.c
> > @@ -39,6 +39,11 @@ enum {
> > MIG_STATE_COMPLETED,
> > };
> >
> > +enum {
> > + MIG_SUBSTATE_PRECOPY,
> > + MIG_SUBSTATE_POSTCOPY,
> > +};
> > +
> > #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */
> >
> > static NotifierList migration_state_notifiers =
> > @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void *opaque)
> > return;
> > }
> >
> > + if (s->substate == MIG_SUBSTATE_POSTCOPY) {
> > + /* PRINTF("postcopy background\n"); */
> > + ret = postcopy_outgoing_ram_save_background(s->mon, s->file,
> > + s->postcopy);
> > + if (ret > 0) {
> > + migrate_fd_completed(s);
> > + } else if (ret < 0) {
> > + migrate_fd_error(s);
> > + }
> > + return;
> > + }
> > +
> > DPRINTF("iterate\n");
> > ret = qemu_savevm_state_iterate(s->mon, s->file);
> > if (ret < 0) {
> > @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque)
> > DPRINTF("done iterating\n");
> > vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> >
> > + if (s->params.postcopy) {
> > + if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
> > + migrate_fd_error(s);
> > + if (old_vm_running) {
> > + vm_start();
> > + }
> > + return;
> > + }
> > + s->substate = MIG_SUBSTATE_POSTCOPY;
> > + s->postcopy = postcopy_outgoing_begin(s);
> > + return;
> > + }
> > +
> > if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
> > migrate_fd_error(s);
> > } else {
> > @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s)
> > int ret;
> >
> > s->state = MIG_STATE_ACTIVE;
> > + s->substate = MIG_SUBSTATE_PRECOPY;
> > s->file = qemu_fopen_ops_buffered(s,
> > s->bandwidth_limit,
> > migrate_fd_put_buffer,
> > diff --git a/migration.h b/migration.h
> > index 90ae362..2809e99 100644
> > --- a/migration.h
> > +++ b/migration.h
> > @@ -40,6 +40,12 @@ struct MigrationState
> > int (*write)(MigrationState *s, const void *buff, size_t size);
> > void *opaque;
> > MigrationParams params;
> > +
> > + /* for postcopy */
> > + int substate; /* precopy or postcopy */
> > + int fd_read;
> > + QEMUFile *file_read; /* connection from the detination */
> > + void *postcopy;
> > };
> >
> > void process_incoming_migration(QEMUFile *f);
> > @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void);
> > uint64_t ram_bytes_transferred(void);
> > uint64_t ram_bytes_total(void);
> >
> > +void ram_save_set_params(const MigrationParams *params, void *opaque);
> > void sort_ram_list(void);
> > int ram_save_block(QEMUFile *f);
> > void ram_save_memory_set_dirty(void);
> > @@ -107,7 +114,30 @@ void migrate_add_blocker(Error *reason);
> > */
> > void migrate_del_blocker(Error *reason);
> >
> > +/* For outgoing postcopy */
> > +int postcopy_outgoing_create_read_socket(MigrationState *s);
> > +int postcopy_outgoing_ram_save_live(Monitor *mon,
> > + QEMUFile *f, int stage, void *opaque);
> > +void *postcopy_outgoing_begin(MigrationState *s);
> > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> > + void *postcopy);
> > +
> > +/* For incoming postcopy */
> > extern bool incoming_postcopy;
> > extern unsigned long incoming_postcopy_flags;
> >
> > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy);
> > +void postcopy_incoming_ram_alloc(const char *name,
> > + size_t size, uint8_t **hostp, UMem
> > **umemp);
> > +void postcopy_incoming_ram_free(UMem *umem);
> > +void postcopy_incoming_prepare(void);
> > +
> > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id);
> > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read);
> > +void postcopy_incoming_qemu_ready(void);
> > +void postcopy_incoming_qemu_cleanup(void);
> > +#ifdef NEED_CPU_H
> > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t
> > size);
> > +#endif
> > +
> > #endif
> > diff --git a/qemu-common.h b/qemu-common.h
> > index 725922b..d74a8c9 100644
> > --- a/qemu-common.h
> > +++ b/qemu-common.h
> > @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState;
> >
> > struct Monitor;
> > typedef struct Monitor Monitor;
> > +typedef struct UMem UMem;
> >
> > /* we put basic includes here to avoid repeating them in device drivers */
> > #include <stdlib.h>
> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 5c5b8f3..19e20f9 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG,
> > QEMU_OPTION_postcopy_flags,
> > "-postcopy-flags unsigned-int(flags)\n"
> > " flags for postcopy incoming migration\n"
> > " when -incoming and -postcopy are specified.\n"
> > - " This is for benchmark/debug purpose (default:
> > 0)\n",
> > + " This is for benchmark/debug purpose (default: 0)\n"
> > + " Currently supprted flags are\n"
> > + " 1: enable fault request from umemd to qemu\n"
> > + " (default: disabled)\n",
> > QEMU_ARCH_ALL)
> > STEXI
> > @item -postcopy-flags int
>
> Can you move umem.h and umem.h to a separate patch please ,
> this patch
> > diff --git a/umem.c b/umem.c
> > new file mode 100644
> > index 0000000..b7be006
> > --- /dev/null
> > +++ b/umem.c
> > @@ -0,0 +1,379 @@
> > +/*
> > + * umem.c: user process backed memory module for postcopy livemigration
> > + *
> > + * Copyright (c) 2011
> > + * National Institute of Advanced Industrial Science and Technology
> > + *
> > + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> > + * Author: Isaku Yamahata <yamahata at valinux co jp>
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include <sys/ioctl.h>
> > +#include <sys/mman.h>
> > +
> > +#include <linux/umem.h>
> > +
> > +#include "bitops.h"
> > +#include "sysemu.h"
> > +#include "hw/hw.h"
> > +#include "umem.h"
> > +
> > +//#define DEBUG_UMEM
> > +#ifdef DEBUG_UMEM
> > +#include <sys/syscall.h>
> > +#define DPRINTF(format, ...) \
> > + do { \
> > + printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid), \
> > + __func__, __LINE__, ## __VA_ARGS__); \
> > + } while (0)
> > +#else
> > +#define DPRINTF(format, ...) do { } while (0)
> > +#endif
> > +
> > +#define DEV_UMEM "/dev/umem"
> > +
> > +struct UMemDev {
> > + int fd;
> > + int page_shift;
> > +};
> > +
> > +UMemDev *umem_dev_new(void)
> > +{
> > + UMemDev *umem_dev;
> > + int umem_dev_fd = open(DEV_UMEM, O_RDWR);
> > + if (umem_dev_fd < 0) {
> > + perror("can't open "DEV_UMEM);
> > + abort();
> > + }
> > +
> > + umem_dev = g_new(UMemDev, 1);
> > + umem_dev->fd = umem_dev_fd;
> > + umem_dev->page_shift = ffs(getpagesize()) - 1;
> > + return umem_dev;
> > +}
> > +
> > +void umem_dev_destroy(UMemDev *dev)
> > +{
> > + close(dev->fd);
> > + g_free(dev);
> > +}
> > +
> > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name)
> > +{
> > + struct umem_create create = {
> > + .size = size,
> > + .async_req_max = 0,
> > + .sync_req_max = 0,
> > + };
> > + UMem *umem;
> > +
> > + snprintf(create.name.id, sizeof(create.name.id),
> > + "pid-%"PRId64, (uint64_t)getpid());
> > + create.name.id[UMEM_ID_MAX - 1] = 0;
> > + strncpy(create.name.name, name, sizeof(create.name.name));
> > + create.name.name[UMEM_NAME_MAX - 1] = 0;
> > +
> > + assert((size % getpagesize()) == 0);
> > + if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) {
> > + perror("UMEM_DEV_CREATE_UMEM");
> > + abort();
> > + }
> > + if (ftruncate(create.shmem_fd, create.size) < 0) {
> > + perror("truncate(\"shmem_fd\")");
> > + abort();
> > + }
> > +
> > + umem = g_new(UMem, 1);
> > + umem->nbits = 0;
> > + umem->nsets = 0;
> > + umem->faulted = NULL;
> > + umem->page_shift = dev->page_shift;
> > + umem->fd = create.umem_fd;
> > + umem->shmem_fd = create.shmem_fd;
> > + umem->size = create.size;
> > + umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
> > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > + if (umem->umem == MAP_FAILED) {
> > + perror("mmap(UMem) failed");
> > + abort();
> > + }
> > + return umem;
> > +}
> > +
> > +void umem_mmap(UMem *umem)
> > +{
> > + void *ret = mmap(umem->umem, umem->size,
> > + PROT_EXEC | PROT_READ | PROT_WRITE,
> > + MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
> > + if (ret == MAP_FAILED) {
> > + perror("umem_mmap(UMem) failed");
> > + abort();
> > + }
> > +}
> > +
> > +void umem_destroy(UMem *umem)
> > +{
> > + if (umem->fd != -1) {
> > + close(umem->fd);
> > + }
> > + if (umem->shmem_fd != -1) {
> > + close(umem->shmem_fd);
> > + }
> > + g_free(umem->faulted);
> > + g_free(umem);
> > +}
> > +
> > +void umem_get_page_request(UMem *umem, struct umem_page_request
> > *page_request)
> > +{
> > + if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) {
> > + perror("daemon: UMEM_GET_PAGE_REQUEST");
> > + abort();
> > + }
> > +}
> > +
> > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached
> > *page_cached)
> > +{
> > + if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) {
> > + perror("daemon: UMEM_MARK_PAGE_CACHED");
> > + abort();
> > + }
> > +}
> > +
> > +void umem_unmap(UMem *umem)
> > +{
> > + munmap(umem->umem, umem->size);
> > + umem->umem = NULL;
> > +}
> > +
> > +void umem_close(UMem *umem)
> > +{
> > + close(umem->fd);
> > + umem->fd = -1;
> > +}
> > +
> > +void *umem_map_shmem(UMem *umem)
> > +{
> > + umem->nbits = umem->size >> umem->page_shift;
> > + umem->nsets = 0;
> > + umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
> > +
> > + umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE,
> > MAP_SHARED,
> > + umem->shmem_fd, 0);
> > + if (umem->shmem == MAP_FAILED) {
> > + perror("daemon: mmap(\"shmem\")");
> > + abort();
> > + }
> > + return umem->shmem;
> > +}
> > +
> > +void umem_unmap_shmem(UMem *umem)
> > +{
> > + munmap(umem->shmem, umem->size);
> > + umem->shmem = NULL;
> > +}
> > +
> > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
> > +{
> > + int s = offset >> umem->page_shift;
> > + int e = (offset + size) >> umem->page_shift;
> > + int i;
> > +
> > + for (i = s; i < e; i++) {
> > + if (!test_and_set_bit(i, umem->faulted)) {
> > + umem->nsets++;
> > +#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
> > + madvise(umem->shmem + offset, size, MADV_REMOVE);
> > +#endif
> > + }
> > + }
> > +}
> > +
> > +void umem_close_shmem(UMem *umem)
> > +{
> > + close(umem->shmem_fd);
> > + umem->shmem_fd = -1;
> > +}
> > +
> > +/***************************************************************************/
> > +/* qemu <-> umem daemon communication */
> > +
> > +size_t umem_pages_size(uint64_t nr)
> > +{
> > + return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
> > +}
> > +
> > +static void umem_write_cmd(int fd, uint8_t cmd)
> > +{
> > + DPRINTF("write cmd %c\n", cmd);
> > +
> > + for (;;) {
> > + ssize_t ret = write(fd, &cmd, 1);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + continue;
> > + } else if (errno == EPIPE) {
> > + perror("pipe");
> > + DPRINTF("write cmd %c %zd %d: pipe is closed\n",
> > + cmd, ret, errno);
> > + break;
> > + }
> > +
> > + perror("pipe");
> > + DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
> > + abort();
> > + }
> > +
> > + break;
> > + }
> > +}
> > +
> > +static void umem_read_cmd(int fd, uint8_t expect)
> > +{
> > + uint8_t cmd;
> > + for (;;) {
> > + ssize_t ret = read(fd, &cmd, 1);
> > + if (ret == -1) {
> > + if (errno == EINTR) {
> > + continue;
> > + }
> > + perror("pipe");
> > + DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
> > + abort();
> > + }
> > +
> > + if (ret == 0) {
> > + DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
> > + abort();
> > + }
> > +
> > + break;
> > + }
> > +
> > + DPRINTF("read cmd %c\n", cmd);
> > + if (cmd != expect) {
> > + DPRINTF("cmd %c expect %d\n", cmd, expect);
> > + abort();
> > + }
> > +}
> > +
> > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
> > +{
> > + int ret;
> > + uint64_t nr;
> > + size_t size;
> > + struct umem_pages *pages;
> > +
> > + ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
> > + *offset += sizeof(nr);
> > + DPRINTF("ret %d nr %ld\n", ret, nr);
> > + if (ret != sizeof(nr) || nr == 0) {
> > + return NULL;
> > + }
> > +
> > + size = umem_pages_size(nr);
> > + pages = g_malloc(size);
> > + pages->nr = nr;
> > + size -= sizeof(pages->nr);
> > +
> > + ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
> > + *offset += size;
> > + if (ret != size) {
> > + g_free(pages);
> > + return NULL;
> > + }
> > + return pages;
> > +}
> > +
> > +static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
> > +{
> > + size_t len = umem_pages_size(pages->nr);
> > + qemu_put_buffer(f, (const uint8_t*)pages, len);
> > +}
> > +
> > +/* umem daemon -> qemu */
> > +void umem_daemon_ready(int to_qemu_fd)
> > +{
> > + umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
> > +}
> > +
> > +void umem_daemon_quit(QEMUFile *to_qemu)
> > +{
> > + qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
> > +}
> > +
> > +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> > + struct umem_pages *pages)
> > +{
> > + qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
> > + umem_send_pages(to_qemu, pages);
> > +}
> > +
> > +void umem_daemon_wait_for_qemu(int from_qemu_fd)
> > +{
> > + umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
> > +}
> > +
> > +/* qemu -> umem daemon */
> > +void umem_qemu_wait_for_daemon(int from_umemd_fd)
> > +{
> > + umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
> > +}
> > +
> > +void umem_qemu_ready(int to_umemd_fd)
> > +{
> > + umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
> > +}
> > +
> > +void umem_qemu_quit(QEMUFile *to_umemd)
> > +{
> > + qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
> > +}
> > +
> > +/* qemu side handler */
> > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> > + int *offset)
> > +{
> > + uint64_t i;
> > + int page_shift = ffs(getpagesize()) - 1;
> > + struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
> > + if (pages == NULL) {
> > + return NULL;
> > + }
> > +
> > + for (i = 0; i < pages->nr; i++) {
> > + ram_addr_t addr = pages->pgoffs[i] << page_shift;
> > +
> > + /* make pages present by forcibly triggering page fault. */
> > + volatile uint8_t *ram = qemu_get_ram_ptr(addr);
> > + uint8_t dummy_read = ram[0];
> > + (void)dummy_read; /* suppress unused variable warning */
> > + }
> > +
> > + return pages;
> > +}
> > +
> > +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> > + const struct umem_pages *pages)
> > +{
> > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
> > + umem_send_pages(to_umemd, pages);
> > +}
> > +
> > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> > + const struct umem_pages *pages)
> > +{
> > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
> > + umem_send_pages(to_umemd, pages);
> > +}
> > diff --git a/umem.h b/umem.h
> > new file mode 100644
> > index 0000000..5ca19ef
> > --- /dev/null
> > +++ b/umem.h
> > @@ -0,0 +1,105 @@
> > +/*
> > + * umem.h: user process backed memory module for postcopy livemigration
> > + *
> > + * Copyright (c) 2011
> > + * National Institute of Advanced Industrial Science and Technology
> > + *
> > + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> > + * Author: Isaku Yamahata <yamahata at valinux co jp>
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#ifndef QEMU_UMEM_H
> > +#define QEMU_UMEM_H
> > +
> > +#include <linux/umem.h>
> > +
> > +#include "qemu-common.h"
> > +
> > +typedef struct UMemDev UMemDev;
> > +
> > +struct UMem {
> > + void *umem;
> > + int fd;
> > + void *shmem;
> > + int shmem_fd;
> > + uint64_t size;
> > +
> > + /* indexed by host page size */
> > + int page_shift;
> > + int nbits;
> > + int nsets;
> > + unsigned long *faulted;
> > +};
> > +
> > +UMemDev *umem_dev_new(void);
> > +void umem_dev_destroy(UMemDev *dev);
> > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name);
> > +void umem_mmap(UMem *umem);
> > +
> > +void umem_destroy(UMem *umem);
> > +
> > +/* umem device operations */
> > +void umem_get_page_request(UMem *umem, struct umem_page_request
> > *page_request);
> > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached
> > *page_cached);
> > +void umem_unmap(UMem *umem);
> > +void umem_close(UMem *umem);
> > +
> > +/* umem shmem operations */
> > +void *umem_map_shmem(UMem *umem);
> > +void umem_unmap_shmem(UMem *umem);
> > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
> > +void umem_close_shmem(UMem *umem);
> > +
> > +/* qemu on source <-> umem daemon communication */
> > +
> > +struct umem_pages {
> > + uint64_t nr; /* nr = 0 means completed */
> > + uint64_t pgoffs[0];
> > +};
> > +
> > +/* daemon -> qemu */
> > +#define UMEM_DAEMON_READY 'R'
> > +#define UMEM_DAEMON_QUIT 'Q'
> > +#define UMEM_DAEMON_TRIGGER_PAGE_FAULT 'T'
> > +#define UMEM_DAEMON_ERROR 'E'
> > +
> > +/* qemu -> daemon */
> > +#define UMEM_QEMU_READY 'r'
> > +#define UMEM_QEMU_QUIT 'q'
> > +#define UMEM_QEMU_PAGE_FAULTED 't'
> > +#define UMEM_QEMU_PAGE_UNMAPPED 'u'
> > +
> > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
> > +size_t umem_pages_size(uint64_t nr);
> > +
> > +/* for umem daemon */
> > +void umem_daemon_ready(int to_qemu_fd);
> > +void umem_daemon_wait_for_qemu(int from_qemu_fd);
> > +void umem_daemon_quit(QEMUFile *to_qemu);
> > +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> > + struct umem_pages *pages);
> > +
> > +/* for qemu */
> > +void umem_qemu_wait_for_daemon(int from_umemd_fd);
> > +void umem_qemu_ready(int to_umemd_fd);
> > +void umem_qemu_quit(QEMUFile *to_umemd);
> > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> > + int *offset);
> > +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> > + const struct umem_pages *pages);
> > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> > + const struct umem_pages *pages);
> > +
> > +#endif /* QEMU_UMEM_H */
> > diff --git a/vl.c b/vl.c
> > index 5430b8c..17427a0 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp)
> > default_drive(default_sdcard, snapshot, machine->use_scsi,
> > IF_SD, 0, SD_OPTS);
> >
> > - register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL,
> > - ram_save_live, NULL, ram_load, NULL);
> > + if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) {
> > + exit(1);
> > + }
> > + register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID,
> > + ram_save_set_params, ram_save_live, NULL,
> > + ram_load, NULL);
> >
> > if (nb_numa_nodes > 0) {
> > int i;
> > @@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp)
> >
> > if (incoming) {
> > runstate_set(RUN_STATE_INMIGRATE);
> > + if (incoming_postcopy) {
> > + postcopy_incoming_prepare();
> >+ }
>
> how about moving postcopy_incoming_prepare into qemu_start_incoming_migration
> ?
>
> > int ret = qemu_start_incoming_migration(incoming);
> > if (ret < 0) {
> > fprintf(stderr, "Migration failed. Exit code %s(%d),
> > exiting.\n",
> > @@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp)
> > bdrv_close_all();
> > pause_all_vcpus();
> > net_cleanup();
> > + if (incoming_postcopy) {
> > + postcopy_incoming_qemu_cleanup();
> > + }
> > res_free();
> >
> > return 0;
>
> Orit
>
--
yamahata
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration,
Isaku Yamahata <=