qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration


From: Isaku Yamahata
Subject: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration
Date: Thu, 29 Dec 2011 10:26:00 +0900

This patch implements postcopy livemigration.

Signed-off-by: Isaku Yamahata <address@hidden>
---
 Makefile.target           |    4 +
 arch_init.c               |   26 +-
 cpu-all.h                 |    7 +
 exec.c                    |   20 +-
 migration-exec.c          |    8 +
 migration-fd.c            |   30 +
 migration-postcopy-stub.c |   77 ++
 migration-postcopy.c      | 1891 +++++++++++++++++++++++++++++++++++++++++++++
 migration-tcp.c           |   37 +-
 migration-unix.c          |   32 +-
 migration.c               |   31 +
 migration.h               |   30 +
 qemu-common.h             |    1 +
 qemu-options.hx           |    5 +-
 umem.c                    |  379 +++++++++
 umem.h                    |  105 +++
 vl.c                      |   14 +-
 17 files changed, 2677 insertions(+), 20 deletions(-)
 create mode 100644 migration-postcopy-stub.c
 create mode 100644 migration-postcopy.c
 create mode 100644 umem.c
 create mode 100644 umem.h

diff --git a/Makefile.target b/Makefile.target
index 3261383..d94c53f 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h
 CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y)
 CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y)
 CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y)
+CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y)
 
 include ../config-host.mak
 include config-devices.mak
@@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
 obj-y += memory.o
 LIBS+=-lz
 
+common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o
+common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o
+
 QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
 QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
 QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
diff --git a/arch_init.c b/arch_init.c
index bc53092..8b3130d 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
     return 1;
 }
 
+static bool outgoing_postcopy = false;
+
+void ram_save_set_params(const MigrationParams *params, void *opaque)
+{
+    outgoing_postcopy = params->postcopy;
+}
+
 static RAMBlock *last_block_sent = NULL;
 
 int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
@@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
     uint64_t expected_time = 0;
     int ret;
 
+    if (stage == 1) {
+        last_block_sent = NULL;
+
+        bytes_transferred = 0;
+        last_block = NULL;
+        last_offset = 0;
+    }
+    if (outgoing_postcopy) {
+        return postcopy_outgoing_ram_save_live(mon, f, stage, opaque);
+    }
+
     if (stage < 0) {
         cpu_physical_memory_set_dirty_tracking(0);
         return 0;
@@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
     }
 
     if (stage == 1) {
-        bytes_transferred = 0;
-        last_block_sent = NULL;
-        last_block = NULL;
-        last_offset = 0;
         sort_ram_list();
 
         /* Make sure all dirty bits are set */
@@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
     int flags;
     int error;
 
+    if (incoming_postcopy) {
+        return postcopy_incoming_ram_load(f, opaque, version_id);
+    }
+
     if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) {
         return -EINVAL;
     }
diff --git a/cpu-all.h b/cpu-all.h
index 0244f7a..2e9d8a7 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -475,6 +475,9 @@ extern ram_addr_t ram_size;
 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
 #define RAM_PREALLOC_MASK   (1 << 0)
 
+/* RAM is allocated via umem for postcopy incoming mode */
+#define RAM_POSTCOPY_UMEM_MASK  (1 << 1)
+
 typedef struct RAMBlock {
     uint8_t *host;
     ram_addr_t offset;
@@ -485,6 +488,10 @@ typedef struct RAMBlock {
 #if defined(__linux__) && !defined(TARGET_S390X)
     int fd;
 #endif
+
+#ifdef CONFIG_POSTCOPY
+    UMem *umem;    /* for incoming postcopy mode */
+#endif
 } RAMBlock;
 
 typedef struct RAMList {
diff --git a/exec.c b/exec.c
index c8c6692..90b0491 100644
--- a/exec.c
+++ b/exec.c
@@ -35,6 +35,7 @@
 #include "qemu-timer.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "migration.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, 
const char *name,
         new_block->host = host;
         new_block->flags |= RAM_PREALLOC_MASK;
     } else {
+#ifdef CONFIG_POSTCOPY
+        if (incoming_postcopy) {
+            postcopy_incoming_ram_alloc(name, size,
+                                        &new_block->host, &new_block->umem);
+            new_block->flags |= RAM_POSTCOPY_UMEM_MASK;
+        } else
+#endif
         if (mem_path) {
 #if defined (__linux__) && !defined(TARGET_S390X)
             new_block->host = file_ram_alloc(new_block, size, mem_path);
@@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr)
             QLIST_REMOVE(block, next);
             if (block->flags & RAM_PREALLOC_MASK) {
                 ;
-            } else if (mem_path) {
+            }
+#ifdef CONFIG_POSTCOPY
+            else if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
+                postcopy_incoming_ram_free(block->umem);
+            }
+#endif
+            else if (mem_path) {
 #if defined (__linux__) && !defined(TARGET_S390X)
                 if (block->fd) {
                     munmap(block->host, block->length);
@@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
             } else {
                 flags = MAP_FIXED;
                 munmap(vaddr, length);
+                if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
+                    postcopy_incoming_qemu_pages_unmapped(addr, length);
+                    block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
+                }
                 if (mem_path) {
 #if defined(__linux__) && !defined(TARGET_S390X)
                     if (block->fd) {
diff --git a/migration-exec.c b/migration-exec.c
index e14552e..2bd0c3b 100644
--- a/migration-exec.c
+++ b/migration-exec.c
@@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const 
char *command)
 {
     FILE *f;
 
+    if (s->params.postcopy) {
+        return -ENOSYS;
+    }
+
     f = popen(command, "w");
     if (f == NULL) {
         DPRINTF("Unable to popen exec target\n");
@@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command)
 {
     QEMUFile *f;
 
+    if (incoming_postcopy) {
+        return -ENOSYS;
+    }
+
     DPRINTF("Attempting to start an incoming migration\n");
     f = qemu_popen_cmd(command, "r");
     if(f == NULL) {
diff --git a/migration-fd.c b/migration-fd.c
index 6211124..5a62ab9 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const 
char *fdname)
     s->write = fd_write;
     s->close = fd_close;
 
+    if (s->params.postcopy) {
+        int flags = fcntl(s->fd, F_GETFL);
+        if ((flags & O_ACCMODE) != O_RDWR) {
+            goto err_after_open;
+        }
+
+        s->fd_read = dup(s->fd);
+        if (s->fd_read == -1) {
+            goto err_after_open;
+        }
+        s->file_read = qemu_fdopen(s->fd_read, "r");
+        if (s->file_read == NULL) {
+            close(s->fd_read);
+            goto err_after_open;
+        }
+    }
+
     migrate_fd_connect(s);
     return 0;
 
@@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque)
 
     process_incoming_migration(f);
     qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL);
+    if (incoming_postcopy) {
+        postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f);
+    }
     qemu_fclose(f);
+    if (incoming_postcopy) {
+        postcopy_incoming_qemu_ready();
+    }
+    return;
 }
 
 int fd_start_incoming_migration(const char *infd)
@@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd)
     DPRINTF("Attempting to start an incoming migration via fd\n");
 
     fd = strtol(infd, NULL, 0);
+    if (incoming_postcopy) {
+        int flags = fcntl(fd, F_GETFL);
+        if ((flags & O_ACCMODE) != O_RDWR) {
+            return -EINVAL;
+        }
+    }
     f = qemu_fdopen(fd, "rb");
     if(f == NULL) {
         DPRINTF("Unable to apply qemu wrapper to file descriptor\n");
diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c
new file mode 100644
index 0000000..0b78de7
--- /dev/null
+++ b/migration-postcopy-stub.c
@@ -0,0 +1,77 @@
+/*
+ * migration-postcopy-stub.c: postcopy livemigration
+ *                            stub functions for non-supported hosts
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sysemu.h"
+#include "migration.h"
+
+int postcopy_outgoing_create_read_socket(MigrationState *s)
+{
+    return -ENOSYS;
+}
+
+int postcopy_outgoing_ram_save_live(Monitor *mon,
+                                    QEMUFile *f, int stage, void *opaque)
+{
+    return -ENOSYS;
+}
+
+void *postcopy_outgoing_begin(MigrationState *ms)
+{
+    return NULL;
+}
+
+int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
+                                          void *postcopy)
+{
+    return -ENOSYS;
+}
+
+int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
+{
+    return -ENOSYS;
+}
+
+void postcopy_incoming_prepare(void)
+{
+}
+
+int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+    return -ENOSYS;
+}
+
+void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
+{
+}
+
+void postcopy_incoming_qemu_ready(void)
+{
+}
+
+void postcopy_incoming_qemu_cleanup(void)
+{
+}
+
+void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
+{
+}
diff --git a/migration-postcopy.c b/migration-postcopy.c
new file mode 100644
index 0000000..ed0d574
--- /dev/null
+++ b/migration-postcopy.c
@@ -0,0 +1,1891 @@
+/*
+ * migration-postcopy.c: postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bitmap.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "arch_init.h"
+#include "migration.h"
+#include "umem.h"
+
+#include "memory.h"
+#define WANT_EXEC_OBSOLETE
+#include "exec-obsolete.h"
+
+//#define DEBUG_POSTCOPY
+#ifdef DEBUG_POSTCOPY
+#include <sys/syscall.h>
+#define DPRINTF(fmt, ...)                                               \
+    do {                                                                \
+        printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid),     \
+               __func__, __LINE__, ## __VA_ARGS__);                     \
+    } while (0)
+#else
+#define DPRINTF(fmt, ...)       do { } while (0)
+#endif
+
+#define ALIGN_UP(size, align)   (((size) + (align) - 1) & ~((align) - 1))
+
+static void fd_close(int *fd)
+{
+    if (*fd >= 0) {
+        close(*fd);
+        *fd = -1;
+    }
+}
+
+/***************************************************************************
+ * QEMUFile for non blocking pipe
+ */
+
+/* read only */
+struct QEMUFilePipe {
+    int fd;
+    QEMUFile *file;
+};
+typedef struct QEMUFilePipe QEMUFilePipe;
+
+static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFilePipe *s = opaque;
+    ssize_t len = 0;
+
+    while (size > 0) {
+        ssize_t ret = read(s->fd, buf, size);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+            if (len == 0) {
+                len = -errno;
+            }
+            break;
+        }
+
+        if (ret == 0) {
+            /* the write end of the pipe is closed */
+            break;
+        }
+        len += ret;
+        buf += ret;
+        size -= ret;
+    }
+
+    return len;
+}
+
+static int pipe_close(void *opaque)
+{
+    QEMUFilePipe *s = opaque;
+    g_free(s);
+    return 0;
+}
+
+static QEMUFile *qemu_fopen_pipe(int fd)
+{
+    QEMUFilePipe *s = g_malloc0(sizeof(*s));
+
+    s->fd = fd;
+    fcntl_setfl(fd, O_NONBLOCK);
+    s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close,
+                             NULL, NULL, NULL);
+    return s->file;
+}
+
+/* write only */
+struct QEMUFileNonblock {
+    int fd;
+    QEMUFile *file;
+
+    /* for pipe-write nonblocking mode */
+#define BUF_SIZE_INC    (32 * 1024)     /* = IO_BUF_SIZE */
+    uint8_t *buffer;
+    size_t buffer_size;
+    size_t buffer_capacity;
+    bool freeze_output;
+};
+typedef struct QEMUFileNonblock QEMUFileNonblock;
+
+static void nonblock_flush_buffer(QEMUFileNonblock *s)
+{
+    size_t offset = 0;
+    ssize_t ret;
+
+    while (offset < s->buffer_size) {
+        ret = write(s->fd, s->buffer + offset, s->buffer_size - offset);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            } else if (errno == EAGAIN) {
+                s->freeze_output = true;
+            } else {
+                qemu_file_set_error(s->file, errno);
+            }
+            break;
+        }
+
+        if (ret == 0) {
+            DPRINTF("ret == 0\n");
+            break;
+        }
+
+        offset += ret;
+    }
+
+    if (offset > 0) {
+        assert(s->buffer_size >= offset);
+        memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
+        s->buffer_size -= offset;
+    }
+    if (s->buffer_size > 0) {
+        s->freeze_output = true;
+    }
+}
+
+static int nonblock_put_buffer(void *opaque,
+                               const uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileNonblock *s = opaque;
+    int error;
+    ssize_t len = 0;
+
+    error = qemu_file_get_error(s->file);
+    if (error) {
+        return error;
+    }
+
+    nonblock_flush_buffer(s);
+    error = qemu_file_get_error(s->file);
+    if (error) {
+        return error;
+    }
+
+    while (!s->freeze_output && size > 0) {
+        ssize_t ret;
+        assert(s->buffer_size == 0);
+
+        ret = write(s->fd, buf, size);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            } else if (errno == EAGAIN) {
+                s->freeze_output = true;
+            } else {
+                qemu_file_set_error(s->file, errno);
+            }
+            break;
+        }
+
+        len += ret;
+        buf += ret;
+        size -= ret;
+    }
+
+    if (size > 0) {
+        int inc = size - (s->buffer_capacity - s->buffer_size);
+        if (inc > 0) {
+            s->buffer_capacity +=
+                DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC;
+            s->buffer = g_realloc(s->buffer, s->buffer_capacity);
+        }
+        memcpy(s->buffer + s->buffer_size, buf, size);
+        s->buffer_size += size;
+
+        len += size;
+    }
+
+    return len;
+}
+
+static int nonblock_pending_size(QEMUFileNonblock *s)
+{
+    return qemu_pending_size(s->file) + s->buffer_size;
+}
+
+static void nonblock_fflush(QEMUFileNonblock *s)
+{
+    s->freeze_output = false;
+    nonblock_flush_buffer(s);
+    if (!s->freeze_output) {
+        qemu_fflush(s->file);
+    }
+}
+
+static void nonblock_wait_for_flush(QEMUFileNonblock *s)
+{
+    while (nonblock_pending_size(s) > 0) {
+        fd_set fds;
+        FD_ZERO(&fds);
+        FD_SET(s->fd, &fds);
+        select(s->fd + 1, NULL, &fds, NULL, NULL);
+
+        nonblock_fflush(s);
+    }
+}
+
+static int nonblock_close(void *opaque)
+{
+    QEMUFileNonblock *s = opaque;
+    nonblock_wait_for_flush(s);
+    g_free(s->buffer);
+    g_free(s);
+    return 0;
+}
+
+static QEMUFileNonblock *qemu_fopen_nonblock(int fd)
+{
+    QEMUFileNonblock *s = g_malloc0(sizeof(*s));
+
+    s->fd = fd;
+    fcntl_setfl(fd, O_NONBLOCK);
+    s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close,
+                             NULL, NULL, NULL);
+    return s;
+}
+
+/***************************************************************************
+ * umem daemon on destination <-> qemu on source protocol
+ */
+
+#define QEMU_UMEM_REQ_INIT              0x00
+#define QEMU_UMEM_REQ_ON_DEMAND         0x01
+#define QEMU_UMEM_REQ_ON_DEMAND_CONT    0x02
+#define QEMU_UMEM_REQ_BACKGROUND        0x03
+#define QEMU_UMEM_REQ_BACKGROUND_CONT   0x04
+#define QEMU_UMEM_REQ_REMOVE            0x05
+#define QEMU_UMEM_REQ_EOC               0x06
+
+struct qemu_umem_req {
+    int8_t cmd;
+    uint8_t len;
+    char *idstr;        /* ON_DEMAND, BACKGROUND, REMOVE */
+    uint32_t nr;        /* ON_DEMAND, ON_DEMAND_CONT,
+                           BACKGROUND, BACKGROUND_CONT, REMOVE */
+
+    /* in target page size as qemu migration protocol */
+    uint64_t *pgoffs;   /* ON_DEMAND, ON_DEMAND_CONT,
+                           BACKGROUND, BACKGROUND_CONT, REMOVE */
+};
+
+static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr)
+{
+    qemu_put_byte(f, strlen(idstr));
+    qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr));
+}
+
+static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr,
+                                              const uint64_t *pgoffs)
+{
+    uint32_t i;
+
+    qemu_put_be32(f, nr);
+    for (i = 0; i < nr; i++) {
+        qemu_put_be64(f, pgoffs[i]);
+    }
+}
+
+static void postcopy_incoming_send_req_one(QEMUFile *f,
+                                           const struct qemu_umem_req *req)
+{
+    DPRINTF("cmd %d\n", req->cmd);
+    qemu_put_byte(f, req->cmd);
+    switch (req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+    case QEMU_UMEM_REQ_EOC:
+        /* nothing */
+        break;
+    case QEMU_UMEM_REQ_ON_DEMAND:
+    case QEMU_UMEM_REQ_BACKGROUND:
+    case QEMU_UMEM_REQ_REMOVE:
+        postcopy_incoming_send_req_idstr(f, req->idstr);
+        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
+        break;
+    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
+    case QEMU_UMEM_REQ_BACKGROUND_CONT:
+        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
+        break;
+    default:
+        abort();
+        break;
+    }
+}
+
+/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024.
+ * So one message size must be <= IO_BUF_SIZE
+ * cmd: 1
+ * id len: 1
+ * id: 256
+ * nr: 2
+ */
+#define MAX_PAGE_NR     ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t))
+static void postcopy_incoming_send_req(QEMUFile *f,
+                                       const struct qemu_umem_req *req)
+{
+    uint32_t nr = req->nr;
+    struct qemu_umem_req tmp = *req;
+
+    switch (req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+    case QEMU_UMEM_REQ_EOC:
+        postcopy_incoming_send_req_one(f, &tmp);
+        break;
+    case QEMU_UMEM_REQ_ON_DEMAND:
+    case QEMU_UMEM_REQ_BACKGROUND:
+        tmp.nr = MIN(nr, MAX_PAGE_NR);
+        postcopy_incoming_send_req_one(f, &tmp);
+
+        nr -= tmp.nr;
+        tmp.pgoffs += tmp.nr;
+        if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) {
+            tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
+        }else {
+            tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT;
+        }
+        /* fall through */
+    case QEMU_UMEM_REQ_REMOVE:
+    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
+    case QEMU_UMEM_REQ_BACKGROUND_CONT:
+        while (nr > 0) {
+            tmp.nr = MIN(nr, MAX_PAGE_NR);
+            postcopy_incoming_send_req_one(f, &tmp);
+
+            nr -= tmp.nr;
+            tmp.pgoffs += tmp.nr;
+        }
+        break;
+    default:
+        abort();
+        break;
+    }
+}
+
+static int postcopy_outgoing_recv_req_idstr(QEMUFile *f,
+                                            struct qemu_umem_req *req,
+                                            size_t *offset)
+{
+    int ret;
+
+    req->len = qemu_peek_byte(f, *offset);
+    *offset += 1;
+    if (req->len == 0) {
+        return -EAGAIN;
+    }
+    req->idstr = g_malloc((int)req->len + 1);
+    ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset);
+    *offset += ret;
+    if (ret != req->len) {
+        g_free(req->idstr);
+        req->idstr = NULL;
+        return -EAGAIN;
+    }
+    req->idstr[req->len] = 0;
+    return 0;
+}
+
+static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f,
+                                             struct qemu_umem_req *req,
+                                             size_t *offset)
+{
+    int ret;
+    uint32_t be32;
+    uint32_t i;
+
+    ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset);
+    *offset += sizeof(be32);
+    if (ret != sizeof(be32)) {
+        return -EAGAIN;
+    }
+
+    req->nr = be32_to_cpu(be32);
+    req->pgoffs = g_new(uint64_t, req->nr);
+    for (i = 0; i < req->nr; i++) {
+        uint64_t be64;
+        ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset);
+        *offset += sizeof(be64);
+        if (ret != sizeof(be64)) {
+            g_free(req->pgoffs);
+            req->pgoffs = NULL;
+            return -EAGAIN;
+        }
+        req->pgoffs[i] = be64_to_cpu(be64);
+    }
+    return 0;
+}
+
+static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req)
+{
+    int size;
+    int ret;
+    size_t offset = 0;
+
+    size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset);
+    if (size <= 0) {
+        return -EAGAIN;
+    }
+    offset += 1;
+
+    switch (req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+    case QEMU_UMEM_REQ_EOC:
+        /* nothing */
+        break;
+    case QEMU_UMEM_REQ_ON_DEMAND:
+    case QEMU_UMEM_REQ_BACKGROUND:
+    case QEMU_UMEM_REQ_REMOVE:
+        ret = postcopy_outgoing_recv_req_idstr(f, req, &offset);
+        if (ret < 0) {
+            return ret;
+        }
+        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
+        if (ret < 0) {
+            return ret;
+        }
+        break;
+    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
+    case QEMU_UMEM_REQ_BACKGROUND_CONT:
+        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
+        if (ret < 0) {
+            return ret;
+        }
+        break;
+    default:
+        abort();
+        break;
+    }
+    qemu_file_skip(f, offset);
+    DPRINTF("cmd %d\n", req->cmd);
+    return 0;
+}
+
+static void postcopy_outgoing_free_req(struct qemu_umem_req *req)
+{
+    g_free(req->idstr);
+    g_free(req->pgoffs);
+}
+
+/***************************************************************************
+ * outgoing part
+ */
+
+#define QEMU_SAVE_LIVE_STAGE_START      0x01    /* = QEMU_VM_SECTION_START */
+#define QEMU_SAVE_LIVE_STAGE_PART       0x02    /* = QEMU_VM_SECTION_PART */
+#define QEMU_SAVE_LIVE_STAGE_END        0x03    /* = QEMU_VM_SECTION_END */
+
+enum POState {
+    PO_STATE_ERROR_RECEIVE,
+    PO_STATE_ACTIVE,
+    PO_STATE_EOC_RECEIVED,
+    PO_STATE_ALL_PAGES_SENT,
+    PO_STATE_COMPLETED,
+};
+typedef enum POState POState;
+
+struct PostcopyOutgoingState {
+    POState state;
+    QEMUFile *mig_read;
+    int fd_read;
+    RAMBlock *last_block_read;
+
+    QEMUFile *mig_buffered_write;
+    MigrationState *ms;
+
+    /* For nobg mode. Check if all pages are sent */
+    RAMBlock *block;
+    ram_addr_t addr;
+};
+typedef struct PostcopyOutgoingState PostcopyOutgoingState;
+
+int postcopy_outgoing_create_read_socket(MigrationState *s)
+{
+    if (!s->params.postcopy) {
+        return 0;
+    }
+
+    s->fd_read = dup(s->fd);
+    if (s->fd_read == -1) {
+        int ret = -errno;
+        perror("dup");
+        return ret;
+    }
+    s->file_read = qemu_fopen_socket(s->fd_read);
+    if (s->file_read == NULL) {
+        return -EINVAL;
+    }
+    return 0;
+}
+
+int postcopy_outgoing_ram_save_live(Monitor *mon,
+                                    QEMUFile *f, int stage, void *opaque)
+{
+    int ret = 0;
+    DPRINTF("stage %d\n", stage);
+    if (stage == QEMU_SAVE_LIVE_STAGE_START) {
+        sort_ram_list();
+        ram_save_live_mem_size(f);
+    }
+    if (stage == QEMU_SAVE_LIVE_STAGE_PART) {
+        ret = 1;
+    }
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+    return ret;
+}
+
+static RAMBlock *postcopy_outgoing_find_block(const char *idstr)
+{
+    RAMBlock *block;
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        if (!strncmp(idstr, block->idstr, strlen(idstr))) {
+            return block;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * return value
+ *   0: continue postcopy mode
+ * > 0: completed postcopy mode.
+ * < 0: error
+ */
+static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s,
+                                        const struct qemu_umem_req *req,
+                                        bool *written)
+{
+    int i;
+    RAMBlock *block;
+
+    DPRINTF("cmd %d state %d\n", req->cmd, s->state);
+    switch(req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+        /* nothing */
+        break;
+    case QEMU_UMEM_REQ_EOC:
+        /* tell to finish migration. */
+        if (s->state == PO_STATE_ALL_PAGES_SENT) {
+            s->state = PO_STATE_COMPLETED;
+            DPRINTF("-> PO_STATE_COMPLETED\n");
+        } else {
+            s->state = PO_STATE_EOC_RECEIVED;
+            DPRINTF("-> PO_STATE_EOC_RECEIVED\n");
+        }
+        return 1;
+    case QEMU_UMEM_REQ_ON_DEMAND:
+    case QEMU_UMEM_REQ_BACKGROUND:
+        DPRINTF("idstr: %s\n", req->idstr);
+        block = postcopy_outgoing_find_block(req->idstr);
+        if (block == NULL) {
+            return -EINVAL;
+        }
+        s->last_block_read = block;
+        /* fall through */
+    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
+    case QEMU_UMEM_REQ_BACKGROUND_CONT:
+        DPRINTF("nr %d\n", req->nr);
+        for (i = 0; i < req->nr; i++) {
+            DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]);
+            int ret = ram_save_page(s->mig_buffered_write, s->last_block_read,
+                                    req->pgoffs[i] << TARGET_PAGE_BITS);
+            if (ret > 0) {
+                *written = true;
+            }
+        }
+        break;
+    case QEMU_UMEM_REQ_REMOVE:
+        block = postcopy_outgoing_find_block(req->idstr);
+        if (block == NULL) {
+            return -EINVAL;
+        }
+        for (i = 0; i < req->nr; i++) {
+            ram_addr_t addr = block->offset +
+                (req->pgoffs[i] << TARGET_PAGE_BITS);
+            cpu_physical_memory_reset_dirty(addr,
+                                            addr + TARGET_PAGE_SIZE,
+                                            MIGRATION_DIRTY_FLAG);
+        }
+        break;
+    default:
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s)
+{
+    if (s->mig_read != NULL) {
+        qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL);
+        qemu_fclose(s->mig_read);
+        s->mig_read = NULL;
+        fd_close(&s->fd_read);
+
+        s->ms->file_read = NULL;
+        s->ms->fd_read = -1;
+    }
+}
+
+static void postcopy_outgoing_completed(PostcopyOutgoingState *s)
+{
+    postcopy_outgoing_close_mig_read(s);
+    s->ms->postcopy = NULL;
+    g_free(s);
+}
+
+static void postcopy_outgoing_recv_handler(void *opaque)
+{
+    PostcopyOutgoingState *s = opaque;
+    bool written = false;
+    int ret = 0;
+
+    assert(s->state == PO_STATE_ACTIVE ||
+           s->state == PO_STATE_ALL_PAGES_SENT);
+
+    do {
+        struct qemu_umem_req req = {.idstr = NULL,
+                                    .pgoffs = NULL};
+
+        ret = postcopy_outgoing_recv_req(s->mig_read, &req);
+        if (ret < 0) {
+            if (ret == -EAGAIN) {
+                ret = 0;
+            }
+            break;
+        }
+        if (s->state == PO_STATE_ACTIVE) {
+            ret = postcopy_outgoing_handle_req(s, &req, &written);
+        }
+        postcopy_outgoing_free_req(&req);
+    } while (ret == 0);
+
+    /*
+     * flush buffered_file.
+     * Although mig_write is rate-limited buffered file, those written pages
+     * are requested on demand by the destination. So forcibly push
+     * those pages ignoring rate limiting
+     */
+    if (written) {
+        qemu_fflush(s->mig_buffered_write);
+        /* qemu_buffered_file_drain(s->mig_buffered_write); */
+    }
+
+    if (ret < 0) {
+        switch (s->state) {
+        case PO_STATE_ACTIVE:
+            s->state = PO_STATE_ERROR_RECEIVE;
+            DPRINTF("-> PO_STATE_ERROR_RECEIVE\n");
+            break;
+        case PO_STATE_ALL_PAGES_SENT:
+            s->state = PO_STATE_COMPLETED;
+            DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n");
+            break;
+        default:
+            abort();
+        }
+    }
+    if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) {
+        postcopy_outgoing_close_mig_read(s);
+    }
+    if (s->state == PO_STATE_COMPLETED) {
+        DPRINTF("PO_STATE_COMPLETED\n");
+        MigrationState *ms = s->ms;
+        postcopy_outgoing_completed(s);
+        migrate_fd_completed(ms);
+    }
+}
+
+void *postcopy_outgoing_begin(MigrationState *ms)
+{
+    PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1);
+    DPRINTF("outgoing begin\n");
+    qemu_fflush(ms->file);
+
+    s->ms = ms;
+    s->state = PO_STATE_ACTIVE;
+    s->fd_read = ms->fd_read;
+    s->mig_read = ms->file_read;
+    s->mig_buffered_write = ms->file;
+    s->block = NULL;
+    s->addr = 0;
+
+    /* Make sure all dirty bits are set */
+    ram_save_memory_set_dirty();
+
+    qemu_set_fd_handler(s->fd_read,
+                        &postcopy_outgoing_recv_handler, NULL, s);
+    return s;
+}
+
+static void postcopy_outgoing_ram_all_sent(QEMUFile *f,
+                                           PostcopyOutgoingState *s)
+{
+    assert(s->state == PO_STATE_ACTIVE);
+
+    s->state = PO_STATE_ALL_PAGES_SENT;
+    /* tell incoming side that all pages are sent */
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+    qemu_fflush(f);
+    qemu_buffered_file_drain(f);
+    DPRINTF("sent RAM_SAVE_FLAG_EOS\n");
+    migrate_fd_cleanup(s->ms);
+
+    /* Later migrate_fd_complete() will be called which calls
+     * migrate_fd_cleanup() again. So dummy file is created
+     * for qemu monitor to keep working.
+     */
+    s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL,
+                                 NULL, NULL);
+}
+
+static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s,
+                                                RAMBlock *block,
+                                                ram_addr_t addr)
+{
+    if (block == NULL) {
+        block = QLIST_FIRST(&ram_list.blocks);
+        addr = block->offset;
+    }
+
+    for (; block != NULL;
+         s->block = QLIST_NEXT(s->block, next), addr = block->offset) {
+        for (; addr < block->offset + block->length;
+             addr += TARGET_PAGE_SIZE) {
+            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
+                s->block = block;
+                s->addr = addr;
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
+
+int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
+                                          void *postcopy)
+{
+    PostcopyOutgoingState *s = postcopy;
+
+    assert(s->state == PO_STATE_ACTIVE ||
+           s->state == PO_STATE_EOC_RECEIVED ||
+           s->state == PO_STATE_ERROR_RECEIVE);
+
+    switch (s->state) {
+    case PO_STATE_ACTIVE:
+        /* nothing. processed below */
+        break;
+    case PO_STATE_EOC_RECEIVED:
+        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+        s->state = PO_STATE_COMPLETED;
+        postcopy_outgoing_completed(s);
+        DPRINTF("PO_STATE_COMPLETED\n");
+        return 1;
+    case PO_STATE_ERROR_RECEIVE:
+        postcopy_outgoing_completed(s);
+        DPRINTF("PO_STATE_ERROR_RECEIVE\n");
+        return -1;
+    default:
+        abort();
+    }
+
+    if (s->ms->params.nobg) {
+        /* See if all pages are sent. */
+        if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) {
+            return 0;
+        }
+        /* ram_list can be reordered. (it doesn't seem so during migration,
+           though) So the whole list needs to be checked again */
+        if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) {
+            return 0;
+        }
+
+        postcopy_outgoing_ram_all_sent(f, s);
+        return 0;
+    }
+
+    DPRINTF("outgoing background state: %d\n", s->state);
+
+    while (qemu_file_rate_limit(f) == 0) {
+        if (ram_save_block(f) == 0) { /* no more blocks */
+            assert(s->state == PO_STATE_ACTIVE);
+            postcopy_outgoing_ram_all_sent(f, s);
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+/***************************************************************************
+ * incoming part
+ */
+
+/* flags for incoming mode to modify the behavior.
+   This is for benchmark/debug purpose */
+#define INCOMING_FLAGS_FAULT_REQUEST 0x01
+
+
+static void postcopy_incoming_umemd(void);
+
+#define PIS_STATE_QUIT_RECEIVED         0x01
+#define PIS_STATE_QUIT_QUEUED           0x02
+#define PIS_STATE_QUIT_SENT             0x04
+
+#define PIS_STATE_QUIT_MASK             (PIS_STATE_QUIT_RECEIVED | \
+                                         PIS_STATE_QUIT_QUEUED | \
+                                         PIS_STATE_QUIT_SENT)
+
+struct PostcopyIncomingState {
+    /* dest qemu state */
+    uint32_t    state;
+
+    UMemDev *dev;
+    int host_page_size;
+    int host_page_shift;
+
+    /* qemu side */
+    int to_umemd_fd;
+    QEMUFileNonblock *to_umemd;
+#define MAX_FAULTED_PAGES       256
+    struct umem_pages *faulted_pages;
+
+    int from_umemd_fd;
+    QEMUFile *from_umemd;
+    int version_id;     /* save/load format version id */
+};
+typedef struct PostcopyIncomingState PostcopyIncomingState;
+
+
+#define UMEM_STATE_EOS_RECEIVED         0x01    /* umem daemon <-> src qemu */
+#define UMEM_STATE_EOC_SENT             0x02    /* umem daemon <-> src qemu */
+#define UMEM_STATE_QUIT_RECEIVED        0x04    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_QUEUED          0x08    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_SENT            0x10    /* umem daemon <-> dst qemu */
+
+#define UMEM_STATE_QUIT_MASK            (UMEM_STATE_QUIT_QUEUED | \
+                                         UMEM_STATE_QUIT_SENT | \
+                                         UMEM_STATE_QUIT_RECEIVED)
+#define UMEM_STATE_END_MASK             (UMEM_STATE_EOS_RECEIVED | \
+                                         UMEM_STATE_EOC_SENT | \
+                                         UMEM_STATE_QUIT_MASK)
+
+struct PostcopyIncomingUMemDaemon {
+    /* umem daemon side */
+    uint32_t state;
+
+    int host_page_size;
+    int host_page_shift;
+    int nr_host_pages_per_target_page;
+    int host_to_target_page_shift;
+    int nr_target_pages_per_host_page;
+    int target_to_host_page_shift;
+    int version_id;     /* save/load format version id */
+
+    int to_qemu_fd;
+    QEMUFileNonblock *to_qemu;
+    int from_qemu_fd;
+    QEMUFile *from_qemu;
+
+    int mig_read_fd;
+    QEMUFile *mig_read;         /* qemu on source -> umem daemon */
+
+    int mig_write_fd;
+    QEMUFileNonblock *mig_write;        /* umem daemon -> qemu on source */
+
+    /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */
+#define MAX_REQUESTS    (512 * (64 + 1))
+
+    struct umem_page_request page_request;
+    struct umem_page_cached page_cached;
+
+#define MAX_PRESENT_REQUESTS    MAX_FAULTED_PAGES
+    struct umem_pages *present_request;
+
+    uint64_t *target_pgoffs;
+
+    /* bitmap indexed by target page offset */
+    unsigned long *phys_requested;
+
+    /* bitmap indexed by target page offset */
+    unsigned long *phys_received;
+
+    RAMBlock *last_block_read;  /* qemu on source -> umem daemon */
+    RAMBlock *last_block_write; /* umem daemon -> qemu on source */
+};
+typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon;
+
+static PostcopyIncomingState state = {
+    .state = 0,
+    .dev = NULL,
+    .to_umemd_fd = -1,
+    .to_umemd = NULL,
+    .from_umemd_fd = -1,
+    .from_umemd = NULL,
+};
+
+static PostcopyIncomingUMemDaemon umemd = {
+    .state = 0,
+    .to_qemu_fd = -1,
+    .to_qemu = NULL,
+    .from_qemu_fd = -1,
+    .from_qemu = NULL,
+    .mig_read_fd = -1,
+    .mig_read = NULL,
+    .mig_write_fd = -1,
+    .mig_write = NULL,
+};
+
+int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
+{
+    /* incoming_postcopy makes sense only when incoming migration mode */
+    if (!incoming && incoming_postcopy) {
+        return -EINVAL;
+    }
+
+    if (!incoming_postcopy) {
+        return 0;
+    }
+
+    state.state = 0;
+    state.dev = umem_dev_new();
+    state.host_page_size = getpagesize();
+    state.host_page_shift = ffs(state.host_page_size) - 1;
+    state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
+                                               ram_save_live() */
+    return 0;
+}
+
+void postcopy_incoming_ram_alloc(const char *name,
+                                 size_t size, uint8_t **hostp, UMem **umemp)
+{
+    UMem *umem;
+    size = ALIGN_UP(size, state.host_page_size);
+    umem = umem_dev_create(state.dev, size, name);
+
+    *umemp = umem;
+    *hostp = umem->umem;
+}
+
+void postcopy_incoming_ram_free(UMem *umem)
+{
+    umem_unmap(umem);
+    umem_close(umem);
+    umem_destroy(umem);
+}
+
+void postcopy_incoming_prepare(void)
+{
+    RAMBlock *block;
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        if (block->umem != NULL) {
+            umem_mmap(block->umem);
+        }
+    }
+}
+
+static int postcopy_incoming_ram_load_get64(QEMUFile *f,
+                                             ram_addr_t *addr, int *flags)
+{
+    *addr = qemu_get_be64(f);
+    *flags = *addr & ~TARGET_PAGE_MASK;
+    *addr &= TARGET_PAGE_MASK;
+    return qemu_file_get_error(f);
+}
+
+int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+    ram_addr_t addr;
+    int flags;
+    int error;
+
+    DPRINTF("incoming ram load\n");
+    /*
+     * RAM_SAVE_FLAGS_EOS or
+     * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
+     * see postcopy_outgoing_ram_save_live()
+     */
+
+    if (version_id != RAM_SAVE_VERSION_ID) {
+        DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
+                version_id, RAM_SAVE_VERSION_ID);
+        return -EINVAL;
+    }
+    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
+    DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
+        DPRINTF("EOS\n");
+        return 0;
+    }
+
+    if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
+        DPRINTF("-EINVAL flags 0x%x\n", flags);
+        return -EINVAL;
+    }
+    error = ram_load_mem_size(f, addr);
+    if (error) {
+        DPRINTF("addr 0x%lx error %d\n", addr, error);
+        return error;
+    }
+
+    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
+    if (error) {
+        DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
+        return error;
+    }
+    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
+        DPRINTF("done\n");
+        return 0;
+    }
+    DPRINTF("-EINVAL\n");
+    return -EINVAL;
+}
+
+void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
+{
+    int fds[2];
+    RAMBlock *block;
+
+    DPRINTF("fork\n");
+
+    /* socketpair(AF_UNIX)? */
+
+    if (qemu_pipe(fds) == -1) {
+        perror("qemu_pipe");
+        abort();
+    }
+    state.from_umemd_fd = fds[0];
+    umemd.to_qemu_fd = fds[1];
+
+    if (qemu_pipe(fds) == -1) {
+        perror("qemu_pipe");
+        abort();
+    }
+    umemd.from_qemu_fd = fds[0];
+    state.to_umemd_fd = fds[1];
+
+    pid_t child = fork();
+    if (child < 0) {
+        perror("fork");
+        abort();
+    }
+
+    if (child == 0) {
+        int mig_write_fd;
+
+        fd_close(&state.to_umemd_fd);
+        fd_close(&state.from_umemd_fd);
+        umemd.host_page_size = state.host_page_size;
+        umemd.host_page_shift = state.host_page_shift;
+
+        umemd.nr_host_pages_per_target_page =
+            TARGET_PAGE_SIZE / umemd.host_page_size;
+        umemd.nr_target_pages_per_host_page =
+            umemd.host_page_size / TARGET_PAGE_SIZE;
+
+        umemd.target_to_host_page_shift =
+            ffs(umemd.nr_host_pages_per_target_page) - 1;
+        umemd.host_to_target_page_shift =
+            ffs(umemd.nr_target_pages_per_host_page) - 1;
+
+        umemd.state = 0;
+        umemd.version_id = state.version_id;
+        umemd.mig_read_fd = mig_read_fd;
+        umemd.mig_read = mig_read;
+
+        mig_write_fd = dup(mig_read_fd);
+        if (mig_write_fd < 0) {
+            perror("could not dup for writable socket \n");
+            abort();
+        }
+        umemd.mig_write_fd = mig_write_fd;
+        umemd.mig_write = qemu_fopen_nonblock(mig_write_fd);
+
+        postcopy_incoming_umemd(); /* noreturn */
+    }
+
+    DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child);
+    fd_close(&umemd.to_qemu_fd);
+    fd_close(&umemd.from_qemu_fd);
+    state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES));
+    state.faulted_pages->nr = 0;
+
+    /* close all UMem.shmem_fd */
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        umem_close_shmem(block->umem);
+    }
+    umem_qemu_wait_for_daemon(state.from_umemd_fd);
+}
+
+static void postcopy_incoming_qemu_recv_quit(void)
+{
+    RAMBlock *block;
+    if (state.state & PIS_STATE_QUIT_RECEIVED) {
+        return;
+    }
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        if (block->umem != NULL) {
+            umem_destroy(block->umem);
+            block->umem = NULL;
+            block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
+        }
+    }
+
+    DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
+    state.state |= PIS_STATE_QUIT_RECEIVED;
+    qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
+    qemu_fclose(state.from_umemd);
+    state.from_umemd = NULL;
+    fd_close(&state.from_umemd_fd);
+}
+
+static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque)
+{
+    assert(state.to_umemd != NULL);
+
+    nonblock_fflush(state.to_umemd);
+    if (nonblock_pending_size(state.to_umemd) > 0) {
+        return;
+    }
+
+    qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL);
+    if (state.state & PIS_STATE_QUIT_QUEUED) {
+        DPRINTF("|= PIS_STATE_QUIT_SENT\n");
+        state.state |= PIS_STATE_QUIT_SENT;
+        qemu_fclose(state.to_umemd->file);
+        state.to_umemd = NULL;
+        fd_close(&state.to_umemd_fd);
+        g_free(state.faulted_pages);
+        state.faulted_pages = NULL;
+    }
+}
+
+static void postcopy_incoming_qemu_fflush_to_umemd(void)
+{
+    qemu_set_fd_handler(state.to_umemd->fd, NULL,
+                        postcopy_incoming_qemu_fflush_to_umemd_handler, NULL);
+    postcopy_incoming_qemu_fflush_to_umemd_handler(NULL);
+}
+
+static void postcopy_incoming_qemu_queue_quit(void)
+{
+    if (state.state & PIS_STATE_QUIT_QUEUED) {
+        return;
+    }
+
+    DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
+    umem_qemu_quit(state.to_umemd->file);
+    state.state |= PIS_STATE_QUIT_QUEUED;
+}
+
+static void postcopy_incoming_qemu_send_pages_present(void)
+{
+    if (state.faulted_pages->nr > 0) {
+        umem_qemu_send_pages_present(state.to_umemd->file,
+                                     state.faulted_pages);
+        state.faulted_pages->nr = 0;
+    }
+}
+
+static void postcopy_incoming_qemu_faulted_pages(
+    const struct umem_pages *pages)
+{
+    assert(pages->nr <= MAX_FAULTED_PAGES);
+    assert(state.faulted_pages != NULL);
+
+    if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) {
+        postcopy_incoming_qemu_send_pages_present();
+    }
+    memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr],
+           &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr);
+    state.faulted_pages->nr += pages->nr;
+}
+
+static void postcopy_incoming_qemu_cleanup_umem(void);
+
+static int postcopy_incoming_qemu_handle_req_one(void)
+{
+    int offset = 0;
+    int ret;
+    uint8_t cmd;
+
+    ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset);
+    offset += sizeof(cmd);
+    if (ret != sizeof(cmd)) {
+        return -EAGAIN;
+    }
+    DPRINTF("cmd %c\n", cmd);
+
+    switch (cmd) {
+    case UMEM_DAEMON_QUIT:
+        postcopy_incoming_qemu_recv_quit();
+        postcopy_incoming_qemu_queue_quit();
+        postcopy_incoming_qemu_cleanup_umem();
+        break;
+    case UMEM_DAEMON_TRIGGER_PAGE_FAULT: {
+        struct umem_pages *pages =
+            umem_qemu_trigger_page_fault(state.from_umemd, &offset);
+        if (pages == NULL) {
+            return -EAGAIN;
+        }
+        if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) {
+            postcopy_incoming_qemu_faulted_pages(pages);
+            g_free(pages);
+        }
+        break;
+    }
+    case UMEM_DAEMON_ERROR:
+        /* umem daemon hit troubles, so it warned us to stop vm execution */
+        vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
+        break;
+    default:
+        abort();
+        break;
+    }
+
+    if (state.from_umemd != NULL) {
+        qemu_file_skip(state.from_umemd, offset);
+    }
+    return 0;
+}
+
+static void postcopy_incoming_qemu_handle_req(void *opaque)
+{
+    do {
+        int ret = postcopy_incoming_qemu_handle_req_one();
+        if (ret == -EAGAIN) {
+            break;
+        }
+    } while (state.from_umemd != NULL &&
+             qemu_pending_size(state.from_umemd) > 0);
+
+    if (state.to_umemd != NULL) {
+        if (state.faulted_pages->nr > 0) {
+            postcopy_incoming_qemu_send_pages_present();
+        }
+        postcopy_incoming_qemu_fflush_to_umemd();
+    }
+}
+
+void postcopy_incoming_qemu_ready(void)
+{
+    umem_qemu_ready(state.to_umemd_fd);
+
+    state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd);
+    state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd);
+    qemu_set_fd_handler(state.from_umemd_fd,
+                        postcopy_incoming_qemu_handle_req, NULL, NULL);
+}
+
+static void postcopy_incoming_qemu_cleanup_umem(void)
+{
+    /* when qemu will quit before completing postcopy, tell umem daemon
+       to tear down umem device and exit. */
+    if (state.to_umemd_fd >= 0) {
+        postcopy_incoming_qemu_queue_quit();
+        postcopy_incoming_qemu_fflush_to_umemd();
+    }
+
+    if (state.dev) {
+        umem_dev_destroy(state.dev);
+        state.dev = NULL;
+    }
+}
+
+void postcopy_incoming_qemu_cleanup(void)
+{
+    postcopy_incoming_qemu_cleanup_umem();
+    if (state.to_umemd != NULL) {
+        nonblock_wait_for_flush(state.to_umemd);
+    }
+}
+
+void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
+{
+    uint64_t nr = DIV_ROUND_UP(size, state.host_page_size);
+    size_t len = umem_pages_size(nr);
+    ram_addr_t end = addr + size;
+    struct umem_pages *pages;
+    int i;
+
+    if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) {
+        return;
+    }
+    pages = g_malloc(len);
+    pages->nr = nr;
+    for (i = 0; addr < end; addr += state.host_page_size, i++) {
+        pages->pgoffs[i] = addr >> state.host_page_shift;
+    }
+    umem_qemu_send_pages_unmapped(state.to_umemd->file, pages);
+    g_free(pages);
+    assert(state.to_umemd != NULL);
+    postcopy_incoming_qemu_fflush_to_umemd();
+}
+
+/**************************************************************************
+ * incoming umem daemon
+ */
+
+static void postcopy_incoming_umem_recv_quit(void)
+{
+    if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
+        return;
+    }
+    DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
+    umemd.state |= UMEM_STATE_QUIT_RECEIVED;
+    qemu_fclose(umemd.from_qemu);
+    umemd.from_qemu = NULL;
+    fd_close(&umemd.from_qemu_fd);
+}
+
+static void postcopy_incoming_umem_queue_quit(void)
+{
+    if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
+        return;
+    }
+    DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
+    umem_daemon_quit(umemd.to_qemu->file);
+    umemd.state |= UMEM_STATE_QUIT_QUEUED;
+}
+
+static void postcopy_incoming_umem_send_eoc_req(void)
+{
+    struct qemu_umem_req req;
+
+    if (umemd.state & UMEM_STATE_EOC_SENT) {
+        return;
+    }
+
+    DPRINTF("|= UMEM_STATE_EOC_SENT\n");
+    req.cmd = QEMU_UMEM_REQ_EOC;
+    postcopy_incoming_send_req(umemd.mig_write->file, &req);
+    umemd.state |= UMEM_STATE_EOC_SENT;
+    qemu_fclose(umemd.mig_write->file);
+    umemd.mig_write = NULL;
+    fd_close(&umemd.mig_write_fd);
+}
+
+static void postcopy_incoming_umem_send_page_req(RAMBlock *block)
+{
+    struct qemu_umem_req req;
+    int bit;
+    uint64_t target_pgoff;
+    int i;
+
+    umemd.page_request.nr = MAX_REQUESTS;
+    umem_get_page_request(block->umem, &umemd.page_request);
+    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
+            block->idstr, umemd.page_request.nr,
+            (uint64_t)umemd.page_request.pgoffs[0],
+            (uint64_t)umemd.page_request.pgoffs[1]);
+
+    if (umemd.last_block_write != block) {
+        req.cmd = QEMU_UMEM_REQ_ON_DEMAND;
+        req.idstr = block->idstr;
+    } else {
+        req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
+    }
+
+    req.nr = 0;
+    req.pgoffs = umemd.target_pgoffs;
+    if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
+        for (i = 0; i < umemd.page_request.nr; i++) {
+            target_pgoff =
+                umemd.page_request.pgoffs[i] >> 
umemd.host_to_target_page_shift;
+            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
+
+            if (!test_and_set_bit(bit, umemd.phys_requested)) {
+                req.pgoffs[req.nr] = target_pgoff;
+                req.nr++;
+            }
+        }
+    } else {
+        for (i = 0; i < umemd.page_request.nr; i++) {
+            int j;
+            target_pgoff =
+                umemd.page_request.pgoffs[i] << 
umemd.host_to_target_page_shift;
+            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
+
+            for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
+                if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
+                    req.pgoffs[req.nr] = target_pgoff + j;
+                    req.nr++;
+                }
+            }
+        }
+    }
+
+    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
+            block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
+    if (req.nr > 0 && umemd.mig_write != NULL) {
+        postcopy_incoming_send_req(umemd.mig_write->file, &req);
+        umemd.last_block_write = block;
+    }
+}
+
+static void postcopy_incoming_umem_send_pages_present(void)
+{
+    if (umemd.present_request->nr > 0) {
+        umem_daemon_send_pages_present(umemd.to_qemu->file,
+                                       umemd.present_request);
+        umemd.present_request->nr = 0;
+    }
+}
+
+static void postcopy_incoming_umem_pages_present_one(
+    uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset)
+{
+    uint32_t i;
+    assert(nr <= MAX_PRESENT_REQUESTS);
+
+    if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) {
+        postcopy_incoming_umem_send_pages_present();
+    }
+
+    for (i = 0; i < nr; i++) {
+        umemd.present_request->pgoffs[umemd.present_request->nr + i] =
+            pgoffs[i] + ramblock_pgoffset;
+    }
+    umemd.present_request->nr += nr;
+}
+
+static void postcopy_incoming_umem_pages_present(
+    const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset)
+{
+    uint32_t left = page_cached->nr;
+    uint32_t offset = 0;
+
+    while (left > 0) {
+        uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS);
+        postcopy_incoming_umem_pages_present_one(
+            nr, &page_cached->pgoffs[offset], ramblock_pgoffset);
+
+        left -= nr;
+        offset += nr;
+    }
+}
+
+static int postcopy_incoming_umem_ram_load(void)
+{
+    ram_addr_t offset;
+    int flags;
+    int error;
+    void *shmem;
+    int i;
+    int bit;
+
+    if (umemd.version_id != RAM_SAVE_VERSION_ID) {
+        return -EINVAL;
+    }
+
+    offset = qemu_get_be64(umemd.mig_read);
+
+    flags = offset & ~TARGET_PAGE_MASK;
+    offset &= TARGET_PAGE_MASK;
+
+    assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));
+
+    if (flags & RAM_SAVE_FLAG_EOS) {
+        DPRINTF("RAM_SAVE_FLAG_EOS\n");
+        postcopy_incoming_umem_send_eoc_req();
+
+        qemu_fclose(umemd.mig_read);
+        umemd.mig_read = NULL;
+        fd_close(&umemd.mig_read_fd);
+        umemd.state |= UMEM_STATE_EOS_RECEIVED;
+
+        postcopy_incoming_umem_queue_quit();
+        DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
+        return 0;
+    }
+
+    shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags,
+                                             &umemd.last_block_read);
+    if (!shmem) {
+        DPRINTF("shmem == NULL\n");
+        return -EINVAL;
+    }
+
+    if (flags & RAM_SAVE_FLAG_COMPRESS) {
+        uint8_t ch = qemu_get_byte(umemd.mig_read);
+        memset(shmem, ch, TARGET_PAGE_SIZE);
+    } else if (flags & RAM_SAVE_FLAG_PAGE) {
+        qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE);
+    }
+
+    error = qemu_file_get_error(umemd.mig_read);
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+
+    umemd.page_cached.nr = 0;
+    bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS;
+    if (!test_and_set_bit(bit, umemd.phys_received)) {
+        if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
+            __u64 pgoff = offset >> umemd.host_page_shift;
+            for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
+                umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i;
+                umemd.page_cached.nr++;
+            }
+        } else {
+            bool mark_cache = true;
+            for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
+                if (!test_bit(bit + i, umemd.phys_received)) {
+                    mark_cache = false;
+                    break;
+                }
+            }
+            if (mark_cache) {
+                umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift;
+                umemd.page_cached.nr = 1;
+            }
+        }
+    }
+
+    if (umemd.page_cached.nr > 0) {
+        umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached);
+
+        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 &&
+            (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) {
+            uint64_t ramblock_pgoffset;
+
+            ramblock_pgoffset =
+                umemd.last_block_read->offset >> umemd.host_page_shift;
+            postcopy_incoming_umem_pages_present(&umemd.page_cached,
+                                                 ramblock_pgoffset);
+        }
+    }
+
+    return 0;
+}
+
+static bool postcopy_incoming_umem_check_umem_done(void)
+{
+    bool all_done = true;
+    RAMBlock *block;
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        UMem *umem = block->umem;
+        if (umem != NULL && umem->nsets == umem->nbits) {
+            umem_unmap_shmem(umem);
+            umem_destroy(umem);
+            block->umem = NULL;
+        }
+        if (block->umem != NULL) {
+            all_done = false;
+        }
+    }
+    return all_done;
+}
+
+static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages)
+{
+    int i;
+
+    for (i = 0; i < pages->nr; i++) {
+        ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift;
+        RAMBlock *block = qemu_get_ram_block(addr);
+        addr -= block->offset;
+        umem_remove_shmem(block->umem, addr, umemd.host_page_size);
+    }
+    return postcopy_incoming_umem_check_umem_done();
+}
+
+static bool
+postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages)
+{
+    RAMBlock *block;
+    ram_addr_t addr;
+    int i;
+
+    struct qemu_umem_req req = {
+        .cmd = QEMU_UMEM_REQ_REMOVE,
+        .nr = 0,
+        .pgoffs = (uint64_t*)pages->pgoffs,
+    };
+
+    addr = pages->pgoffs[0] << umemd.host_page_shift;
+    block = qemu_get_ram_block(addr);
+
+    for (i = 0; i < pages->nr; i++)  {
+        int pgoff;
+
+        addr = pages->pgoffs[i] << umemd.host_page_shift;
+        pgoff = addr >> TARGET_PAGE_BITS;
+        if (!test_bit(pgoff, umemd.phys_received) &&
+            !test_bit(pgoff, umemd.phys_requested)) {
+            req.pgoffs[req.nr] = pgoff;
+            req.nr++;
+        }
+        set_bit(pgoff, umemd.phys_received);
+        set_bit(pgoff, umemd.phys_requested);
+
+        umem_remove_shmem(block->umem,
+                          addr - block->offset, umemd.host_page_size);
+    }
+    if (req.nr > 0 && umemd.mig_write != NULL) {
+        req.idstr = block->idstr;
+        postcopy_incoming_send_req(umemd.mig_write->file, &req);
+    }
+
+    return postcopy_incoming_umem_check_umem_done();
+}
+
+static void postcopy_incoming_umem_done(void)
+{
+    postcopy_incoming_umem_send_eoc_req();
+    postcopy_incoming_umem_queue_quit();
+}
+
+static int postcopy_incoming_umem_handle_qemu(void)
+{
+    int ret;
+    int offset = 0;
+    uint8_t cmd;
+
+    ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset);
+    offset += sizeof(cmd);
+    if (ret != sizeof(cmd)) {
+        return -EAGAIN;
+    }
+    DPRINTF("cmd %c\n", cmd);
+    switch (cmd) {
+    case UMEM_QEMU_QUIT:
+        postcopy_incoming_umem_recv_quit();
+        postcopy_incoming_umem_done();
+        break;
+    case UMEM_QEMU_PAGE_FAULTED: {
+        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
+                                                   &offset);
+        if (pages == NULL) {
+            return -EAGAIN;
+        }
+        if (postcopy_incoming_umem_page_faulted(pages)){
+            postcopy_incoming_umem_done();
+        }
+        g_free(pages);
+        break;
+    }
+    case UMEM_QEMU_PAGE_UNMAPPED: {
+        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
+                                                   &offset);
+        if (pages == NULL) {
+            return -EAGAIN;
+        }
+        if (postcopy_incoming_umem_page_unmapped(pages)){
+            postcopy_incoming_umem_done();
+        }
+        g_free(pages);
+        break;
+    }
+    default:
+        abort();
+        break;
+    }
+    if (umemd.from_qemu != NULL) {
+        qemu_file_skip(umemd.from_qemu, offset);
+    }
+    return 0;
+}
+
+static void set_fd(int fd, fd_set *fds, int *nfds)
+{
+    FD_SET(fd, fds);
+    if (fd > *nfds) {
+        *nfds = fd;
+    }
+}
+
+static int postcopy_incoming_umemd_main_loop(void)
+{
+    fd_set writefds;
+    fd_set readfds;
+    int nfds;
+    RAMBlock *block;
+    int ret;
+
+    int pending_size;
+    bool get_page_request;
+
+    nfds = -1;
+    FD_ZERO(&writefds);
+    FD_ZERO(&readfds);
+
+    if (umemd.mig_write != NULL) {
+        pending_size = nonblock_pending_size(umemd.mig_write);
+        if (pending_size > 0) {
+            set_fd(umemd.mig_write_fd, &writefds, &nfds);
+        }
+    } else {
+        pending_size = 0;
+    }
+
+#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2)
+    /* If page request to the migration source is accumulated,
+       suspend getting page fault request. */
+    get_page_request = (pending_size <= PENDING_SIZE_MAX);
+
+    if (get_page_request) {
+        QLIST_FOREACH(block, &ram_list.blocks, next) {
+            if (block->umem != NULL) {
+                set_fd(block->umem->fd, &readfds, &nfds);
+            }
+        }
+    }
+
+    if (umemd.mig_read_fd >= 0) {
+        set_fd(umemd.mig_read_fd, &readfds, &nfds);
+    }
+
+    if (umemd.to_qemu != NULL &&
+        nonblock_pending_size(umemd.to_qemu) > 0) {
+        set_fd(umemd.to_qemu_fd, &writefds, &nfds);
+    }
+    if (umemd.from_qemu_fd >= 0) {
+        set_fd(umemd.from_qemu_fd, &readfds, &nfds);
+    }
+
+    ret = select(nfds + 1, &readfds, &writefds, NULL, NULL);
+    if (ret == -1) {
+        if (errno == EINTR) {
+            return 0;
+        }
+        return ret;
+    }
+
+    if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) {
+        nonblock_fflush(umemd.mig_write);
+    }
+    if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) {
+        nonblock_fflush(umemd.to_qemu);
+    }
+    if (get_page_request) {
+        QLIST_FOREACH(block, &ram_list.blocks, next) {
+            if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) {
+                postcopy_incoming_umem_send_page_req(block);
+            }
+        }
+    }
+    if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) {
+        do {
+            ret = postcopy_incoming_umem_ram_load();
+            if (ret < 0) {
+                return ret;
+            }
+        } while (umemd.mig_read != NULL &&
+                 qemu_pending_size(umemd.mig_read) > 0);
+    }
+    if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) {
+        do {
+            ret = postcopy_incoming_umem_handle_qemu();
+            if (ret == -EAGAIN) {
+                break;
+            }
+        } while (umemd.from_qemu != NULL &&
+                 qemu_pending_size(umemd.from_qemu) > 0);
+    }
+
+    if (umemd.mig_write != NULL) {
+        nonblock_fflush(umemd.mig_write);
+    }
+    if (umemd.to_qemu != NULL) {
+        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) {
+            postcopy_incoming_umem_send_pages_present();
+        }
+        nonblock_fflush(umemd.to_qemu);
+        if ((umemd.state & UMEM_STATE_QUIT_QUEUED) &&
+            nonblock_pending_size(umemd.to_qemu) == 0) {
+            DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
+            qemu_fclose(umemd.to_qemu->file);
+            umemd.to_qemu = NULL;
+            fd_close(&umemd.to_qemu_fd);
+            umemd.state |= UMEM_STATE_QUIT_SENT;
+        }
+    }
+
+    return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK;
+}
+
+static void postcopy_incoming_umemd(void)
+{
+    ram_addr_t last_ram_offset;
+    int nbits;
+    RAMBlock *block;
+    int ret;
+
+    qemu_daemon(1, 1);
+    signal(SIGPIPE, SIG_IGN);
+    DPRINTF("daemon pid: %d\n", getpid());
+
+    umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS);
+    umemd.page_cached.pgoffs =
+        g_new(__u64, MAX_REQUESTS *
+              (TARGET_PAGE_SIZE >= umemd.host_page_size ?
+               1: umemd.nr_host_pages_per_target_page));
+    umemd.target_pgoffs =
+        g_new(uint64_t, MAX_REQUESTS *
+              MAX(umemd.nr_host_pages_per_target_page,
+                  umemd.nr_target_pages_per_host_page));
+    umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS));
+    umemd.present_request->nr = 0;
+
+    last_ram_offset = qemu_last_ram_offset();
+    nbits = last_ram_offset >> TARGET_PAGE_BITS;
+    umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits));
+    umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits));
+    umemd.last_block_read = NULL;
+    umemd.last_block_write = NULL;
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        UMem *umem = block->umem;
+        umem->umem = NULL;      /* umem mapping area has VM_DONT_COPY flag,
+                                   so we lost those mappings by fork */
+        block->host = umem_map_shmem(umem);
+        umem_close_shmem(umem);
+    }
+    umem_daemon_ready(umemd.to_qemu_fd);
+    umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd);
+
+    /* wait for qemu to disown migration_fd */
+    umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
+    umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd);
+
+    DPRINTF("entering umemd main loop\n");
+    for (;;) {
+        ret = postcopy_incoming_umemd_main_loop();
+        if (ret != 0) {
+            break;
+        }
+    }
+    DPRINTF("exiting umemd main loop\n");
+
+    /* This daemon forked from qemu and the parent qemu is still running.
+     * Cleanups of linked libraries like SDL should not be triggered,
+     * otherwise the parent qemu may use resources which was already freed.
+     */
+    fflush(stdout);
+    fflush(stderr);
+    _exit(ret < 0? EXIT_FAILURE: 0);
+}
diff --git a/migration-tcp.c b/migration-tcp.c
index cf6a9b8..aa35050 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque)
     } while (ret == -1 && (socket_error()) == EINTR);
 
     if (ret < 0) {
-        migrate_fd_error(s);
-        return;
+        goto error_out;
     }
 
     qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
 
-    if (val == 0)
+    if (val == 0) {
+        ret = postcopy_outgoing_create_read_socket(s);
+        if (ret < 0) {
+            goto error_out;
+        }
         migrate_fd_connect(s);
-    else {
+    } else {
         DPRINTF("error connecting %d\n", val);
-        migrate_fd_error(s);
+        goto error_out;
     }
+    return;
+
+error_out:
+    migrate_fd_error(s);
 }
 
 int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
@@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const 
char *host_port)
 
     if (ret < 0) {
         DPRINTF("connect failed\n");
-        migrate_fd_error(s);
-        return ret;
+        goto error_out;
+    }
+
+    ret = postcopy_outgoing_create_read_socket(s);
+    if (ret < 0) {
+        goto error_out;
     }
     migrate_fd_connect(s);
     return 0;
+
+error_out:
+    migrate_fd_error(s);
+    return ret;
 }
 
 static void tcp_accept_incoming_migration(void *opaque)
@@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque)
     }
 
     process_incoming_migration(f);
+    if (incoming_postcopy) {
+        postcopy_incoming_fork_umemd(c, f);
+    }
     qemu_fclose(f);
+    if (incoming_postcopy) {
+        /* now socket is disowned.
+           So tell umem server that it's safe to use it */
+        postcopy_incoming_qemu_ready();
+    }
 out:
     close(c);
 out2:
diff --git a/migration-unix.c b/migration-unix.c
index dfcf203..3707505 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque)
 
     qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
 
-    if (val == 0)
+    if (val == 0) {
+        ret = postcopy_outgoing_create_read_socket(s);
+        if (ret < 0) {
+            goto error_out;
+        }
         migrate_fd_connect(s);
-    else {
+    } else {
         DPRINTF("error connecting %d\n", val);
-        migrate_fd_error(s);
+        goto error_out;
     }
+    return;
+
+error_out:
+    migrate_fd_error(s);
 }
 
 int unix_start_outgoing_migration(MigrationState *s, const char *path)
@@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, 
const char *path)
 
     if (ret < 0) {
         DPRINTF("connect failed\n");
-        migrate_fd_error(s);
-        return ret;
+        goto error_out;
+    }
+
+    ret = postcopy_outgoing_create_read_socket(s);
+    if (ret < 0) {
+        goto error_out;
     }
     migrate_fd_connect(s);
     return 0;
+
+error_out:
+    migrate_fd_error(s);
+    return ret;
 }
 
 static void unix_accept_incoming_migration(void *opaque)
@@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque)
     }
 
     process_incoming_migration(f);
+    if (incoming_postcopy) {
+        postcopy_incoming_fork_umemd(c, f);
+    }
     qemu_fclose(f);
+    if (incoming_postcopy) {
+        postcopy_incoming_qemu_ready();
+    }
 out:
     close(c);
 out2:
diff --git a/migration.c b/migration.c
index 0149ab3..51efe44 100644
--- a/migration.c
+++ b/migration.c
@@ -39,6 +39,11 @@ enum {
     MIG_STATE_COMPLETED,
 };
 
+enum {
+    MIG_SUBSTATE_PRECOPY,
+    MIG_SUBSTATE_POSTCOPY,
+};
+
 #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
 
 static NotifierList migration_state_notifiers =
@@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void *opaque)
         return;
     }
 
+    if (s->substate == MIG_SUBSTATE_POSTCOPY) {
+        /* PRINTF("postcopy background\n"); */
+        ret = postcopy_outgoing_ram_save_background(s->mon, s->file,
+                                                    s->postcopy);
+        if (ret > 0) {
+            migrate_fd_completed(s);
+        } else if (ret < 0) {
+            migrate_fd_error(s);
+        }
+        return;
+    }
+
     DPRINTF("iterate\n");
     ret = qemu_savevm_state_iterate(s->mon, s->file);
     if (ret < 0) {
@@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque)
         DPRINTF("done iterating\n");
         vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 
+        if (s->params.postcopy) {
+            if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
+                migrate_fd_error(s);
+                if (old_vm_running) {
+                    vm_start();
+                }
+                return;
+            }
+            s->substate = MIG_SUBSTATE_POSTCOPY;
+            s->postcopy = postcopy_outgoing_begin(s);
+            return;
+        }
+
         if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
             migrate_fd_error(s);
         } else {
@@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s)
     int ret;
 
     s->state = MIG_STATE_ACTIVE;
+    s->substate = MIG_SUBSTATE_PRECOPY;
     s->file = qemu_fopen_ops_buffered(s,
                                       s->bandwidth_limit,
                                       migrate_fd_put_buffer,
diff --git a/migration.h b/migration.h
index 90ae362..2809e99 100644
--- a/migration.h
+++ b/migration.h
@@ -40,6 +40,12 @@ struct MigrationState
     int (*write)(MigrationState *s, const void *buff, size_t size);
     void *opaque;
     MigrationParams params;
+
+    /* for postcopy */
+    int substate;              /* precopy or postcopy */
+    int fd_read;
+    QEMUFile *file_read;        /* connection from the detination */
+    void *postcopy;
 };
 
 void process_incoming_migration(QEMUFile *f);
@@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void);
 uint64_t ram_bytes_transferred(void);
 uint64_t ram_bytes_total(void);
 
+void ram_save_set_params(const MigrationParams *params, void *opaque);
 void sort_ram_list(void);
 int ram_save_block(QEMUFile *f);
 void ram_save_memory_set_dirty(void);
@@ -107,7 +114,30 @@ void migrate_add_blocker(Error *reason);
  */
 void migrate_del_blocker(Error *reason);
 
+/* For outgoing postcopy */
+int postcopy_outgoing_create_read_socket(MigrationState *s);
+int postcopy_outgoing_ram_save_live(Monitor *mon,
+                                    QEMUFile *f, int stage, void *opaque);
+void *postcopy_outgoing_begin(MigrationState *s);
+int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
+                                          void *postcopy);
+
+/* For incoming postcopy */
 extern bool incoming_postcopy;
 extern unsigned long incoming_postcopy_flags;
 
+int postcopy_incoming_init(const char *incoming, bool incoming_postcopy);
+void postcopy_incoming_ram_alloc(const char *name,
+                                 size_t size, uint8_t **hostp, UMem **umemp);
+void postcopy_incoming_ram_free(UMem *umem);
+void postcopy_incoming_prepare(void);
+
+int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id);
+void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read);
+void postcopy_incoming_qemu_ready(void);
+void postcopy_incoming_qemu_cleanup(void);
+#ifdef NEED_CPU_H
+void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size);
+#endif
+
 #endif
diff --git a/qemu-common.h b/qemu-common.h
index 725922b..d74a8c9 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState;
 
 struct Monitor;
 typedef struct Monitor Monitor;
+typedef struct UMem UMem;
 
 /* we put basic includes here to avoid repeating them in device drivers */
 #include <stdlib.h>
diff --git a/qemu-options.hx b/qemu-options.hx
index 5c5b8f3..19e20f9 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, 
QEMU_OPTION_postcopy_flags,
     "-postcopy-flags unsigned-int(flags)\n"
     "                  flags for postcopy incoming migration\n"
     "                   when -incoming and -postcopy are specified.\n"
-    "                   This is for benchmark/debug purpose (default: 0)\n",
+    "                   This is for benchmark/debug purpose (default: 0)\n"
+    "                   Currently supprted flags are\n"
+    "                   1: enable fault request from umemd to qemu\n"
+    "                      (default: disabled)\n",
     QEMU_ARCH_ALL)
 STEXI
 @item -postcopy-flags int
diff --git a/umem.c b/umem.c
new file mode 100644
index 0000000..b7be006
--- /dev/null
+++ b/umem.c
@@ -0,0 +1,379 @@
+/*
+ * umem.c: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/umem.h>
+
+#include "bitops.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "umem.h"
+
+//#define DEBUG_UMEM
+#ifdef DEBUG_UMEM
+#include <sys/syscall.h>
+#define DPRINTF(format, ...)                                            \
+    do {                                                                \
+        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
+               __func__, __LINE__, ## __VA_ARGS__);                     \
+    } while (0)
+#else
+#define DPRINTF(format, ...)    do { } while (0)
+#endif
+
+#define DEV_UMEM        "/dev/umem"
+
+struct UMemDev {
+    int fd;
+    int page_shift;
+};
+
+UMemDev *umem_dev_new(void)
+{
+    UMemDev *umem_dev;
+    int umem_dev_fd = open(DEV_UMEM, O_RDWR);
+    if (umem_dev_fd < 0) {
+        perror("can't open "DEV_UMEM);
+        abort();
+    }
+
+    umem_dev = g_new(UMemDev, 1);
+    umem_dev->fd = umem_dev_fd;
+    umem_dev->page_shift = ffs(getpagesize()) - 1;
+    return umem_dev;
+}
+
+void umem_dev_destroy(UMemDev *dev)
+{
+    close(dev->fd);
+    g_free(dev);
+}
+
+UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name)
+{
+    struct umem_create create = {
+        .size = size,
+        .async_req_max = 0,
+        .sync_req_max = 0,
+    };
+    UMem *umem;
+
+    snprintf(create.name.id, sizeof(create.name.id),
+             "pid-%"PRId64, (uint64_t)getpid());
+    create.name.id[UMEM_ID_MAX - 1] = 0;
+    strncpy(create.name.name, name, sizeof(create.name.name));
+    create.name.name[UMEM_NAME_MAX - 1] = 0;
+
+    assert((size % getpagesize()) == 0);
+    if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) {
+        perror("UMEM_DEV_CREATE_UMEM");
+        abort();
+    }
+    if (ftruncate(create.shmem_fd, create.size) < 0) {
+        perror("truncate(\"shmem_fd\")");
+        abort();
+    }
+
+    umem = g_new(UMem, 1);
+    umem->nbits = 0;
+    umem->nsets = 0;
+    umem->faulted = NULL;
+    umem->page_shift = dev->page_shift;
+    umem->fd = create.umem_fd;
+    umem->shmem_fd = create.shmem_fd;
+    umem->size = create.size;
+    umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (umem->umem == MAP_FAILED) {
+        perror("mmap(UMem) failed");
+        abort();
+    }
+    return umem;
+}
+
+void umem_mmap(UMem *umem)
+{
+    void *ret = mmap(umem->umem, umem->size,
+                     PROT_EXEC | PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
+    if (ret == MAP_FAILED) {
+        perror("umem_mmap(UMem) failed");
+        abort();
+    }
+}
+
+void umem_destroy(UMem *umem)
+{
+    if (umem->fd != -1) {
+        close(umem->fd);
+    }
+    if (umem->shmem_fd != -1) {
+        close(umem->shmem_fd);
+    }
+    g_free(umem->faulted);
+    g_free(umem);
+}
+
+void umem_get_page_request(UMem *umem, struct umem_page_request *page_request)
+{
+    if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) {
+        perror("daemon: UMEM_GET_PAGE_REQUEST");
+        abort();
+    }
+}
+
+void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached)
+{
+    if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) {
+        perror("daemon: UMEM_MARK_PAGE_CACHED");
+        abort();
+    }
+}
+
+void umem_unmap(UMem *umem)
+{
+    munmap(umem->umem, umem->size);
+    umem->umem = NULL;
+}
+
+void umem_close(UMem *umem)
+{
+    close(umem->fd);
+    umem->fd = -1;
+}
+
+void *umem_map_shmem(UMem *umem)
+{
+    umem->nbits = umem->size >> umem->page_shift;
+    umem->nsets = 0;
+    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
+
+    umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       umem->shmem_fd, 0);
+    if (umem->shmem == MAP_FAILED) {
+        perror("daemon: mmap(\"shmem\")");
+        abort();
+    }
+    return umem->shmem;
+}
+
+void umem_unmap_shmem(UMem *umem)
+{
+    munmap(umem->shmem, umem->size);
+    umem->shmem = NULL;
+}
+
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
+{
+    int s = offset >> umem->page_shift;
+    int e = (offset + size) >> umem->page_shift;
+    int i;
+
+    for (i = s; i < e; i++) {
+        if (!test_and_set_bit(i, umem->faulted)) {
+            umem->nsets++;
+#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
+            madvise(umem->shmem + offset, size, MADV_REMOVE);
+#endif
+        }
+    }
+}
+
+void umem_close_shmem(UMem *umem)
+{
+    close(umem->shmem_fd);
+    umem->shmem_fd = -1;
+}
+
+/***************************************************************************/
+/* qemu <-> umem daemon communication */
+
+size_t umem_pages_size(uint64_t nr)
+{
+    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
+}
+
+static void umem_write_cmd(int fd, uint8_t cmd)
+{
+    DPRINTF("write cmd %c\n", cmd);
+
+    for (;;) {
+        ssize_t ret = write(fd, &cmd, 1);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            } else if (errno == EPIPE) {
+                perror("pipe");
+                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
+                        cmd, ret, errno);
+                break;
+            }
+
+            perror("pipe");
+            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
+            abort();
+        }
+
+        break;
+    }
+}
+
+static void umem_read_cmd(int fd, uint8_t expect)
+{
+    uint8_t cmd;
+    for (;;) {
+        ssize_t ret = read(fd, &cmd, 1);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+            perror("pipe");
+            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
+            abort();
+        }
+
+        if (ret == 0) {
+            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
+            abort();
+        }
+
+        break;
+    }
+
+    DPRINTF("read cmd %c\n", cmd);
+    if (cmd != expect) {
+        DPRINTF("cmd %c expect %d\n", cmd, expect);
+        abort();
+    }
+}
+
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
+{
+    int ret;
+    uint64_t nr;
+    size_t size;
+    struct umem_pages *pages;
+
+    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
+    *offset += sizeof(nr);
+    DPRINTF("ret %d nr %ld\n", ret, nr);
+    if (ret != sizeof(nr) || nr == 0) {
+        return NULL;
+    }
+
+    size = umem_pages_size(nr);
+    pages = g_malloc(size);
+    pages->nr = nr;
+    size -= sizeof(pages->nr);
+
+    ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
+    *offset += size;
+    if (ret != size) {
+        g_free(pages);
+        return NULL;
+    }
+    return pages;
+}
+
+static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
+{
+    size_t len = umem_pages_size(pages->nr);
+    qemu_put_buffer(f, (const uint8_t*)pages, len);
+}
+
+/* umem daemon -> qemu */
+void umem_daemon_ready(int to_qemu_fd)
+{
+    umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
+}
+
+void umem_daemon_quit(QEMUFile *to_qemu)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
+}
+
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
+    umem_send_pages(to_qemu, pages);
+}
+
+void umem_daemon_wait_for_qemu(int from_qemu_fd)
+{
+    umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
+}
+
+/* qemu -> umem daemon */
+void umem_qemu_wait_for_daemon(int from_umemd_fd)
+{
+    umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
+}
+
+void umem_qemu_ready(int to_umemd_fd)
+{
+    umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
+}
+
+void umem_qemu_quit(QEMUFile *to_umemd)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
+}
+
+/* qemu side handler */
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset)
+{
+    uint64_t i;
+    int page_shift = ffs(getpagesize()) - 1;
+    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
+    if (pages == NULL) {
+        return NULL;
+    }
+
+    for (i = 0; i < pages->nr; i++) {
+        ram_addr_t addr = pages->pgoffs[i] << page_shift;
+
+        /* make pages present by forcibly triggering page fault. */
+        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
+        uint8_t dummy_read = ram[0];
+        (void)dummy_read;   /* suppress unused variable warning */
+    }
+
+    return pages;
+}
+
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
+    umem_send_pages(to_umemd, pages);
+}
+
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
+    umem_send_pages(to_umemd, pages);
+}
diff --git a/umem.h b/umem.h
new file mode 100644
index 0000000..5ca19ef
--- /dev/null
+++ b/umem.h
@@ -0,0 +1,105 @@
+/*
+ * umem.h: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_UMEM_H
+#define QEMU_UMEM_H
+
+#include <linux/umem.h>
+
+#include "qemu-common.h"
+
+typedef struct UMemDev UMemDev;
+
+struct UMem {
+    void *umem;
+    int fd;
+    void *shmem;
+    int shmem_fd;
+    uint64_t size;
+
+    /* indexed by host page size */
+    int page_shift;
+    int nbits;
+    int nsets;
+    unsigned long *faulted;
+};
+
+UMemDev *umem_dev_new(void);
+void umem_dev_destroy(UMemDev *dev);
+UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name);
+void umem_mmap(UMem *umem);
+
+void umem_destroy(UMem *umem);
+
+/* umem device operations */
+void umem_get_page_request(UMem *umem, struct umem_page_request *page_request);
+void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached);
+void umem_unmap(UMem *umem);
+void umem_close(UMem *umem);
+
+/* umem shmem operations */
+void *umem_map_shmem(UMem *umem);
+void umem_unmap_shmem(UMem *umem);
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
+void umem_close_shmem(UMem *umem);
+
+/* qemu on source <-> umem daemon communication */
+
+struct umem_pages {
+    uint64_t nr;        /* nr = 0 means completed */
+    uint64_t pgoffs[0];
+};
+
+/* daemon -> qemu */
+#define UMEM_DAEMON_READY               'R'
+#define UMEM_DAEMON_QUIT                'Q'
+#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'
+#define UMEM_DAEMON_ERROR               'E'
+
+/* qemu -> daemon */
+#define UMEM_QEMU_READY                 'r'
+#define UMEM_QEMU_QUIT                  'q'
+#define UMEM_QEMU_PAGE_FAULTED          't'
+#define UMEM_QEMU_PAGE_UNMAPPED         'u'
+
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
+size_t umem_pages_size(uint64_t nr);
+
+/* for umem daemon */
+void umem_daemon_ready(int to_qemu_fd);
+void umem_daemon_wait_for_qemu(int from_qemu_fd);
+void umem_daemon_quit(QEMUFile *to_qemu);
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages);
+
+/* for qemu */
+void umem_qemu_wait_for_daemon(int from_umemd_fd);
+void umem_qemu_ready(int to_umemd_fd);
+void umem_qemu_quit(QEMUFile *to_umemd);
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset);
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages);
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages);
+
+#endif /* QEMU_UMEM_H */
diff --git a/vl.c b/vl.c
index 5430b8c..17427a0 100644
--- a/vl.c
+++ b/vl.c
@@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp)
     default_drive(default_sdcard, snapshot, machine->use_scsi,
                   IF_SD, 0, SD_OPTS);
 
-    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL,
-                         ram_save_live, NULL, ram_load, NULL);
+    if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) {
+        exit(1);
+    }
+    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID,
+                         ram_save_set_params, ram_save_live, NULL,
+                         ram_load, NULL);
 
     if (nb_numa_nodes > 0) {
         int i;
@@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp)
 
     if (incoming) {
         runstate_set(RUN_STATE_INMIGRATE);
+        if (incoming_postcopy) {
+            postcopy_incoming_prepare();
+        }
         int ret = qemu_start_incoming_migration(incoming);
         if (ret < 0) {
             fprintf(stderr, "Migration failed. Exit code %s(%d), exiting.\n",
@@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp)
     bdrv_close_all();
     pause_all_vcpus();
     net_cleanup();
+    if (incoming_postcopy) {
+        postcopy_incoming_qemu_cleanup();
+    }
     res_free();
 
     return 0;
-- 
1.7.1.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]