From: Blue Swirl
Subject: Re: [Qemu-devel] [PATCH][RFC] post copy chardevice (was Re: [RFC] postcopy livemigration proposal)
Date: Fri, 12 Aug 2011 21:26:41 +0000
On Fri, Aug 12, 2011 at 11:07 AM, Isaku Yamahata <address@hidden> wrote:
> Here is what I have right now for post copy chardevice.
> The sample user land will follow.
> It would give you a more concrete idea and help further discussion, I hope.
> This is just for discussion, so it's incomplete.
>
> I'm open to other ideas and quite happy to throw away this patch and
> go for a better way.
>
> thanks,
>
> From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001
> Message-Id: <address@hidden>
> From: Isaku Yamahata <address@hidden>
> Date: Wed, 10 Aug 2011 18:28:05 +0900
> Subject: [PATCH] kvm/postcopy: chardevice for postcopy
>
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.
>
> Signed-off-by: Isaku Yamahata <address@hidden>
> ---
> arch/x86/kvm/Kconfig | 1 +
> arch/x86/kvm/Makefile | 1 +
> include/linux/kvm.h | 45 +++
> include/linux/kvm_host.h | 2 +
> mm/memcontrol.c | 1 +
> mm/shmem.c | 1 +
> virt/kvm/Kconfig | 3 +
> virt/kvm/kvm_main.c | 6 +
> virt/kvm/vmem.c | 847 ++++++++++++++++++++++++++++++++++++++++++++++
> virt/kvm/vmem.h | 68 ++++
> 10 files changed, 975 insertions(+), 0 deletions(-)
> create mode 100644 virt/kvm/vmem.c
> create mode 100644 virt/kvm/vmem.h
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 0a09b58..dcbd52e 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -29,6 +29,7 @@ config KVM
> select HAVE_KVM_EVENTFD
> select KVM_APIC_ARCHITECTURE
> select KVM_ASYNC_PF
> + select KVM_VMEM
> select USER_RETURN_NOTIFIER
> select KVM_MMIO
> select TASKSTATS
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..6125f4c 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -10,6 +10,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> assigned-dev.o)
> kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
> kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
> +kvm-$(CONFIG_KVM_VMEM) += $(addprefix ../../../virt/kvm/, vmem.o)
>
> kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
> i8254.o timer.o
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 55f5afb..623109e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
> #define KVM_CAP_PPC_SMT 64
> #define KVM_CAP_PPC_RMA 65
> #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
> +#define KVM_CAP_POST_COPY_MEMORY 67
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -760,6 +761,50 @@ struct kvm_clock_data {
> /* Available with KVM_CAP_RMA */
> #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>
> +struct kvm_vmem_create {
> + __u64 size; /* in bytes */
> + __s32 vmem_fd;
> + __s32 shmem_fd;
> +};
> +
> +struct kvm_vmem_page_request {
> + __u32 nr;
Padding will be needed here on 64 bit hosts unless the order is switched.
> + __u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_cached {
> + __u32 nr;
Also here.
> + __u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_range {
> + __u64 pgoff;
> + __u64 nr_pages;
> +};
> +
> +struct kvm_vmem_make_pages_present {
> + __u32 nr;
And here.
> + struct kvm_vmem_page_range __user *ranges;
> +};
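To make the padding comments above concrete, one option (a sketch only, untested)
is to make the hole explicit and, as KVM's ABI usually does for userspace
addresses, carry the pointer as a __u64 so the layout is identical for 32 and
64 bit userland:

struct kvm_vmem_page_request {
	__u32 nr;
	__u32 padding;	/* explicit: no hidden hole on 64 bit hosts */
	__u64 pgoffs;	/* userspace pointer passed as a __u64 */
};

The same treatment would apply to kvm_vmem_page_cached and
kvm_vmem_make_pages_present.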
> +
> +/* Available with KVM_CAP_POST_COPY_MEMORY */
> +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0)
> +
> +/* ioctl for vmem_dev fd */
> +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32)
> +
> +/* ioctl for vmem fd */
> +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2)
> +#define KVM_VMEM_READY _IO(KVMIO, 0xb3)
> +#define KVM_VMEM_GET_PAGE_REQUEST \
> + _IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
> +#define KVM_VMEM_MARK_PAGE_CACHED \
> + _IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached)
> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
> + _IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present)
> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
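Just to check my understanding of the intended flow, here is a rough userland
sketch pieced together from the ioctls above (my assumptions, not code from the
patch: the migration daemon owns the request loop on vmem_fd, stages incoming
page contents through shmem_fd, and qemu blocks in KVM_VMEM_WAIT_READY until
the daemon calls KVM_VMEM_READY; untested, error handling omitted):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);

	/* create a 1GB post-copy area; vmem_fd and shmem_fd come back */
	struct kvm_vmem_create create = { .size = 1UL << 30 };
	ioctl(dev_fd, KVM_CREATE_VMEM, &create);

	/* qemu would mmap() guest RAM from create.vmem_fd; the daemon
	 * stages page contents through the shmem backing instead */
	uint8_t *staging = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
				MAP_SHARED, create.shmem_fd, 0);

	ioctl(create.vmem_fd, KVM_VMEM_READY);

	for (;;) {
		__u64 pgoffs[32];
		struct kvm_vmem_page_request req = {
			.nr = 32,
			.pgoffs = pgoffs,
		};
		/* blocks until page faults are reported; req.nr is
		 * updated to the number of entries actually copied */
		if (ioctl(create.vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &req) < 0)
			break;

		/* ... fetch the requested pages from the migration source
		 * and copy each into staging + pgoffs[i] * 4096 ... */

		struct kvm_vmem_page_cached cached = {
			.nr = req.nr,
			.pgoffs = pgoffs,
		};
		ioctl(create.vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &cached);
	}
	return 0;
}

Presumably the background transfer would then call KVM_VMEM_MAKE_PAGES_PRESENT
on ranges already marked cached, and KVM_VMEM_MAKE_VMA_ANONYMOUS once
everything has been faulted in.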
> +
> +
> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>
> struct kvm_assigned_pci_dev {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ff4d406..8b3dafa 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -222,6 +222,8 @@ struct kvm_irq_routing_table {};
>
> #endif
>
> +long kvm_dev_ioctl_create_vmem_dev(void);
> +
> struct kvm_memslots {
> int nmemslots;
> u64 generation;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e013b8e..7f3fc4e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>
> return ret;
> }
> +EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
>
> /*
> * While swap-in, try_charge -> commit or cancel, the page is locked.
> diff --git a/mm/shmem.c b/mm/shmem.c
> index fcedf54..ae7d61f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
> vma->vm_flags |= VM_CAN_NONLINEAR;
> return 0;
> }
> +EXPORT_SYMBOL_GPL(shmem_zero_setup);
>
> /**
> * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index f63ccb0..d3040ea 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -18,3 +18,6 @@ config KVM_MMIO
>
> config KVM_ASYNC_PF
> bool
> +
> +config KVM_VMEM
> + bool
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index aefdda3..9e47e20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
> case KVM_CAP_SET_BOOT_CPU_ID:
> #endif
> case KVM_CAP_INTERNAL_ERROR_DATA:
> + case KVM_CAP_POST_COPY_MEMORY:
> return 1;
> #ifdef CONFIG_HAVE_KVM_IRQCHIP
> case KVM_CAP_IRQ_ROUTING:
> @@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp,
> case KVM_TRACE_DISABLE:
> r = -EOPNOTSUPP;
> break;
> +#ifdef CONFIG_KVM_VMEM
> + case KVM_CREATE_VMEM_DEV:
> + r = kvm_dev_ioctl_create_vmem_dev();
> + break;
> +#endif
> default:
> return kvm_arch_dev_ioctl(filp, ioctl, arg);
> }
> diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c
> new file mode 100644
> index 0000000..b413663
> --- /dev/null
> +++ b/virt/kvm/vmem.c
> @@ -0,0 +1,847 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
The current address is:
51 Franklin Street, Fifth Floor
Boston, MA 02110-1301
USA
Then there is the version used in QEMU:
if not, see <http://www.gnu.org/licenses/>.
I don't know which one is preferred for the kernel.
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +#include <linux/pagemap.h>
> +#include <linux/mm.h>
> +#include <linux/memcontrol.h>
> +#include <linux/poll.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include "vmem.h"
> +
> +static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
> +{
> + if (ret & VM_FAULT_LOCKED) {
> + unlock_page(fake_vmf->page);
> + }
> + page_cache_release(fake_vmf->page);
> +}
> +
> +static int kvm_vmem_minor_fault(struct kvm_vmem *vmem,
> + struct vm_area_struct *vma,
> + struct vm_fault *vmf)
> +{
> + struct vm_fault fake_vmf;
> + int ret;
> + struct page *page;
> +
> + BUG_ON(!test_bit(vmf->pgoff, vmem->cached));
> + fake_vmf = *vmf;
> + fake_vmf.page = NULL;
> + ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf);
> + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
> + return ret;
> +
> + /*
> + * TODO: pull out fake_vmf->page from shmem file and donate it
> + * to this vma resolving the page fault.
> + * vmf->page = fake_vmf->page;
> + */
> +
> + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
> + if (!page)
> + return VM_FAULT_OOM;
> + if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
> + kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> + page_cache_release(page);
> + return VM_FAULT_OOM;
> + }
> +
> + copy_highpage(page, fake_vmf.page);
> + kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> +
> + ret |= VM_FAULT_LOCKED;
> + SetPageUptodate(page);
> + vmf->page = page;
> + set_bit(vmf->pgoff, vmem->faulted);
> +
> + return ret;
> +}
> +
> +static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + struct file *filp = vma->vm_file;
> + struct kvm_vmem *vmem = filp->private_data;
> +
> + if (vmf->pgoff >= vmem->pgoff_end) {
> + return VM_FAULT_SIGBUS;
> + }
> +
> + BUG_ON(test_bit(vmf->pgoff, vmem->faulted));
> +
> + if (!test_bit(vmf->pgoff, vmem->cached)) {
> + /* major fault */
> + unsigned long bit;
> + DEFINE_WAIT(wait);
> +
> + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> + /* async page fault */
> + spin_lock(&vmem->lock);
> + if (vmem->async_req_nr < vmem->async_req_max) {
> + vmem->async_req[vmem->async_req_nr] =
> + vmf->pgoff;
> + vmem->async_req_nr++;
> + }
> + spin_unlock(&vmem->lock);
> + wake_up_poll(&vmem->req_wait, POLLIN);
> +
> + if (test_bit(vmf->pgoff, vmem->cached))
> + return kvm_vmem_minor_fault(vmem, vma, vmf);
> + return VM_FAULT_MAJOR | VM_FAULT_RETRY;
> + }
> +
> + spin_lock(&vmem->lock);
> + bit = find_first_zero_bit(vmem->sync_wait_bitmap,
> + vmem->sync_req_max);
> + if (likely(bit < vmem->sync_req_max)) {
> + vmem->sync_req[bit] = vmf->pgoff;
> + prepare_to_wait(&vmem->page_wait[bit], &wait,
> + TASK_UNINTERRUPTIBLE);
> + set_bit(bit, vmem->sync_req_bitmap);
> + set_bit(bit, vmem->sync_wait_bitmap);
> + spin_unlock(&vmem->lock);
> + wake_up_poll(&vmem->req_wait, POLLIN);
> +
> + if (!test_bit(vmf->pgoff, vmem->cached))
> + schedule();
> + finish_wait(&vmem->page_wait[bit], &wait);
> + clear_bit(bit, vmem->sync_wait_bitmap);
> + } else {
> + struct kvm_vmem_page_req_list page_req_list = {
> + .pgoff = vmf->pgoff,
> + };
> + vmem->req_list_nr++;
> + list_add_tail(&page_req_list.list, &vmem->req_list);
> + wake_up_poll(&vmem->req_wait, POLLIN);
> + for (;;) {
> + prepare_to_wait(&vmem->req_list_wait, &wait,
> + TASK_UNINTERRUPTIBLE);
> + if (test_bit(vmf->pgoff, vmem->cached)) {
> + vmem->req_list_nr--;
> + break;
> + }
> + spin_unlock(&vmem->lock);
> + schedule();
> + spin_lock(&vmem->lock);
> + }
> + spin_unlock(&vmem->lock);
> + finish_wait(&vmem->req_list_wait, &wait);
> + }
> +
> + return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR;
> + }
> +
> + return kvm_vmem_minor_fault(vmem, vma, vmf);
> +}
> +
> +/* for partial munmap */
> +static void kvm_vmem_vma_open(struct vm_area_struct *vma)
> +{
> + struct file *filp = vma->vm_file;
> + struct kvm_vmem *vmem = filp->private_data;
> +
> + spin_lock(&vmem->lock);
> + vmem->vma_nr++;
> + spin_unlock(&vmem->lock);
> +}
> +
> +static void kvm_vmem_vma_close(struct vm_area_struct *vma)
> +{
> + struct file *filp = vma->vm_file;
> + struct kvm_vmem *vmem = filp->private_data;
> + struct task_struct *task = NULL;
> +
> + spin_lock(&vmem->lock);
> + vmem->vma_nr--;
> + if (vmem->vma_nr == 0) {
> + task = vmem->task;
> + vmem->task = NULL;
> + }
> + spin_unlock(&vmem->lock);
> +
> + if (task)
> + put_task_struct(task);
> +}
> +
> +static const struct vm_operations_struct kvm_vmem_vm_ops = {
> + .open = kvm_vmem_vma_open,
> + .close = kvm_vmem_vma_close,
> + .fault = kvm_vmem_fault,
> +};
> +
> +static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> + struct kvm_vmem *vmem = filp->private_data;
> + int error;
> +
> + /* allow mmap() only once */
> + spin_lock(&vmem->lock);
> + if (vmem->mmapped) {
> + error = -EBUSY;
> + goto out;
> + }
> + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
> + vmem->pgoff_end) {
> + error = -EINVAL;
> + goto out;
> + }
> +
> + vmem->mmapped = true;
> + vmem->vma_nr = 1;
> + vmem->vm_start = vma->vm_start;
> + get_task_struct(current);
> + vmem->task = current;
> + spin_unlock(&vmem->lock);
> +
> + vma->vm_ops = &kvm_vmem_vm_ops;
> + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
> + vma->vm_flags &= ~VM_SHARED;
> + return 0;
> +
> +out:
> + spin_unlock(&vmem->lock);
> + return error;
> +}
> +
> +static bool kvm_vmem_req_pending(struct kvm_vmem* vmem)
> +{
> + return !list_empty(&vmem->req_list) ||
> + !bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
> + (vmem->async_req_nr > 0);
> +}
> +
> +static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait)
> +{
> + struct kvm_vmem *vmem = filp->private_data;
> + unsigned int events = 0;
> +
> + poll_wait(filp, &vmem->req_wait, wait);
> +
> + spin_lock(&vmem->lock);
> + if (kvm_vmem_req_pending(vmem))
> + events |= POLLIN;
> + spin_unlock(&vmem->lock);
> +
> + return events;
> +}
> +
> +/*
> + * return value
> + * true: finished
> + * false: more request
> + */
> +static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
> + pgoff_t *pgoffs, int req_max,
> + int *req_nr)
> +{
> + struct kvm_vmem_page_req_list *req_list;
> + struct kvm_vmem_page_req_list *tmp;
> +
> + unsigned long bit;
> +
> + *req_nr = 0;
> + list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
> + list_del(&req_list->list);
> + pgoffs[*req_nr] = req_list->pgoff;
> + (*req_nr)++;
> + if (*req_nr >= req_max)
> + return false;
> + }
> +
> + bit = 0;
> + for (;;) {
> + bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
> + bit);
> + if (bit >= vmem->sync_req_max)
> + break;
> + pgoffs[*req_nr] = vmem->sync_req[bit];
> + (*req_nr)++;
> + clear_bit(bit, vmem->sync_req_bitmap);
> + if (*req_nr >= req_max)
> + return false;
> + bit++;
> + }
> +
> + if (vmem->async_req_nr > 0) {
> + int nr = min(req_max - *req_nr, vmem->async_req_nr);
> + memcpy(pgoffs + *req_nr, vmem->async_req,
> + sizeof(*vmem->async_req) * nr);
> + vmem->async_req_nr -= nr;
> + *req_nr += nr;
> + memmove(vmem->async_req, vmem->async_req + nr,
> + vmem->async_req_nr * sizeof(*vmem->async_req));
> +
> + }
> + return vmem->async_req_nr == 0;
> +}
> +
> +static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
> + struct kvm_vmem_page_request *page_req)
> +{
> + DEFINE_WAIT(wait);
> +#define REQ_MAX ((__u32)32)
> + pgoff_t pgoffs[REQ_MAX];
> + __u32 req_copied = 0;
> + int ret = 0;
> +
> + spin_lock(&vmem->lock);
> + for (;;) {
> + prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
> + if (kvm_vmem_req_pending(vmem)) {
> + break;
> + }
> + if (signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> + spin_unlock(&vmem->lock);
> + schedule();
> + spin_lock(&vmem->lock);
> + }
> + finish_wait(&vmem->req_wait, &wait);
> + if (ret)
> + goto out_unlock;
> +
> + while (req_copied < page_req->nr) {
> + int req_max;
> + int req_nr;
> + bool finished;
> + req_max = min(page_req->nr - req_copied, REQ_MAX);
> + finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
> + &req_nr);
> +
> + spin_unlock(&vmem->lock);
> +
> + if (req_nr > 0) {
> + ret = 0;
> + if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
> + sizeof(*pgoffs) * req_nr)) {
> + ret = -EFAULT;
> + goto out;
> + }
> + }
> + req_copied += req_nr;
> + if (finished)
> + goto out;
> +
> + spin_lock(&vmem->lock);
> + }
> +
> +out_unlock:
> + spin_unlock(&vmem->lock);
> +out:
> + page_req->nr = req_copied;
> + return ret;
> +}
> +
> +static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
> + struct kvm_vmem_page_cached *page_cached)
> +{
> + int ret = 0;
> +#define PG_MAX ((__u32)32)
> + __u64 pgoffs[PG_MAX];
> + __u32 nr;
> + unsigned long bit;
> + bool wake_up_list = false;
> +
> + nr = 0;
> + while (nr < page_cached->nr) {
> + __u32 todo = min(PG_MAX, (page_cached->nr - nr));
> + int i;
> +
> + if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> + sizeof(*pgoffs) * todo)) {
> + ret = -EFAULT;
> + goto out;
> + }
> + for (i = 0; i < todo; ++i) {
> + if (pgoffs[i] >= vmem->pgoff_end) {
> + ret = -EINVAL;
> + goto out;
> + }
> + set_bit(pgoffs[i], vmem->cached);
> + }
> + nr += todo;
> + }
> +
> + spin_lock(&vmem->lock);
> + bit = 0;
> + for (;;) {
> + bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
> + bit);
> + if (bit >= vmem->sync_req_max)
> + break;
> + if (test_bit(vmem->sync_req[bit], vmem->cached))
> + wake_up(&vmem->page_wait[bit]);
> + bit++;
> + }
> +
> + if (vmem->req_list_nr > 0)
> + wake_up_list = true;
> + spin_unlock(&vmem->lock);
> +
> + if (wake_up_list)
> + wake_up_all(&vmem->req_list_wait);
> +
> +out:
> + return ret;
> +}
> +
> +static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
> + const struct vm_area_struct *vma)
> +{
> + return vma->vm_file && vma->vm_file->private_data == vmem;
> +}
> +
> +static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
> + struct kvm_vmem_page_range *range,
> + struct task_struct *task,
> + struct mm_struct *mm,
> + unsigned long vm_start)
> +{
> + unsigned long pgoff = range->pgoff;
> + unsigned long range_end = range->pgoff + range->nr_pages;
> +
> + down_read(&mm->mmap_sem);
> +
> + while (pgoff < range->pgoff + range->nr_pages) {
> + unsigned long pgoff_end;
> + struct vm_area_struct *vma;
> + unsigned long saddr;
> + unsigned long eaddr;
> +
> + /* search unfaulted range */
> + spin_lock(&vmem->lock);
> + pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
> + if (pgoff >= range_end) {
> + spin_unlock(&vmem->lock);
> + break;
> + }
> + pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
> + spin_unlock(&vmem->lock);
> +
> + saddr = vm_start + (pgoff << PAGE_SHIFT);
> + eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
> + vma = find_vma(mm, saddr);
> + if (vma == NULL) {
> + break;
> + }
> + if (eaddr < vma->vm_start) {
> + pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
> + continue;
> + }
> +
> + if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> + unsigned long start = max(vma->vm_start, saddr);
> + unsigned long end = min(vma->vm_end, eaddr);
> + int nr_pages = (end - start) >> PAGE_SHIFT;
> + get_user_pages(task, mm, start, nr_pages,
> + 1, 1, NULL, NULL);
> + pgoff = (end - vm_start) >> PAGE_SHIFT;
> + } else {
> + pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
> + }
> + }
> +
> + up_read(&mm->mmap_sem);
> +}
> +
> +static int kvm_vmem_make_pages_present(
> + struct kvm_vmem *vmem,
> + struct kvm_vmem_make_pages_present *pages_present)
> +{
> + struct task_struct *task;
> + struct mm_struct *mm;
> + pgoff_t pgoff_end;
> + unsigned long vm_start;
> + unsigned long vm_eaddr;
> +
> +#define NUM_ENTRIES ((__u32)32)
> + struct kvm_vmem_page_range kranges[NUM_ENTRIES];
> + __u32 nr = 0;
> + int ret;
> +
> + spin_lock(&vmem->lock);
> + task = vmem->task;
> + pgoff_end = vmem->pgoff_end;
> + vm_start = vmem->vm_start;
> + vm_eaddr = vm_start + vmem->size;
> + spin_unlock(&vmem->lock);
> + if (task == NULL)
> + return 0;
> + mm = get_task_mm(task);
> + if (mm == NULL)
> + return 0;
> +
> + ret = 0;
> + while (nr < pages_present->nr) {
> + int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
> + int i;
> +
> + if (copy_from_user(&kranges, pages_present->ranges + nr,
> + sizeof(kranges[0]) * nr_ranges)) {
> + ret = -EFAULT;
> + break;
> + }
> + for (i = 0; i < nr_ranges; ++i) {
> + struct kvm_vmem_page_range *range = &kranges[i];
> + if (range->pgoff >= pgoff_end ||
> + range->nr_pages >= pgoff_end ||
> + range->pgoff + range->nr_pages >= pgoff_end) {
> + ret = -EINVAL;
> + break;
> + }
> + kvm_vmem_make_pages_present_entry(vmem, range,
> + task, mm, vm_start);
> + }
> + nr += nr_ranges;
> + }
> +
> + mmput(mm);
> + return ret;
> +}
> +
> +static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
> +{
> +#if 1
> + return -ENOSYS;
> +#else
> + unsigned long saddr;
> + unsigned long eaddr;
> + unsigned long addr;
> + unsigned long bit;
> + struct task_struct *task;
> + struct mm_struct *mm;
> +
> + spin_lock(&vmem->lock);
> + task = vmem->task;
> + saddr = vmem->vm_start;
> + eaddr = saddr + vmem->size;
> + bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
> + if (bit < vmem->pgoff_end) {
> + spin_unlock(&vmem->lock);
> + return -EBUSY;
> + }
> + spin_unlock(&vmem->lock);
> + if (task == NULL)
> + return 0;
> + mm = get_task_mm(task);
> + if (mm == NULL)
> + return 0;
> +
> + addr = saddr;
> + down_write(&mm->mmap_sem);
> + while (addr < eaddr) {
> + struct vm_area_struct *vma;
> + vma = find_vma(mm, addr);
> + if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> + /* XXX incorrect. race/locking and more fix up */
> + struct file *filp = vma->vm_file;
> + vma->vm_ops->close(vma);
> + vma->vm_ops = NULL;
> + vma->vm_file = NULL;
> + /* vma->vm_flags */
> + fput(filp);
> + }
> + addr = vma->vm_end;
> + }
> + up_write(&mm->mmap_sem);
> +
> + mmput(mm);
> + return 0;
> +#endif
> +}
> +
> +static void kvm_vmem_ready(struct kvm_vmem *vmem)
> +{
> + spin_lock(&vmem->lock);
> + vmem->ready = true;
> + spin_unlock(&vmem->lock);
> + wake_up_interruptible(&vmem->ready_wait);
> +}
> +
> +static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
> +{
> + int ret = 0;
> + DEFINE_WAIT(wait);
> +
> + spin_lock(&vmem->lock);
> + for (;;) {
> + prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
> + if (vmem->ready) {
> + break;
> + }
> + if (signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> + spin_unlock(&vmem->lock);
> + schedule();
> + spin_lock(&vmem->lock);
> + }
> + spin_unlock(&vmem->lock);
> + finish_wait(&vmem->ready_wait, &wait);
> + return ret;
> +}
> +
> +static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
> + unsigned long arg)
> +{
> + struct kvm_vmem *vmem = filp->private_data;
> + void __user *argp = (void __user *) arg;
> + long ret = 0;
> +
> + switch (ioctl) {
> + case KVM_VMEM_READY:
> + kvm_vmem_ready(vmem);
> + ret = 0;
> + break;
> + case KVM_VMEM_WAIT_READY:
> + ret = kvm_vmem_wait_ready(vmem);
> + break;
> + case KVM_VMEM_GET_PAGE_REQUEST: {
> + struct kvm_vmem_page_request page_request;
> + ret = -EFAULT;
> + if (copy_from_user(&page_request, argp, sizeof(page_request)))
> + break;
> + ret = kvm_vmem_get_page_request(vmem, &page_request);
> + if (ret == 0 &&
> + copy_to_user(argp +
> + offsetof(struct kvm_vmem_page_request, nr),
> + &page_request.nr,
> + sizeof(page_request.nr))) {
> + ret = -EFAULT;
> + break;
> + }
> + break;
> + }
> + case KVM_VMEM_MARK_PAGE_CACHED: {
> + struct kvm_vmem_page_cached page_cached;
> + ret = -EFAULT;
> + if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
> + break;
> + ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
> + break;
> + }
> + case KVM_VMEM_MAKE_PAGES_PRESENT: {
> + struct kvm_vmem_make_pages_present pages_present;
> + ret = -EFAULT;
> + if (copy_from_user(&pages_present, argp,
> + sizeof(pages_present)))
> + break;
> + ret = kvm_vmem_make_pages_present(vmem, &pages_present);
> + break;
> + }
> + case KVM_VMEM_MAKE_VMA_ANONYMOUS:
> + ret = kvm_vmem_make_vma_anonymous(vmem);
> + break;
> + default:
> + ret = -EINVAL;
> + break;
> + }
> + return ret;
> +}
> +
> +static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
> +{
> + return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
> +}
> +
> +static void kvm_vmem_free(struct kvm_vmem *vmem)
> +{
> + if (vmem->task) {
> + put_task_struct(vmem->task);
> + vmem->task = NULL;
> + }
> +
> + if (vmem->shmem_filp)
> + fput(vmem->shmem_filp);
> + if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
> + vfree(vmem->cached);
> + vfree(vmem->faulted);
> + } else {
> + kfree(vmem->cached);
> + kfree(vmem->faulted);
> + }
> + kfree(vmem->vma);
> + kfree(vmem->async_req);
> + kfree(vmem->sync_req_bitmap);
> + kfree(vmem->sync_wait_bitmap);
> + kfree(vmem->page_wait);
> + kfree(vmem->sync_req);
> + kfree(vmem);
> +}
> +
> +static int kvm_vmem_release(struct inode *inode, struct file *filp)
> +{
> + struct kvm_vmem *vmem = filp->private_data;
> + kvm_vmem_free(vmem);
> + return 0;
> +}
> +
> +static struct file_operations kvm_vmem_fops = {
> + .release = kvm_vmem_release,
> + .unlocked_ioctl = kvm_vmem_ioctl,
> + .mmap = kvm_vmem_mmap,
> + .poll = kvm_vmem_poll,
> + .llseek = noop_llseek,
> +};
> +
> +static int kvm_create_vmem(struct kvm_vmem_create *create)
> +{
> + int error = 0;
> + struct kvm_vmem *vmem = NULL;
> + struct vm_area_struct *vma = NULL;
> + int shmem_fd;
> + unsigned long bitmap_bytes;
> + unsigned long sync_bitmap_bytes;
> + int i;
> +
> + vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
> + vmem->task = NULL;
Is this needed? Doesn't kzalloc() return zeroed memory?
> + vmem->mmapped = false;
> + spin_lock_init(&vmem->lock);
> + vmem->size = roundup(create->size, PAGE_SIZE);
> + vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
> + init_waitqueue_head(&vmem->req_wait);
> +
> + vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> + vma->vm_start = 0;
Also here.
> + vma->vm_end = vmem->size;
> + /* this shmem file is used as a temporary buffer for pages,
> + so it's unlikely that many pages will exist in it at once */
> + vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> + VM_DONTEXPAND;
> + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> + vma->vm_pgoff = 0;
> + INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> + vmem->vma = vma;
> +
> + shmem_fd = get_unused_fd();
> + if (shmem_fd < 0) {
> + error = shmem_fd;
> + goto out;
> + }
> + error = shmem_zero_setup(vma);
> + if (error < 0) {
> + put_unused_fd(shmem_fd);
> + goto out;
> + }
> + vmem->shmem_filp = vma->vm_file;
> + get_file(vmem->shmem_filp);
> + fd_install(shmem_fd, vma->vm_file);
> + create->shmem_fd = shmem_fd;
> +
> + create->vmem_fd = anon_inode_getfd("kvm-vmem",
> + &kvm_vmem_fops, vmem, O_RDWR);
> + if (create->vmem_fd < 0) {
> + error = create->vmem_fd;
> + goto out;
> + }
> +
> + bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
> + if (bitmap_bytes > PAGE_SIZE) {
> + vmem->cached = vzalloc(bitmap_bytes);
> + vmem->faulted = vzalloc(bitmap_bytes);
> + } else {
> + vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> + vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> + }
> +
> +#define ASYNC_REQ_MAX (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> + vmem->async_req_max = ASYNC_REQ_MAX;
> + vmem->async_req_nr = 0;
> + vmem->async_req = kzalloc(sizeof(*vmem->async_req) *
> + vmem->async_req_max, GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX (KVM_MAX_VCPUS)
> + vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
> + sync_bitmap_bytes = sizeof(unsigned long) *
> + (vmem->sync_req_max / BITS_PER_LONG);
> + vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> + vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> + vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
> + vmem->sync_req_max, GFP_KERNEL);
> + for (i = 0; i < vmem->sync_req_max; ++i)
> + init_waitqueue_head(&vmem->page_wait[i]);
> + vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
> + vmem->sync_req_max, GFP_KERNEL);
> +
> + vmem->req_list_nr = 0;
> + INIT_LIST_HEAD(&vmem->req_list);
> + init_waitqueue_head(&vmem->req_list_wait);
> +
> + init_waitqueue_head(&vmem->ready_wait);
> + vmem->ready = false;
> +
> + return 0;
> +
> + out:
> + kvm_vmem_free(vmem);
> + return error;
> +}
> +
> +static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
> + unsigned long arg)
> +{
> + void __user *argp = (void __user *) arg;
> + long ret;
> +
> + switch (ioctl) {
> + case KVM_CREATE_VMEM: {
> + struct kvm_vmem_create create;
> + if (copy_from_user(&create, argp, sizeof(create))) {
> + ret = -EFAULT;
> + break;
> + }
> + ret = kvm_create_vmem(&create);
> + if (copy_to_user(argp, &create, sizeof(create))) {
> + ret = -EFAULT;
> + break;
> + }
> + break;
> + }
> + default:
> + ret = -EINVAL;
> + break;
> + }
> + return ret;
> +}
> +
> +static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
> +{
> + return 0;
> +}
> +
> +static struct file_operations kvm_vmem_dev_fops = {
> + .release = kvm_vmem_dev_release,
> + .unlocked_ioctl = kvm_vmem_dev_ioctl,
> +};
> +
> +long kvm_dev_ioctl_create_vmem_dev(void)
> +{
> + return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
> + NULL, O_RDWR);
> +}
> diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
> new file mode 100644
> index 0000000..bc7e8cf
> --- /dev/null
> +++ b/virt/kvm/vmem.h
> @@ -0,0 +1,68 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
Old address also here.
> + */
> +
> +#ifndef __KVM_VMEM_H__
> +#define __KVM_VMEM_H__
> +
> +struct kvm_vmem_page_req_list {
> + struct list_head list;
> + pgoff_t pgoff;
> +};
> +
> +struct kvm_vmem {
> + loff_t size;
> + pgoff_t pgoff_end;
> + spinlock_t lock;
> +
> + wait_queue_head_t req_wait;
> +
> + int async_req_max;
> + int async_req_nr;
> + pgoff_t *async_req;
> +
> + int sync_req_max;
'int' between pointers would mean 4 bytes of structure padding on 64 bit hosts.
> + unsigned long *sync_req_bitmap;
> + unsigned long *sync_wait_bitmap;
> + pgoff_t *sync_req;
> + wait_queue_head_t *page_wait;
> +
> + int req_list_nr;
> + struct list_head req_list;
> + wait_queue_head_t req_list_wait;
> +
> + unsigned long *cached;
> + unsigned long *faulted;
> +
> + bool mmapped;
> + unsigned long vm_start;
> + unsigned int vma_nr;
> + struct task_struct *task;
> +
> + wait_queue_head_t ready_wait;
> + bool ready;
> +
> + struct file *shmem_filp;
> + struct vm_area_struct *vma;
> +};
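For illustration, the holes could be avoided by grouping the int members
together (a sketch of one possible reordering; field meanings unchanged):

struct kvm_vmem {
	loff_t size;
	pgoff_t pgoff_end;
	spinlock_t lock;

	wait_queue_head_t req_wait;

	/* ints grouped: no 4-byte holes before the pointers on 64 bit */
	int async_req_max;
	int async_req_nr;
	int sync_req_max;
	int req_list_nr;

	pgoff_t *async_req;
	unsigned long *sync_req_bitmap;
	unsigned long *sync_wait_bitmap;
	pgoff_t *sync_req;
	wait_queue_head_t *page_wait;

	struct list_head req_list;
	wait_queue_head_t req_list_wait;

	unsigned long *cached;
	unsigned long *faulted;

	bool mmapped;
	unsigned long vm_start;
	unsigned int vma_nr;
	struct task_struct *task;

	wait_queue_head_t ready_wait;
	bool ready;

	struct file *shmem_filp;
	struct vm_area_struct *vma;
};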
> +
> +#endif /* __KVM_VMEM_H__ */
> --
> 1.7.1.1
>
>
> --
> yamahata
>
>