From: Isaku Yamahata
Subject: Re: [Qemu-devel] [PATCH 2/2] umem: chardevice for kvm postcopy
Date: Thu, 29 Dec 2011 21:22:50 +0900
User-agent: Mutt/1.5.19 (2009-01-05)

Thank you for the review.

On Thu, Dec 29, 2011 at 01:17:51PM +0200, Avi Kivity wrote:
> > +   default n
> > +   help
> > +     User process backed memory driver provides /dev/umem device.
> > +     The /dev/umem device is designed for some sort of distributed
> > +     shared memory. Especially post-copy live migration with KVM.
> > +     When in doubt, say "N".
> > +
> 
> Need documentation of the protocol between the kernel and userspace; not
> just the ioctls, but also how faults are propagated.

Will do.
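
To sketch the flow here first (daemon-side pseudo-code, only
illustrative; the buffer size 32 is arbitrary and error handling
is omitted):

	__u64 pgoffs[32];
	struct umem_page_request req = { .pgoffs = pgoffs, .nr = 32 };

	for (;;) {
		/* block until some thread faults on pages that are
		   not yet marked cached */
		ioctl(umem_fd, UMEM_GET_PAGE_REQUEST, &req);

		/* fetch those pages (e.g. from the migration source)
		   and write them into the shmem file at the same
		   page offsets */

		/* mark them cached; the kernel wakes the faulting
		   threads */
		struct umem_page_cached cached = {
			.pgoffs = pgoffs, .nr = req.nr,
		};
		ioctl(umem_fd, UMEM_MARK_PAGE_CACHED, &cached);
	}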

> 
> > +
> > +struct umem_page_req_list {
> > +   struct list_head list;
> > +   pgoff_t pgoff;
> > +};
> > +
> >
> > +
> > +
> > +static int umem_mark_page_cached(struct umem *umem,
> > +                            struct umem_page_cached *page_cached)
> > +{
> > +   int ret = 0;
> > +#define PG_MAX     ((__u32)32)
> > +   __u64 pgoffs[PG_MAX];
> > +   __u32 nr;
> > +   unsigned long bit;
> > +   bool wake_up_list = false;
> > +
> > +   nr = 0;
> > +   while (nr < page_cached->nr) {
> > +           __u32 todo = min(PG_MAX, (page_cached->nr - nr));
> > +           int i;
> > +
> > +           if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> > +                              sizeof(*pgoffs) * todo)) {
> > +                   ret = -EFAULT;
> > +                   goto out;
> > +           }
> > +           for (i = 0; i < todo; ++i) {
> > +                   if (pgoffs[i] >= umem->pgoff_end) {
> > +                           ret = -EINVAL;
> > +                           goto out;
> > +                   }
> > +                   set_bit(pgoffs[i], umem->cached);
> > +           }
> > +           nr += todo;
> > +   }
> > +
> 
> Probably need an smp_wmb() here.
> 
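Will add one between the set_bit() loop and the waiter scan,
something like:

	/* make the cached bitmap updates visible before checking
	   for waiters */
	smp_wmb();
	spin_lock(&umem->lock);
	...
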
> > +   spin_lock(&umem->lock);
> > +   bit = 0;
> > +   for (;;) {
> > +           bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> > +                               bit);
> > +           if (bit >= umem->sync_req_max)
> > +                   break;
> > +           if (test_bit(umem->sync_req[bit], umem->cached))
> > +                   wake_up(&umem->page_wait[bit]);
> 
> Why not do this test in the loop above?
> 
> > +           bit++;
> > +   }
> > +
> > +   if (umem->req_list_nr > 0)
> > +           wake_up_list = true;
> > +   spin_unlock(&umem->lock);
> > +
> > +   if (wake_up_list)
> > +           wake_up_all(&umem->req_list_wait);
> > +
> > +out:
> > +   return ret;
> > +}
> > +
> > +
> > +
> > +static void umem_put(struct umem *umem)
> > +{
> > +   int ret;
> > +
> > +   mutex_lock(&umem_list_mutex);
> > +   ret = kref_put(&umem->kref, umem_free);
> > +   if (ret == 0) {
> > +           mutex_unlock(&umem_list_mutex);
> > +   }
> 
> This looks wrong.
> 
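Indeed. The intent was that umem_free() drops umem_list_mutex when
the refcount hits zero, so the unlock here is asymmetric. Assuming
umem_free() is changed to only unlink and free (leaving the mutex
alone), this becomes the usual shape:

	static void umem_put(struct umem *umem)
	{
		mutex_lock(&umem_list_mutex);
		kref_put(&umem->kref, umem_free);
		mutex_unlock(&umem_list_mutex);
	}

Will fix.
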
> > +}
> > +
> > +
> > +static int umem_create_umem(struct umem_create *create)
> > +{
> > +   int error = 0;
> > +   struct umem *umem = NULL;
> > +   struct vm_area_struct *vma;
> > +   int shmem_fd;
> > +   unsigned long bitmap_bytes;
> > +   unsigned long sync_bitmap_bytes;
> > +   int i;
> > +
> > +   umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> > +   umem->name = create->name;
> > +   kref_init(&umem->kref);
> > +   INIT_LIST_HEAD(&umem->list);
> > +
> > +   mutex_lock(&umem_list_mutex);
> > +   error = umem_add_list(umem);
> > +   if (error) {
> > +           goto out;
> > +   }
> > +
> > +   umem->task = NULL;
> > +   umem->mmapped = false;
> > +   spin_lock_init(&umem->lock);
> > +   umem->size = roundup(create->size, PAGE_SIZE);
> > +   umem->pgoff_end = umem->size >> PAGE_SHIFT;
> > +   init_waitqueue_head(&umem->req_wait);
> > +
> > +   vma = &umem->vma;
> > +   vma->vm_start = 0;
> > +   vma->vm_end = umem->size;
> > +   /* this shmem file is used as a temporary buffer for pages,
> > +      so it's unlikely that many pages exist in this shmem file */
> > +   vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> > +           VM_DONTEXPAND;
> > +   vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> > +   vma->vm_pgoff = 0;
> > +   INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +
> > +   shmem_fd = get_unused_fd();
> > +   if (shmem_fd < 0) {
> > +           error = shmem_fd;
> > +           goto out;
> > +   }
> > +   error = shmem_zero_setup(vma);
> > +   if (error < 0) {
> > +           put_unused_fd(shmem_fd);
> > +           goto out;
> > +   }
> > +   umem->shmem_filp = vma->vm_file;
> > +   get_file(umem->shmem_filp);
> > +   fd_install(shmem_fd, vma->vm_file);
> > +   create->shmem_fd = shmem_fd;
> > +
> > +   create->umem_fd = anon_inode_getfd("umem",
> > +                                      &umem_fops, umem, O_RDWR);
> > +   if (create->umem_fd < 0) {
> > +           error = create->umem_fd;
> > +           goto out;
> > +   }
> > +
> > +   bitmap_bytes = umem_bitmap_bytes(umem);
> > +   if (bitmap_bytes > PAGE_SIZE) {
> > +           umem->cached = vzalloc(bitmap_bytes);
> > +           umem->faulted = vzalloc(bitmap_bytes);
> > +   } else {
> > +           umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +           umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +   }
> > +
> > +   /* those constants are not exported.
> > +      They are just used for default value */
> > +#define KVM_MAX_VCPUS      256
> > +#define ASYNC_PF_PER_VCPU 64
> 
> Best to avoid defaults and require userspace choose.

Okay.
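
I.e. drop the defaults and fail if userspace leaves them unset,
something like:

	if (create->async_req_max == 0 || create->sync_req_max == 0)
		return -EINVAL;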


> > +
> > +#define ASYNC_REQ_MAX      (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> > +   if (create->async_req_max == 0)
> > +           create->async_req_max = ASYNC_REQ_MAX;
> > +   umem->async_req_max = create->async_req_max;
> > +   umem->async_req_nr = 0;
> > +   umem->async_req = kzalloc(
> > +           sizeof(*umem->async_req) * umem->async_req_max,
> > +           GFP_KERNEL);
> > +
> > +#define SYNC_REQ_MAX       (KVM_MAX_VCPUS)
> > +   if (create->sync_req_max == 0)
> > +           create->sync_req_max = SYNC_REQ_MAX;
> > +   umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> > +   sync_bitmap_bytes = sizeof(unsigned long) *
> > +           (umem->sync_req_max / BITS_PER_LONG);
> > +   umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +   umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +   umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> > +                             umem->sync_req_max, GFP_KERNEL);
> > +   for (i = 0; i < umem->sync_req_max; ++i)
> > +           init_waitqueue_head(&umem->page_wait[i]);
> > +   umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> > +                            umem->sync_req_max, GFP_KERNEL);
> > +
> > +   umem->req_list_nr = 0;
> > +   INIT_LIST_HEAD(&umem->req_list);
> > +   init_waitqueue_head(&umem->req_list_wait);
> > +
> > +   mutex_unlock(&umem_list_mutex);
> > +   return 0;
> > +
> > + out:
> > +   umem_free(&umem->kref);
> > +   return error;
> > +}
> > +
> > +
> > +static int umem_reattach_umem(struct umem_create *create)
> > +{
> > +   struct umem *entry;
> > +
> > +   mutex_lock(&umem_list_mutex);
> > +   list_for_each_entry(entry, &umem_list, list) {
> > +           if (umem_name_eq(&entry->name, &create->name)) {
> > +                   kref_get(&entry->kref);
> > +                   mutex_unlock(&umem_list_mutex);
> > +
> > +                   create->shmem_fd = get_unused_fd();
> > +                   if (create->shmem_fd < 0) {
> > +                           umem_put(entry);
> > +                           return create->shmem_fd;
> > +                   }
> > +                   create->umem_fd = anon_inode_getfd(
> > +                           "umem", &umem_fops, entry, O_RDWR);
> > +                   if (create->umem_fd < 0) {
> > +                           put_unused_fd(create->shmem_fd);
> > +                           umem_put(entry);
> > +                           return create->umem_fd;
> > +                   }
> > +                   get_file(entry->shmem_filp);
> > +                   fd_install(create->shmem_fd, entry->shmem_filp);
> > +
> > +                   create->size = entry->size;
> > +                   create->sync_req_max = entry->sync_req_max;
> > +                   create->async_req_max = entry->async_req_max;
> > +                   return 0;
> > +           }
> > +   }
> > +   mutex_unlock(&umem_list_mutex);
> > +
> > +   return -ENOENT;
> > +}
> 
> Can you explain how reattach is used?
> 
> > +
> > +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> > +                      unsigned long arg)
> > +{
> > +   void __user *argp = (void __user *) arg;
> > +   long ret;
> > +   struct umem_create *create = NULL;
> > +
> > +
> > +   switch (ioctl) {
> > +   case UMEM_DEV_CREATE_UMEM:
> > +           create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +           if (copy_from_user(create, argp, sizeof(*create))) {
> > +                   ret = -EFAULT;
> > +                   break;
> > +           }
> > +           ret = umem_create_umem(create);
> > +           if (copy_to_user(argp, create, sizeof(*create))) {
> > +                   ret = -EFAULT;
> > +                   break;
> > +           }
> > +           break;
> 
> A simpler approach is that open("/dev/umem") returns an mmap()able fd.
> You need to call an ioctl() to set the size, etc., but you only
> operate on that fd.

So you are suggesting that /dev/umem and /dev/umemctl should be
introduced, splitting the functionality between them.
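
If I understand correctly, the per-region usage would become
something like this (UMEM_SET_SIZE is a made-up name for the
setup ioctl):

	int fd = open("/dev/umem", O_RDWR);
	ioctl(fd, UMEM_SET_SIZE, &size);
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);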


> > +   case UMEM_DEV_LIST:
> > +           ret = umem_list_umem(argp);
> > +           break;
> > +   case UMEM_DEV_REATTACH:
> > +           create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +           if (copy_from_user(create, argp, sizeof(*create))) {
> > +                   ret = -EFAULT;
> > +                   break;
> > +           }
> > +           ret = umem_reattach_umem(create);
> > +           if (copy_to_user(argp, create, sizeof(*create))) {
> > +                   ret = -EFAULT;
> > +                   break;
> > +           }
> > +           break;
> > +   default:
> > +           ret = -EINVAL;
> > +           break;
> > +   }
> > +
> > +   kfree(create);
> > +   return ret;
> > +}
> > +
> > +
> > +#ifdef __KERNEL__
> > +#include <linux/compiler.h>
> > +#else
> > +#define __user
> > +#endif
> 
> I think a #include <linux/compiler.h> is sufficient, the export process
> (see include/linux/Kbuild, add an entry there) takes care of __user.
> 
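Okay, will drop the #ifdef and export the header instead, i.e. in
include/linux/Kbuild:

	header-y += umem.h
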
> > +
> > +#define UMEM_ID_MAX        256
> > +#define UMEM_NAME_MAX      256
> > +
> > +struct umem_name {
> > +   char id[UMEM_ID_MAX];           /* not zero-terminated */
> > +   char name[UMEM_NAME_MAX];       /* not zero-terminated */
> > +};
> 
> IMO, it would be better to avoid names, and use opaque __u64 identifiers
> assigned by userspace, or perhaps file descriptors generated by the
> kernel.  With names come the complications of namespaces, etc.  One user
> can DoS another by grabbing a name that it knows the other user wants to
> use.

So how about the kernel assigning identifiers that are system-global?
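
I.e. something like this, with the kernel filling in the id at
create time and UMEM_DEV_REATTACH looking it up (sketch):

	struct umem_create {
		__u64 size;             /* in bytes */
		__u64 id;               /* out: assigned by the kernel */
		__s32 umem_fd;
		__s32 shmem_fd;
		__u32 async_req_max;
		__u32 sync_req_max;
	};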


> > +
> > +struct umem_create {
> > +   __u64 size;     /* in bytes */
> > +   __s32 umem_fd;
> > +   __s32 shmem_fd;
> > +   __u32 async_req_max;
> > +   __u32 sync_req_max;
> > +   struct umem_name name;
> > +};
> > +
> > +struct umem_page_request {
> > +   __u64 __user *pgoffs;
> 
> Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> them.

Ah yes, right. How about the following?
struct {
       __u32 nr;
       __u32 padding;
       __u64 pgoffs[0];
}
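
The kernel side would then copy it in two steps (sketch):

	struct umem_page_cached hdr;

	if (copy_from_user(&hdr, argp, sizeof(hdr)))
		return -EFAULT;
	/* the hdr.nr page offsets follow the header; copy them
	   chunk by chunk as umem_mark_page_cached() does now */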

> > +   __u32 nr;
> > +   __u32 padding;
> > +};
> > +
> > +struct umem_page_cached {
> > +   __u64 __user *pgoffs;
> > +   __u32 nr;
> > +   __u32 padding;
> > +};
> > +
> > +#define UMEMIO     0x1E
> > +
> > +/* ioctl for umem_dev fd */
> > +#define UMEM_DEV_CREATE_UMEM       _IOWR(UMEMIO, 0x0, struct umem_create)
> > +#define UMEM_DEV_LIST              _IOWR(UMEMIO, 0x1, struct umem_list)
> 
> Why is _LIST needed?
> 
> > +#define UMEM_DEV_REATTACH  _IOWR(UMEMIO, 0x2, struct umem_create)
> > +
> > +/* ioctl for umem fd */
> > +#define UMEM_GET_PAGE_REQUEST      _IOWR(UMEMIO, 0x10, struct umem_page_request)
> > +#define UMEM_MARK_PAGE_CACHED      _IOW (UMEMIO, 0x11, struct umem_page_cached)
> 
> You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> file descriptors, instead of an ioctl.  It allows you to implement the
> other side in either the kernel or userspace.  This is similar to how
> kvm uses an eventfd for communication with vhost-net in the kernel, or
> an implementation in userspace.

Do you mean that read/write on file descriptors is better than ioctl?
Okay, it would be easy to convert the ioctls into read/write.
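
I.e. something like this on the daemon side (sketch; the record
format on the fd is to be defined):

	__u64 pgoffs[PG_MAX];
	ssize_t n;

	/* equivalent of UMEM_GET_PAGE_REQUEST: blocks until there
	   are faulting pages */
	n = read(umem_fd, pgoffs, sizeof(pgoffs));

	/* ... fetch the pages into the shmem file ... */

	/* equivalent of UMEM_MARK_PAGE_CACHED */
	write(umem_fd, pgoffs, n);

That would also allow the other side to live in the kernel, as you
say.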


> > +#define UMEM_MAKE_VMA_ANONYMOUS    _IO  (UMEMIO, 0x12)
> > +
> > +#endif /* __LINUX_UMEM_H */
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
> 

-- 
yamahata


