From: Eduardo Habkost
Subject: Re: [Qemu-devel] [PATCH v3.1 25/31] hostmem: add properties for NUMA memory policy
Date: Mon, 19 May 2014 20:34:54 -0300
User-agent: Mutt/1.5.21 (2010-09-15)

On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
[...]
> @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
>      if (backend->prealloc) {
>          os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
>      }
> +
> +#ifdef CONFIG_NUMA
> +    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
> +
> +    /* This is a workaround for a long standing bug in Linux'
> +     * mbind implementation, which cuts off the last specified
> +     * node.
> +     */

What if the bug is fixed? mbind() documentation says "nodemask points to
a bit mask of nodes containing up to maxnode bits", so we must make sure
backend->host_nodes actually has that one extra bit available.

Also, if no bit is set at all, we can simply pass nodemask=NULL or
maxnode=0.

We could address both issues, and do this:

    struct HostMemoryBackend { [...]
        DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
    [...]
    lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode=0 */
    maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >=
           BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);
    mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL,
          maxnode + 1, flags);


(I am starting to wonder if it was worth dropping the libnuma
requirement and implementing our own mbind()-calling code.)
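
(For reference, I imagine the libnuma version of this hunk would have
looked roughly like the sketch below; untested, "node"/"nodes" are just
placeholders for values derived from backend->host_nodes, and
HOST_MEM_POLICY_* are the constants I expect the QAPI generator to
produce for the new enum.  All the calls are from <numa.h>.)

    if (numa_available() < 0) {
        error_setg(errp, "NUMA is not available on this host");
        return;
    }
    switch (backend->policy) {
    case HOST_MEM_POLICY_INTERLEAVE:
        /* "nodes" would be a struct bitmask built with
         * numa_allocate_nodemask()/numa_bitmask_setbit() */
        numa_interleave_memory(ptr, sz, nodes);
        break;
    case HOST_MEM_POLICY_BIND:
    case HOST_MEM_POLICY_PREFERRED:
        /* strict (MPOL_BIND-like) vs. preferred is a global toggle */
        numa_set_bind_policy(backend->policy == HOST_MEM_POLICY_BIND);
        numa_tonode_memory(ptr, sz, node);
        break;
    default:
        break;
    }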

> +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> +        error_setg_errno(errp, errno,
> +                         "cannot bind memory to host NUMA nodes");

Don't we want to set flags to MPOL_MF_STRICT here? I believe we
shouldn't have any pages preallocated at this point, but in case we do,
I would expect them to be moved rather than having the policy set by the
user silently ignored.
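
Something along these lines, maybe (untested; I added MPOL_MF_MOVE as
well, because MPOL_MF_STRICT by itself only makes mbind() fail with EIO
on nonconforming pages instead of moving them):

    /* same call as in the patch, just with flags set so that existing
     * pages are either migrated (MPOL_MF_MOVE) or reported as an error
     * (MPOL_MF_STRICT) instead of silently keeping their old placement */
    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2,
              MPOL_MF_STRICT | MPOL_MF_MOVE)) {
        error_setg_errno(errp, errno,
                         "cannot bind memory to host NUMA nodes");
        return;
    }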

> +        return;
> +    }
> +#endif
>  }
>  
>  MemoryRegion *
> diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
> index 819b72d..4e96298 100644
> --- a/include/sysemu/hostmem.h
> +++ b/include/sysemu/hostmem.h
> @@ -12,8 +12,10 @@
>  #ifndef QEMU_HOSTMEM_H
>  #define QEMU_HOSTMEM_H
>  
> +#include "sysemu/sysemu.h" /* for MAX_NODES */
>  #include "qom/object.h"
>  #include "exec/memory.h"
> +#include "qemu/bitmap.h"
>  
>  #define TYPE_MEMORY_BACKEND "memory"
>  #define MEMORY_BACKEND(obj) \
> @@ -52,6 +54,8 @@ struct HostMemoryBackend {
>      uint64_t size;
>      bool merge, dump;
>      bool prealloc, force_prealloc;
> +    DECLARE_BITMAP(host_nodes, MAX_NODES);
> +    HostMemPolicy policy;
>  
>      MemoryRegion mr;
>  };
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 5dd30eb..bea3476 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4732,3 +4732,23 @@
>     '*cpus':   ['uint16'],
>     '*mem':    'size',
>     '*memdev': 'str' }}
> +
> +##
> +# @HostMemPolicy
> +#
> +# Host memory policy types
> +#
> +# @default: restore default policy, remove any nondefault policy
> +#
> +# @preferred: set the preferred host nodes for allocation
> +#
> +# @bind: a strict policy that restricts memory allocation to the
> +#        host nodes specified
> +#
> +# @interleave: memory allocations are interleaved across the set
> +#              of host nodes specified
> +#
> +# Since 2.1
> +##
> +{ 'enum': 'HostMemPolicy',
> +  'data': [ 'default', 'preferred', 'bind', 'interleave' ] }
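
One more assumption worth spelling out: for backend->policy to be passed
straight to mbind() as in the hunk above, these enum values need to line
up with the kernel's MPOL_* modes (MPOL_DEFAULT=0, MPOL_PREFERRED=1,
MPOL_BIND=2, MPOL_INTERLEAVE=3).  A compile-time check would document
that dependency (sketch only; HOST_MEM_POLICY_* are the names I expect
the QAPI generator to produce):

    QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
    QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
    QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
    QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
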
> -- 
> 1.8.5.2.229.g4448466
> 
> 

-- 
Eduardo


