qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH for-2.3] numa: pc: fix default VCPU to node mapp


From: Eduardo Habkost
Subject: Re: [Qemu-devel] [PATCH for-2.3] numa: pc: fix default VCPU to node mapping
Date: Tue, 17 Mar 2015 13:42:36 -0300
User-agent: Mutt/1.5.23 (2014-03-12)

On Tue, Mar 17, 2015 at 03:48:38PM +0000, Igor Mammedov wrote:
> since commit
>    dd0247e0 pc: acpi: mark all possible CPUs as enabled in SRAT
> Linux kernel actually tries to use CPU to Node mapping from
> QEMU provided SRAT table instead of discarding it, and that
> in some cases breaks build_sched_domains() which expects
> sane mapping where cores/threads belonging to the same socket
> are on the same NUMA node.
> 
> With current default round-robin mapping of VCPUs to nodes
> guest ends-up with cores/threads belonging to the same socket
> being on different NUMA nodes.
> 
> For example with following CLI:
> qemu-kvm -m 4G -smp 5,sockets=1,cores=4,threads=1,maxcpus=8 \
>          -numa node,nodeid=0 -numa node,nodeid=1
> 2.6.32 based kernels will hang on boot due to incorrectly build
> sched_group-s list in update_sd_lb_stats()
> so comment in QEMU justifying dumb default mapping:
>  "
>   guest OSes must cope with this anyway, because there are BIOSes
>   out there in real machines which also use this scheme.
>  "
> isn't really valid.
> 
> Replacing default mapping withi a manual, where VCPUs belonging to
> the same socket are on the same NUMA node, fixes issue for
> guests which can't handle nonsense topology i.e. cnaging CLI to:
>   -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7
> 
> So instead of simply scattering VCPUs around nodes, map
> the same socket VCPUs to the same NUMA node, which is what
> guest would expect from a sane hardware/BIOS.
> 
> Signed-off-by: Igor Mammedov <address@hidden>

I believe the proposed behavior is much better. But if we are going to
break compatibility, shouldn't we at least do that before the first -rc
so we get feedback in case it break existing configurations?

About qemu_cpu_socket_id_from_index(): all qemu-system-* binaries have
smp_cores and smp_threads available (even if machines ignore it), but
the default stub can return values that are larger than the number of
sockets if smp_cores*smp_threads > 1, which would be obviously
incorrect. Isn't it easier to simply make
"cpu_index/(smp_cores*smp_sockets)" be the default cpu_index->socket
mapping function, and allow machine-specific (not arch-specific)
overrides if necessary?


> ---
>  include/sysemu/cpus.h                 |  3 +++
>  numa.c                                | 14 ++++++++++----
>  stubs/Makefile.objs                   |  1 +
>  stubs/qemu_cpu_socket_id_from_index.c |  6 ++++++
>  target-i386/cpu.c                     | 11 +++++++++++
>  5 files changed, 31 insertions(+), 4 deletions(-)
>  create mode 100644 stubs/qemu_cpu_socket_id_from_index.c
> 
> diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
> index 3f162a9..aacabcb 100644
> --- a/include/sysemu/cpus.h
> +++ b/include/sysemu/cpus.h
> @@ -1,5 +1,6 @@
>  #ifndef QEMU_CPUS_H
>  #define QEMU_CPUS_H
> +#include "qemu-common.h"
>  
>  /* cpus.c */
>  void qemu_init_cpu_loop(void);
> @@ -18,6 +19,8 @@ void qtest_clock_warp(int64_t dest);
>  /* vl.c */
>  extern int smp_cores;
>  extern int smp_threads;
> +
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index);
>  #else
>  /* *-user doesn't have configurable SMP topology */
>  #define smp_cores   1
> diff --git a/numa.c b/numa.c
> index ffbec68..5297749 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -26,6 +26,7 @@
>  #include "exec/cpu-common.h"
>  #include "qemu/bitmap.h"
>  #include "qom/cpu.h"
> +#include "sysemu/cpus.h"
>  #include "qemu/error-report.h"
>  #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
>  #include "qapi-visit.h"
> @@ -233,13 +234,18 @@ void parse_numa_opts(void)
>                  break;
>              }
>          }
> -        /* assigning the VCPUs round-robin is easier to implement, guest OSes
> -         * must cope with this anyway, because there are BIOSes out there in
> -         * real machines which also use this scheme.
> +        /* Assign VCPUs from the same socket to the same node.
> +         * Since mapping is arch dependent, target that care about
> +         * correct mapping of VCPUs to node should implement
> +         * qemu_cpu_socket_id_from_index() function that maps cpu_index to
> +         * a socket #, for all other cases legacy round-robin mode
> +         * will be used.
>           */
>          if (i == nb_numa_nodes) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                unsigned socket_id = qemu_cpu_socket_id_from_index(i);
> +
> +                set_bit(i, numa_info[socket_id % nb_numa_nodes].node_cpu);
>              }
>          }
>      }
> diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
> index 8beff4c..86b8060 100644
> --- a/stubs/Makefile.objs
> +++ b/stubs/Makefile.objs
> @@ -39,3 +39,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o
>  stub-obj-y += cpus.o
>  stub-obj-y += kvm.o
>  stub-obj-y += qmp_pc_dimm_device_list.o
> +stub-obj-y += qemu_cpu_socket_id_from_index.o
> diff --git a/stubs/qemu_cpu_socket_id_from_index.c 
> b/stubs/qemu_cpu_socket_id_from_index.c
> new file mode 100644
> index 0000000..3d8ea8b
> --- /dev/null
> +++ b/stubs/qemu_cpu_socket_id_from_index.c
> @@ -0,0 +1,6 @@
> +#include "sysemu/cpus.h"
> +
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> +{
> +    return cpu_index;
> +}
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index ed7e5d5..7a7e236 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -47,6 +47,7 @@
>  #include "hw/xen/xen.h"
>  #include "hw/i386/apic_internal.h"
>  #endif
> +#include "hw/i386/topology.h"
>  
>  
>  /* Cache topology CPUID constants: */
> @@ -2822,6 +2823,16 @@ out:
>      }
>  }
>  
> +#ifndef CONFIG_USER_ONLY
> +unsigned qemu_cpu_socket_id_from_index(unsigned int cpu_index)
> +{
> +    unsigned pkg_id, core_id, smt_id;
> +    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
> +                          &pkg_id, &core_id, &smt_id);
> +    return pkg_id;
> +}
> +#endif
> +
>  static void x86_cpu_initfn(Object *obj)
>  {
>      CPUState *cs = CPU(obj);
> -- 
> 1.8.3.1
> 

-- 
Eduardo



reply via email to

[Prev in Thread] Current Thread [Next in Thread]