qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] x86: Allow to set NUMA distance for different NUMA


From: He Chen
Subject: [Qemu-devel] [PATCH] x86: Allow to set NUMA distance for different NUMA nodes
Date: Fri, 10 Mar 2017 18:18:17 +0800

Currently, QEMU does not provide a clear command to set vNUMA distance for
a guest, although we already have the `-numa` command to set vNUMA nodes.

vNUMA distance makes sense in certain scenarios.
But now, if we create a guest that has 4 vNUMA nodes, when we check NUMA
info via `numactl -H`, we will see:

node distance:
node    0    1    2    3
  0:   10   20   20   20
  1:   20   10   20   20
  2:   20   20   10   20
  3:   20   20   20   10

The guest kernel regards all local nodes as distance 10, and all remote
nodes as distance 20, when there is no SLIT table, since QEMU doesn't build one.
This looks a little strange if you have seen the distances in an
actual physical machine that contains 4 NUMA nodes. My machine shows:

node distance:
node    0    1    2    3
  0:   10   21   31   41
  1:   21   10   21   31
  2:   31   21   10   21
  3:   41   31   21   10

This patch adds SLIT table support to QEMU, and provides an
additional option `dist` for the `-numa` command to allow users to set the
vNUMA distance on the QEMU command line.

With this patch, when a user wants to create a guest that contains
several vNUMA nodes and also wants to set the distance among those nodes,
the QEMU command line would look like:

```
-object 
memory-backend-ram,size=1G,prealloc=yes,host-nodes=0,policy=bind,id=node0 \
-numa node,nodeid=0,cpus=0,memdev=node0 \
-object 
memory-backend-ram,size=1G,prealloc=yes,host-nodes=1,policy=bind,id=node1 \
-numa node,nodeid=1,cpus=1,memdev=node1 \
-object 
memory-backend-ram,size=1G,prealloc=yes,host-nodes=2,policy=bind,id=node2 \
-numa node,nodeid=2,cpus=2,memdev=node2 \
-object 
memory-backend-ram,size=1G,prealloc=yes,host-nodes=3,policy=bind,id=node3 \
-numa node,nodeid=3,cpus=3,memdev=node3 \
-numa dist,a=0,b=1,val=21 \
-numa dist,a=0,b=2,val=31 \
-numa dist,a=0,b=3,val=41 \
-numa dist,a=1,b=0,val=21 \
...
```

Thanks,
-He

Signed-off-by: He Chen <address@hidden>
---
 hw/i386/acpi-build.c        | 28 ++++++++++++++++++++++++++
 include/hw/acpi/acpi-defs.h |  9 +++++++++
 include/sysemu/numa.h       |  1 +
 numa.c                      | 48 +++++++++++++++++++++++++++++++++++++++++++++
 qapi-schema.json            | 24 +++++++++++++++++++++--
 qemu-options.hx             |  5 ++++-
 6 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2073108..7ced37d 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2396,6 +2396,32 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
 }
 
+/*
+ * Build the ACPI SLIT (System Locality Information Table) exposing the
+ * NUMA distance matrix to the guest.  Per the ACPI spec, the fixed
+ * header carries the locality count and is followed by N*N one-byte
+ * entries, where entry[i * N + j] is the distance from node i to node j.
+ * Distances are taken verbatim from numa_info[], which parse_numa_opts()
+ * has already populated/defaulted.
+ */
 static void
+build_slit(GArray *table_data, BIOSLinker *linker, MachineState *machine)
+{
+    struct AcpiSystemLocalityDistanceTable *slit;
+    uint8_t *entry;
+    int slit_start, slit_data_len, i, j;
+    /* Remember where this table begins so build_header() can checksum it. */
+    slit_start = table_data->len;
+
+    /* Reserve and fill the fixed header first ... */
+    slit = acpi_data_push(table_data, sizeof(*slit));
+    slit->nb_localities = nb_numa_nodes;
+
+    /* ... then append the N*N byte distance matrix directly after it. */
+    slit_data_len = sizeof(uint8_t) * nb_numa_nodes * nb_numa_nodes;
+    entry = acpi_data_push(table_data, slit_data_len);
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            entry[i * nb_numa_nodes + j] = numa_info[i].distance[j];
+        }
+    }
+
+    /* Fill in signature, length, checksum, etc. over the whole table. */
+    build_header(linker, table_data,
+                 (void *)(table_data->data + slit_start),
+                 "SLIT",
+                 table_data->len - slit_start, 1, NULL, NULL);
+}
+
+static void
 build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
@@ -2678,6 +2704,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
     if (pcms->numa_nodes) {
         acpi_add_table(table_offsets, tables_blob);
         build_srat(tables_blob, tables->linker, machine);
+        acpi_add_table(table_offsets, tables_blob);
+        build_slit(tables_blob, tables->linker, machine);
     }
     if (acpi_get_mcfg(&mcfg)) {
         acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 4cc3630..b183a8f 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -527,6 +527,15 @@ struct AcpiSratProcessorGiccAffinity
 
 typedef struct AcpiSratProcessorGiccAffinity AcpiSratProcessorGiccAffinity;
 
+/*
+ * SLIT (System Locality Information Table) header, describing NUMA
+ * distances.  Only the fixed part is declared here: the standard ACPI
+ * table header plus the 8-byte locality count.  The
+ * nb_localities * nb_localities one-byte distance matrix that follows
+ * it in the table is appended separately by the table builder.
+ */
+struct AcpiSystemLocalityDistanceTable {
+    ACPI_TABLE_HEADER_DEF
+    uint64_t    nb_localities;
+} QEMU_PACKED;
+typedef struct AcpiSystemLocalityDistanceTable AcpiSystemLocalityDistanceTable;
 /* PCI fw r3.0 MCFG table. */
 /* Subtable */
 struct AcpiMcfgAllocation {
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..2f7a941 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -21,6 +21,7 @@ typedef struct node_info {
     struct HostMemoryBackend *node_memdev;
     bool present;
     QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+    uint8_t distance[MAX_NODES];
 } NodeInfo;
 
 extern NodeInfo numa_info[MAX_NODES];
diff --git a/numa.c b/numa.c
index e01cb54..897657a 100644
--- a/numa.c
+++ b/numa.c
@@ -50,6 +50,9 @@ static int have_memdevs = -1;
 static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                              * For all nodes, nodeid < max_numa_nodeid
                              */
+/*
+ * NUMA distance bounds from the ACPI SLIT definition: 10 is the
+ * distance of a node to itself (the minimum meaningful value), and 255
+ * means the localities are unreachable from each other.  Entries the
+ * user does not specify default to 20.  These are never modified, hence
+ * const.
+ */
+static const int min_numa_distance = 10;
+static const int def_numa_distance = 20;
+static const int max_numa_distance = 255;
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
@@ -208,10 +211,33 @@ static void numa_node_parse(NumaNodeOptions *node, 
QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 }
 
+/*
+ * Parse one "-numa dist,a=...,b=...,val=..." option and record the
+ * distance from node @a to node @b in numa_info.  Sets @errp if either
+ * node id is out of range or the distance lies outside the valid ACPI
+ * range [min_numa_distance, max_numa_distance].  Note the distance is
+ * directional; the user must supply the b->a entry separately.
+ */
+static void numa_distance_parse(NumaDistOptions *dist, QemuOpts *opts,
+                                Error **errp)
+{
+    uint8_t a = dist->a;
+    uint8_t b = dist->b;
+    uint8_t val = dist->val;
+
+    if (a >= MAX_NODES || b >= MAX_NODES) {
+        /* PRIu8 matches the uint8_t node ids (was PRIu16). */
+        error_setg(errp, "Max number of NUMA nodes reached: %"
+                   PRIu8 "", a > b ? a : b);
+        return;
+    }
+
+    /*
+     * val is a uint8_t, so the upper-bound check can never fire while
+     * max_numa_distance is 255; it is kept in case the type widens.
+     * The message now prints the range as min~max (it was reversed).
+     */
+    if (val > max_numa_distance || val < min_numa_distance) {
+        error_setg(errp,
+                "NUMA distance (%" PRIu8 ") out of range (%d)~(%d)",
+                val, min_numa_distance, max_numa_distance);
+        return;
+    }
+
+    numa_info[a].distance[b] = val;
+}
+
 static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
 {
     NumaOptions *object = NULL;
@@ -235,6 +261,12 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error 
**errp)
         }
         nb_numa_nodes++;
         break;
+    case NUMA_OPTIONS_TYPE_DIST:
+        numa_distance_parse(&object->u.dist, opts, &err);
+        if (err) {
+            goto end;
+        }
+        break;
     default:
         abort();
     }
@@ -294,6 +326,21 @@ static void validate_numa_cpus(void)
     g_free(seen_cpus);
 }
 
+/*
+ * Fill in distance-matrix entries the user did not set on the command
+ * line: every node is at the minimum (local) distance from itself, and
+ * any unset remote entry receives the default remote distance.  Unset
+ * entries are recognized by their zero value (numa_info is
+ * zero-initialized); the original "<= min_numa_distance" test would
+ * silently overwrite a user-specified remote distance of exactly 10,
+ * which numa_distance_parse() accepts as valid.
+ */
+static void default_numa_distance(void)
+{
+    int i, j;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            if (i == j) {
+                /* Local distance is fixed by the ACPI spec. */
+                numa_info[i].distance[j] = min_numa_distance;
+            } else if (numa_info[i].distance[j] == 0) {
+                /* Never set by the user: apply the default. */
+                numa_info[i].distance[j] = def_numa_distance;
+            }
+        }
+    }
+}
+
 void parse_numa_opts(MachineClass *mc)
 {
     int i;
@@ -390,6 +437,7 @@ void parse_numa_opts(MachineClass *mc)
         }
 
         validate_numa_cpus();
+        default_numa_distance();
     } else {
         numa_set_mem_node_id(0, ram_size, 0);
     }
diff --git a/qapi-schema.json b/qapi-schema.json
index 32b4a4b..2988304 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5647,7 +5647,7 @@
 # Since: 2.1
 ##
 { 'enum': 'NumaOptionsType',
-  'data': [ 'node' ] }
+  'data': [ 'node', 'dist' ] }
 
 ##
 # @NumaOptions:
@@ -5660,7 +5660,8 @@
   'base': { 'type': 'NumaOptionsType' },
   'discriminator': 'type',
   'data': {
-    'node': 'NumaNodeOptions' }}
+    'node': 'NumaNodeOptions',
+    'dist': 'NumaDistOptions' }}
 
 ##
 # @NumaNodeOptions:
@@ -5689,6 +5690,25 @@
    '*memdev': 'str' }}
 
 ##
+# @NumaDistOptions:
+#
+# Set distance between 2 NUMA nodes. (for OptsVisitor)
+#
+# @a: first NUMA node.
+#
+# @b: second NUMA node.
+#
+# @val: NUMA distance between 2 given NUMA nodes.
+#
+# Since: 2.9
+##
+{ 'struct': 'NumaDistOptions',
+  'data': {
+   'a':   'uint8',
+   'b':   'uint8',
+   'val': 'uint8' }}
+
+##
 # @HostMemPolicy:
 #
 # Host memory policy types
diff --git a/qemu-options.hx b/qemu-options.hx
index 8dd8ee3..0de5cf8 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -139,12 +139,15 @@ ETEXI
 
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
     "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n", 
QEMU_ARCH_ALL)
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
+    "-numa dist,a=firstnode,b=secondnode,val=distance\n", QEMU_ARCH_ALL)
 STEXI
 @item -numa node[,address@hidden,address@hidden@var{lastcpu}]][,address@hidden
 @itemx -numa node[,address@hidden,address@hidden@var{lastcpu}]][,address@hidden
address@hidden -numa dist,address@hidden,address@hidden,address@hidden
 @findex -numa
 Define a NUMA node and assign RAM and VCPUs to it.
+Set NUMA distance between 2 NUMA nodes.
 
 @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
 @samp{cpus} option represent a contiguous range of CPU indexes
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]