qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v2 for-2.1 2/2] pc: hack for migration compatibi


From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [PATCH v2 for-2.1 2/2] pc: hack for migration compatibility from QEMU 2.0
Date: Mon, 28 Jul 2014 13:45:18 +0200

On Thu, Jul 24, 2014 at 04:32:09PM +0200, Paolo Bonzini wrote:
> Changing the ACPI table size causes migration to break, and the memory
> hotplug work opened our eyes on how horribly we were breaking things in
> 2.0 already.
> 
> The ACPI table size is rounded to the next 4k, which one would think
> gives some headroom.  In practice this is not the case, because the user
> can control the ACPI table size (each CPU adds 97 bytes to the SSDT and
> 8 to the MADT) and so some "-smp" values will break the 4k boundary and
> fail to migrate.  Similarly, PCI bridges add ~1870 bytes to the SSDT.
> 
> To fix this, hard-code 64k as the maximum ACPI table size, which
> (despite being an order of magnitude smaller than 640k) should be enough
> for everyone.
> 
> To fix migration from QEMU 2.0, compute the payload size of QEMU 2.0
> and always use that one.  The previous patch shrunk the ACPI tables
> enough that the QEMU 2.0 size should always be enough.
> 
> Migration from QEMU 1.7 should work for guests that have a number of CPUs
> other than 12, 13, 14, 54, 55, 56, 97, 98, 139, 140.  It was already
> broken from QEMU 1.7 to QEMU 2.0 in the same way, though.
> 
> Even with this patch, QEMU 1.7 and 2.0 have two different ideas of
> "-M pc-i440fx-2.0" when there are PCI bridges.  Igor sent a patch to
> adopt the QEMU 1.7 definition.  I think distributions should apply
> it if they move directly from QEMU 1.7 to 2.1+ without ever packaging
> version 2.0.
> 
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
>       replace magic constants with #defines [Igor]
>       remove stray line from comment [Laszlo]
> 
>  hw/i386/acpi-build.c | 71 
> +++++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/i386/pc_piix.c    | 19 ++++++++++++++
>  hw/i386/pc_q35.c     |  5 ++++
>  include/hw/i386/pc.h |  1 +
>  4 files changed, 92 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index ebc5f03..26d8dfa 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -25,7 +25,9 @@
>  #include <glib.h>
>  #include "qemu-common.h"
>  #include "qemu/bitmap.h"
> +#include "qemu/osdep.h"
>  #include "qemu/range.h"
> +#include "qemu/error-report.h"
>  #include "hw/pci/pci.h"
>  #include "qom/cpu.h"
>  #include "hw/i386/pc.h"
> @@ -52,6 +54,16 @@
>  #include "qapi/qmp/qint.h"
>  #include "qom/qom-qobject.h"
>  
> +/* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
> + * -M pc-i440fx-2.0.

Let's just say 2.0 and earlier?

>  Even if the actual amount of AML generated grows
> + * a little bit, there should be plenty of free space since the DSDT
> + * shrunk by ~1.5k between QEMU 2.0 and QEMU 2.1.
> + */
> +#define ACPI_BUILD_CPU_AML_SIZE    97
> +#define ACPI_BUILD_BRIDGE_AML_SIZE 1875

Let's put _LEGACY_ somewhere here?

> +
> +#define ACPI_BUILD_TABLE_SIZE      0x10000
> +
>  typedef struct AcpiCpuInfo {
>      DECLARE_BITMAP(found_cpus, ACPI_CPU_HOTPLUG_ID_LIMIT);
>  } AcpiCpuInfo;
> @@ -87,6 +99,8 @@ typedef struct AcpiBuildPciBusHotplugState {
>      struct AcpiBuildPciBusHotplugState *parent;
>  } AcpiBuildPciBusHotplugState;
>  
> +unsigned bsel_alloc;
> +

Patch will be better contained if instead of using a global
bsel_alloc, we actually go and count the devices that
have ACPI_PCIHP_PROP_BSEL.
You can just scan all devices, or all pci devices, it
should not matter.
This way, this code will be local to the legacy path.


>  static void acpi_get_dsdt(AcpiMiscInfo *info)
>  {
>      uint16_t *applesmc_sta;
> @@ -759,8 +773,8 @@ static void *acpi_set_bsel(PCIBus *bus, void *opaque)
>  static void acpi_set_pci_info(void)
>  {
>      PCIBus *bus = find_i440fx(); /* TODO: Q35 support */
> -    unsigned bsel_alloc = 0;
>  
> +    assert(bsel_alloc == 0);
>      if (bus) {
>          /* Scan all PCI buses. Set property to enable acpi based hotplug. */
>          pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc);
> @@ -1440,13 +1454,14 @@ static
>  void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
>  {
>      GArray *table_offsets;
> -    unsigned facs, dsdt, rsdt;
> +    unsigned facs, ssdt, dsdt, rsdt;
>      AcpiCpuInfo cpu;
>      AcpiPmInfo pm;
>      AcpiMiscInfo misc;
>      AcpiMcfgInfo mcfg;
>      PcPciInfo pci;
>      uint8_t *u;
> +    size_t aml_len = 0;
>  
>      acpi_get_cpu_info(&cpu);
>      acpi_get_pm_info(&pm);
> @@ -1474,13 +1489,20 @@ void acpi_build(PcGuestInfo *guest_info, 
> AcpiBuildTables *tables)
>      dsdt = tables->table_data->len;
>      build_dsdt(tables->table_data, tables->linker, &misc);
>  
> +    /* Count the size of the DSDT and SSDT, we will need it for legacy
> +     * sizing of ACPI tables.
> +     */
> +    aml_len += tables->table_data->len - dsdt;
> +
>      /* ACPI tables pointed to by RSDT */
>      acpi_add_table(table_offsets, tables->table_data);
>      build_fadt(tables->table_data, tables->linker, &pm, facs, dsdt);
>  
> +    ssdt = tables->table_data->len;
>      acpi_add_table(table_offsets, tables->table_data);
>      build_ssdt(tables->table_data, tables->linker, &cpu, &pm, &misc, &pci,
>                 guest_info);
> +    aml_len += tables->table_data->len - ssdt;
>  
>      acpi_add_table(table_offsets, tables->table_data);
>      build_madt(tables->table_data, tables->linker, &cpu, guest_info);
> @@ -1513,12 +1535,53 @@ void acpi_build(PcGuestInfo *guest_info, 
> AcpiBuildTables *tables)
>      /* RSDP is in FSEG memory, so allocate it separately */
>      build_rsdp(tables->rsdp, tables->linker, rsdt);
>  
> -    /* We'll expose it all to Guest so align size to reduce
> +    /* We'll expose it all to Guest so we want to reduce
>       * chance of size changes.
>       * RSDP is small so it's easy to keep it immutable, no need to
>       * bother with alignment.
> +     *
> +     * We used to align the tables to 4k, but of course this would
> +     * too simple to be enough.  4k turned out to be too small an
> +     * alignment very soon, and in fact it is almost impossible to
> +     * keep the table size stable for all (max_cpus, max_memory_slots)
> +     * combinations.  So the table size is always 64k for pc-i440fx-2.1
> +     * and we give an error if the table grows beyond that limit.
> +     *
> +     * We still have the problem of migrating from "-M pc-i440fx-2.0".  For
> +     * that, we exploit the fact that QEMU 2.1 generates _smaller_ tables
> +     * than 2.0 and we can always pad the smaller tables with zeros.  We can
> +     * then use the exact size of the 2.0 tables.
> +     *
> +     * All this is for PIIX4, since QEMU 2.0 didn't support Q35 migration.
>       */
> -    acpi_align_size(tables->table_data, 0x1000);
> +    if (guest_info->legacy_acpi_table_size) {
> +        /* Subtracting aml_len gives the size of fixed tables.  Then add the
> +         * size of the PIIX4 DSDT/SSDT in QEMU 2.0.
> +         */
> +        int legacy_aml_len =
> +            guest_info->legacy_acpi_table_size +
> +            ACPI_BUILD_CPU_AML_SIZE * max_cpus +
> +            ACPI_BUILD_BRIDGE_AML_SIZE * (MAX(bsel_alloc, 1) - 1);
> +        int legacy_table_size =
> +            ROUND_UP(tables->table_data->len - aml_len + legacy_aml_len, 
> 0x1000);
> +        if (tables->table_data->len > legacy_table_size) {
> +            /* -M pc-i440fx-2.0 doesn't support memory hotplug, so this 
> should
> +             * never happen.
> +             */
> +            error_report("This configuration is not supported with -M 
> pc-i440fx-2.0.");
> +            error_report("Please report this to address@hidden");

Downstreams would need to patch this line out? Let's just drop the
second line.

> +            exit(1);


so what happens here is 2.x -> 2.0 migration becomes broken, but
2.0 -> 2.x could still work.
I think I would prefer a warning instead.

> +        }
> +        g_array_set_size(tables->table_data, legacy_table_size);
> +    } else {
> +        if (tables->table_data->len > ACPI_BUILD_TABLE_SIZE) {
> +            /* As of QEMU 2.1, this fires with 160 VCPUs and 255 memory 
> slots.  */
> +            error_report("ACPI tables are larger than 64k.  Please remove");
> +            error_report("CPUs, NUMA nodes, memory slots or PCI bridges.");
> +            exit(1);
> +        }
> +        g_array_set_size(tables->table_data, ACPI_BUILD_TABLE_SIZE);


Let's split the idea to use 64K always out, to a separate patch, ok?

> +    }
>  
>      acpi_align_size(tables->linker, 0x1000);
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 7081c08..c8a8090 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -61,6 +61,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
> +static int legacy_acpi_table_size;
>  static bool smbios_defaults = true;
>  static bool smbios_legacy_mode;
>  /* Make sure that guest addresses aligned at 1Gbyte boundaries get mapped to
> @@ -163,6 +164,7 @@ static void pc_init1(MachineState *machine,
>      guest_info = pc_guest_info_init(below_4g_mem_size, above_4g_mem_size);
>  
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->legacy_acpi_table_size = legacy_acpi_table_size;
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> @@ -297,6 +299,23 @@ static void pc_init_pci(MachineState *machine)
>  
>  static void pc_compat_2_0(MachineState *machine)
>  {
> +    /* This value depends on the actual DSDT and SSDT compiled into
> +     * the source QEMU; unfortunately it depends on the binary and
> +     * not on the machine type, so we cannot make pc-i440fx-1.7 work on
> +     * both QEMU 1.7 and QEMU 2.0.
> +     *
> +     * Large variations cause migration to fail for more than one
> +     * consecutive value of the "-smp" maxcpus option.
> +     *
> +     * For small variations of the kind caused by different iasl versions,
> +     * the 4k rounding usually leaves slack.  However, there could be still
> +     * one or two values that break.  For QEMU 1.7 and QEMU 2.0 the
> +     * slack is only ~10 bytes before one "-smp maxcpus" value breaks!
> +     *
> +     * 6652 is valid for QEMU 2.0, the right value for pc-i440fx-1.7 on
> +     * QEMU 1.7 it is 6414.  For RHEL/CentOS 7.0 it is 6358.
> +     */
> +    legacy_acpi_table_size = 6652;
>      smbios_legacy_mode = true;
>      has_reserved_memory = false;
>  }
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index f551961..c39ee98 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -155,6 +155,11 @@ static void pc_q35_init(MachineState *machine)
>      guest_info->has_acpi_build = has_acpi_build;
>      guest_info->has_reserved_memory = has_reserved_memory;
>  
> +    /* Migration was not supported in 2.0 for Q35, so do not bother
> +     * with this hack (see hw/i386/acpi-build.c).
> +     */
> +    guest_info->legacy_acpi_table_size = 0;
> +
>      if (smbios_defaults) {
>          MachineClass *mc = MACHINE_GET_CLASS(machine);
>          /* These values are guest ABI, do not change */
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 1c0c382..f4b9b2b 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -94,6 +94,7 @@ struct PcGuestInfo {
>      uint64_t *node_mem;
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
> +    int legacy_acpi_table_size;
>      bool has_acpi_build;
>      bool has_reserved_memory;
>  };
> -- 
> 1.8.3.1



reply via email to

[Prev in Thread] Current Thread [Next in Thread]