qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v5 05/10] ACPI ERST: support for ACPI ERST feature


From: Igor Mammedov
Subject: Re: [PATCH v5 05/10] ACPI ERST: support for ACPI ERST feature
Date: Mon, 26 Jul 2021 12:42:13 +0200

On Wed, 21 Jul 2021 11:07:40 -0500
Eric DeVolder <eric.devolder@oracle.com> wrote:

> On 7/20/21 7:17 AM, Igor Mammedov wrote:
> > On Wed, 30 Jun 2021 15:07:16 -0400
> > Eric DeVolder <eric.devolder@oracle.com> wrote:
> >   
> >> This change implements the support for the ACPI ERST feature.  
> > Drop this  
> Done
> 
> >   
> >>
> >> This implements a PCI device for ACPI ERST. This implments the  
> > s/implments/implements/  
> Corrected
> 
> >   
> >> non-NVRAM "mode" of operation for ERST.  
> > add here why non-NVRAM "mode" is implemented.  
> How about:
> This implements a PCI device for ACPI ERST. This implments the
> non-NVRAM "mode" of operation for ERST as it is supported by
> Linux and Windows and aligns with ERST support in most BIOS.

modulo typos looks good to me.
pls consider using a spell checker to check commit messages/comments.

> 
> > 
> > Also even if this non-NVRAM implementation, there is still
> > a lot of not necessary data copying (see below) so drop it
> > or justify why it's there.
> >     
> >> This change also includes erst.c in the build of general ACPI support.  
> > Drop this as well  
> Done
> 
> > 
> >   
> >> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
> >> ---
> >>   hw/acpi/erst.c      | 704 
> >> ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   hw/acpi/meson.build |   1 +
> >>   2 files changed, 705 insertions(+)
> >>   create mode 100644 hw/acpi/erst.c
> >>
> >> diff --git a/hw/acpi/erst.c b/hw/acpi/erst.c
> >> new file mode 100644
> >> index 0000000..6e9bd2e
> >> --- /dev/null
> >> +++ b/hw/acpi/erst.c
> >> @@ -0,0 +1,704 @@
> >> +/*
> >> + * ACPI Error Record Serialization Table, ERST, Implementation
> >> + *
> >> + * Copyright (c) 2021 Oracle and/or its affiliates.
> >> + *
> >> + * ACPI ERST introduced in ACPI 4.0, June 16, 2009.
> >> + * ACPI Platform Error Interfaces : Error Serialization
> >> + *
> >> + * This library is free software; you can redistribute it and/or
> >> + * modify it under the terms of the GNU Lesser General Public
> >> + * License as published by the Free Software Foundation;
> >> + * version 2 of the License.
> >> + *
> >> + * This library is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> + * Lesser General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU Lesser General Public
> >> + * License along with this library; if not, see 
> >> <http://www.gnu.org/licenses/>

consider adding SPDX-License-Identifier to all new files.
 
> >> + */
> >> +
> >> +#include <sys/types.h>
> >> +#include <sys/stat.h>
> >> +#include <unistd.h>
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qapi/error.h"
> >> +#include "hw/qdev-core.h"
> >> +#include "exec/memory.h"
> >> +#include "qom/object.h"
> >> +#include "hw/pci/pci.h"
> >> +#include "qom/object_interfaces.h"
> >> +#include "qemu/error-report.h"
> >> +#include "migration/vmstate.h"
> >> +#include "hw/qdev-properties.h"
> >> +#include "hw/acpi/acpi.h"
> >> +#include "hw/acpi/acpi-defs.h"
> >> +#include "hw/acpi/aml-build.h"
> >> +#include "hw/acpi/bios-linker-loader.h"
> >> +#include "exec/address-spaces.h"
> >> +#include "sysemu/hostmem.h"
> >> +#include "hw/acpi/erst.h"
> >> +#include "trace.h"
> >> +
> >> +/* UEFI 2.1: Append N Common Platform Error Record */
> >> +#define UEFI_CPER_RECORD_MIN_SIZE 128U
> >> +#define UEFI_CPER_RECORD_LENGTH_OFFSET 20U
> >> +#define UEFI_CPER_RECORD_ID_OFFSET 96U
> >> +#define IS_UEFI_CPER_RECORD(ptr) \
> >> +    (((ptr)[0] == 'C') && \
> >> +     ((ptr)[1] == 'P') && \
> >> +     ((ptr)[2] == 'E') && \
> >> +     ((ptr)[3] == 'R'))
> >> +#define THE_UEFI_CPER_RECORD_ID(ptr) \
> >> +    (*(uint64_t *)(&(ptr)[UEFI_CPER_RECORD_ID_OFFSET]))
> >> +
> >> +/*
> >> + * This implementation is an ACTION (cmd) and VALUE (data)
> >> + * interface consisting of just two 64-bit registers.
> >> + */
> >> +#define ERST_REG_SIZE (2UL * sizeof(uint64_t))  
> >   
> >> +#define ERST_CSR_ACTION (0UL << 3) /* action (cmd) */
> >> +#define ERST_CSR_VALUE  (1UL << 3) /* argument/value (data) */  
> > what's meaning of CRS?  
> CSR = control status register
> > Looking at patch both should be called ERST_[ACTION|VALUE]_OFFSET  
> Done
> > pls use explicit offset values instead of shifting bit.  
> Done
> > 
> >   
> >> +/*
> >> + * ERST_RECORD_SIZE is the buffer size for exchanging ERST
> >> + * record contents. Thus, it defines the maximum record size.
> >> + * As this is mapped through a PCI BAR, it must be a power of
> >> + * two, and should be at least PAGE_SIZE.
> >> + * Records are stored in the backing file in a simple fashion.
> >> + * The backing file is essentially divided into fixed size
> >> + * "slots", ERST_RECORD_SIZE in length, with each "slot"
> >> + * storing a single record. No attempt at optimizing storage
> >> + * through compression, compaction, etc is attempted.
> >> + * NOTE that any change to this value will make any pre-
> >> + * existing backing files, not of the same ERST_RECORD_SIZE,
> >> + * unusable to the guest.
> >> + */
> >> +/* 8KiB records, not too small, not too big */
> >> +#define ERST_RECORD_SIZE (2UL * 4096)
> >> +
> >> +#define ERST_INVALID_RECORD_ID (~0UL)
> >> +#define ERST_EXECUTE_OPERATION_MAGIC 0x9CUL
> >> +
> >> +/*
> >> + * Object cast macro
> >> + */
> >> +#define ACPIERST(obj) \
> >> +    OBJECT_CHECK(ERSTDeviceState, (obj), TYPE_ACPI_ERST)
> >> +
> >> +/*
> >> + * Main ERST device state structure
> >> + */
> >> +typedef struct {
> >> +    PCIDevice parent_obj;
> >> +
> >> +    HostMemoryBackend *hostmem;
> >> +    MemoryRegion *hostmem_mr;
> >> +
> >> +    MemoryRegion iomem; /* programming registes */
> >> +    MemoryRegion nvmem; /* exchange buffer */
> >> +    uint32_t prop_size;  
> > s/^^^/storage_size/  
> Corrected
> 
> >   
> >> +    hwaddr bar0; /* programming registers */
> >> +    hwaddr bar1; /* exchange buffer */  
> > why do you need to keep this addresses around?
> > Suggest to drop these fields and use local variables or pci_get_bar_addr() 
> > at call site.  
> Corrected
> 
> >   
> >> +
> >> +    uint8_t operation;
> >> +    uint8_t busy_status;
> >> +    uint8_t command_status;
> >> +    uint32_t record_offset;
> >> +    uint32_t record_count;
> >> +    uint64_t reg_action;
> >> +    uint64_t reg_value;
> >> +    uint64_t record_identifier;
> >> +
> >> +    unsigned next_record_index;  
> > 
> >   
> >> +    uint8_t record[ERST_RECORD_SIZE]; /* read/written directly by guest */
> >> +    uint8_t tmp_record[ERST_RECORD_SIZE]; /* intermediate manipulation 
> >> buffer */  
> > drop these see [**] below  
> Corrected
> 
> >   
> >> +
> >> +} ERSTDeviceState;
> >> +
> >> +/*******************************************************************/
> >> +/*******************************************************************/
> >> +
> >> +static unsigned copy_from_nvram_by_index(ERSTDeviceState *s, unsigned 
> >> index)
> >> +{
> >> +    /* Read an nvram entry into tmp_record */
> >> +    unsigned rc = ACPI_ERST_STATUS_FAILED;
> >> +    off_t offset = (index * ERST_RECORD_SIZE);
> >> +
> >> +    if ((offset + ERST_RECORD_SIZE) <= s->prop_size) {
> >> +        if (s->hostmem_mr) {
> >> +            uint8_t *p = (uint8_t 
> >> *)memory_region_get_ram_ptr(s->hostmem_mr);
> >> +            memcpy(s->tmp_record, p + offset, ERST_RECORD_SIZE);
> >> +            rc = ACPI_ERST_STATUS_SUCCESS;
> >> +        }
> >> +    }
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned copy_to_nvram_by_index(ERSTDeviceState *s, unsigned index)
> >> +{
> >> +    /* Write entry in tmp_record into nvram, and backing file */
> >> +    unsigned rc = ACPI_ERST_STATUS_FAILED;
> >> +    off_t offset = (index * ERST_RECORD_SIZE);
> >> +
> >> +    if ((offset + ERST_RECORD_SIZE) <= s->prop_size) {
> >> +        if (s->hostmem_mr) {
> >> +            uint8_t *p = (uint8_t 
> >> *)memory_region_get_ram_ptr(s->hostmem_mr);
> >> +            memcpy(p + offset, s->tmp_record, ERST_RECORD_SIZE);
> >> +            rc = ACPI_ERST_STATUS_SUCCESS;
> >> +        }
> >> +    }
> >> +    return rc;
> >> +}
> >> +
> >> +static int lookup_erst_record_by_identifier(ERSTDeviceState *s,
> >> +    uint64_t record_identifier, bool *record_found, bool alloc_for_write)
> >> +{
> >> +    int rc = -1;
> >> +    int empty_index = -1;
> >> +    int index = 0;
> >> +    unsigned rrc;
> >> +
> >> +    *record_found = 0;
> >> +
> >> +    do {
> >> +        rrc = copy_from_nvram_by_index(s, (unsigned)index);  
> > 
> > you have direct access to backend memory so there is no need
> > whatsoever to copy records from it to an intermediate buffer
> > everywhere. Almost all operations with records can be done
> > in place modulo EXECUTE_OPERATION action in BEGIN_[READ|WRITE]
> > context, where record is moved between backend and guest buffer.
> > 
> > So please eliminate all not necessary copying.
> > (for fun, time operations and set backend size to some huge
> > value to see how expensive this code is)  
> 
> I've corrected this. In our previous exchangs, I thought the reference
> to copying was about trying to directly have guest write/read the appropriate
> record in the backend storage. After reading this comment I realized that
> yes I was doing alot of copying (an artifact of the transition away from
> direct file i/o to MemoryBackend). So good find, and I've eliminated the
> intermediate copying.
> 
> >   
> >> +        if (rrc == ACPI_ERST_STATUS_SUCCESS) {
> >> +            uint64_t this_identifier;
> >> +            this_identifier = THE_UEFI_CPER_RECORD_ID(s->tmp_record);
> >> +            if (IS_UEFI_CPER_RECORD(s->tmp_record) &&
> >> +                (this_identifier == record_identifier)) {
> >> +                rc = index;
> >> +                *record_found = 1;
> >> +                break;
> >> +            }
> >> +            if ((this_identifier == ERST_INVALID_RECORD_ID) &&
> >> +                (empty_index < 0)) {
> >> +                empty_index = index; /* first available for write */
> >> +            }
> >> +        }
> >> +        ++index;
> >> +    } while (rrc == ACPI_ERST_STATUS_SUCCESS);
> >> +
> >> +    /* Record not found, allocate for writing */
> >> +    if ((rc < 0) && alloc_for_write) {
> >> +        rc = empty_index;
> >> +    }
> >> +
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned clear_erst_record(ERSTDeviceState *s)
> >> +{
> >> +    unsigned rc = ACPI_ERST_STATUS_RECORD_NOT_FOUND;
> >> +    bool record_found;
> >> +    int index;
> >> +
> >> +    index = lookup_erst_record_by_identifier(s,
> >> +        s->record_identifier, &record_found, 0);
> >> +    if (record_found) {
> >> +        memset(s->tmp_record, 0xFF, ERST_RECORD_SIZE);
> >> +        rc = copy_to_nvram_by_index(s, (unsigned)index);
> >> +        if (rc == ACPI_ERST_STATUS_SUCCESS) {
> >> +            s->record_count -= 1;
> >> +        }
> >> +    }
> >> +
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned write_erst_record(ERSTDeviceState *s)
> >> +{
> >> +    unsigned rc = ACPI_ERST_STATUS_FAILED;
> >> +
> >> +    if (s->record_offset < (ERST_RECORD_SIZE - 
> >> UEFI_CPER_RECORD_MIN_SIZE)) {
> >> +        uint64_t record_identifier;
> >> +        uint8_t *record = &s->record[s->record_offset];
> >> +        bool record_found;
> >> +        int index;
> >> +
> >> +        record_identifier = (s->record_identifier == 
> >> ERST_INVALID_RECORD_ID)
> >> +            ? THE_UEFI_CPER_RECORD_ID(record) : s->record_identifier;
> >> +
> >> +        index = lookup_erst_record_by_identifier(s,
> >> +            record_identifier, &record_found, 1);
> >> +        if (index < 0) {
> >> +            rc = ACPI_ERST_STATUS_NOT_ENOUGH_SPACE;
> >> +        } else {
> >> +            if (0 != s->record_offset) {
> >> +                memset(&s->tmp_record[ERST_RECORD_SIZE - 
> >> s->record_offset],
> >> +                    0xFF, s->record_offset);
> >> +            }
> >> +            memcpy(s->tmp_record, record, ERST_RECORD_SIZE - 
> >> s->record_offset);
> >> +            rc = copy_to_nvram_by_index(s, (unsigned)index);
> >> +            if (rc == ACPI_ERST_STATUS_SUCCESS) {
> >> +                if (!record_found) { /* not overwriting existing record */
> >> +                    s->record_count += 1; /* writing new record */
> >> +                }
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned next_erst_record(ERSTDeviceState *s,
> >> +    uint64_t *record_identifier)
> >> +{
> >> +    unsigned rc = ACPI_ERST_STATUS_RECORD_NOT_FOUND;
> >> +    unsigned index;
> >> +    unsigned rrc;
> >> +
> >> +    *record_identifier = ERST_INVALID_RECORD_ID;
> >> +
> >> +    index = s->next_record_index;
> >> +    do {
> >> +        rrc = copy_from_nvram_by_index(s, (unsigned)index);
> >> +        if (rrc == ACPI_ERST_STATUS_SUCCESS) {
> >> +            if (IS_UEFI_CPER_RECORD(s->tmp_record)) {
> >> +                s->next_record_index = index + 1; /* where to start next 
> >> time */
> >> +                *record_identifier = 
> >> THE_UEFI_CPER_RECORD_ID(s->tmp_record);
> >> +                rc = ACPI_ERST_STATUS_SUCCESS;
> >> +                break;
> >> +            }
> >> +            ++index;
> >> +        } else {
> >> +            if (s->next_record_index == 0) {
> >> +                rc = ACPI_ERST_STATUS_RECORD_STORE_EMPTY;
> >> +            }
> >> +            s->next_record_index = 0; /* at end, reset */
> >> +        }
> >> +    } while (rrc == ACPI_ERST_STATUS_SUCCESS);
> >> +
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned read_erst_record(ERSTDeviceState *s)
> >> +{
> >> +    unsigned rc = ACPI_ERST_STATUS_RECORD_NOT_FOUND;
> >> +    bool record_found;
> >> +    int index;
> >> +
> >> +    index = lookup_erst_record_by_identifier(s,
> >> +        s->record_identifier, &record_found, 0);
> >> +    if (record_found) {
> >> +        rc = copy_from_nvram_by_index(s, (unsigned)index);
> >> +        if (rc == ACPI_ERST_STATUS_SUCCESS) {
> >> +            if (s->record_offset < ERST_RECORD_SIZE) {
> >> +                memcpy(&s->record[s->record_offset], s->tmp_record,
> >> +                    ERST_RECORD_SIZE - s->record_offset);
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    return rc;
> >> +}
> >> +
> >> +static unsigned get_erst_record_count(ERSTDeviceState *s)
> >> +{
> >> +    /* Compute record_count */
> >> +    unsigned index = 0;
> >> +
> >> +    s->record_count = 0;
> >> +    while (copy_from_nvram_by_index(s, index) == 
> >> ACPI_ERST_STATUS_SUCCESS) {
> >> +        uint8_t *ptr = &s->tmp_record[0];
> >> +        uint64_t record_identifier = THE_UEFI_CPER_RECORD_ID(ptr);
> >> +        if (IS_UEFI_CPER_RECORD(ptr) &&
> >> +            (ERST_INVALID_RECORD_ID != record_identifier)) {
> >> +            s->record_count += 1;
> >> +        }
> >> +        ++index;
> >> +    }
> >> +    return s->record_count;
> >> +}
> >> +
> >> +/*******************************************************************/
> >> +
> >> +static uint64_t erst_rd_reg64(hwaddr addr,
> >> +    uint64_t reg, unsigned size)
> >> +{
> >> +    uint64_t rdval;
> >> +    uint64_t mask;
> >> +    unsigned shift;
> >> +
> >> +    if (size == sizeof(uint64_t)) {
> >> +        /* 64b access */
> >> +        mask = 0xFFFFFFFFFFFFFFFFUL;
> >> +        shift = 0;
> >> +    } else {
> >> +        /* 32b access */
> >> +        mask = 0x00000000FFFFFFFFUL;
> >> +        shift = ((addr & 0x4) == 0x4) ? 32 : 0;
> >> +    }
> >> +
> >> +    rdval = reg;
> >> +    rdval >>= shift;
> >> +    rdval &= mask;
> >> +
> >> +    return rdval;
> >> +}
> >> +
> >> +static uint64_t erst_wr_reg64(hwaddr addr,
> >> +    uint64_t reg, uint64_t val, unsigned size)
> >> +{
> >> +    uint64_t wrval;
> >> +    uint64_t mask;
> >> +    unsigned shift;
> >> +
> >> +    if (size == sizeof(uint64_t)) {
> >> +        /* 64b access */
> >> +        mask = 0xFFFFFFFFFFFFFFFFUL;
> >> +        shift = 0;
> >> +    } else {
> >> +        /* 32b access */
> >> +        mask = 0x00000000FFFFFFFFUL;
> >> +        shift = ((addr & 0x4) == 0x4) ? 32 : 0;
> >> +    }
> >> +
> >> +    val &= mask;
> >> +    val <<= shift;
> >> +    mask <<= shift;
> >> +    wrval = reg;
> >> +    wrval &= ~mask;
> >> +    wrval |= val;
> >> +
> >> +    return wrval;
> >> +}  
> > (I see in next patch it's us defining access width in the ACPI tables)
> > so question is: do we have to have mixed register width access?
> > can't all register accesses be 64-bit?  
> 
> Initially I attempted to just use 64-bit exclusively. The problem is that,
> for reasons I don't understand, the OSPM on Linux, even x86_64, breaks a 64b
> register access into two. Here's an example of reading the exchange buffer
> address, which is coded as a 64b access:
> 
> acpi_erst_reg_write addr: 0x0000 <== 0x000000000000000d (size: 4)
> acpi_erst_reg_read  addr: 0x0008 ==> 0x00000000c1010000 (size: 4)
> acpi_erst_reg_read  addr: 0x000c ==> 0x0000000000000000 (size: 4)
> 
> So I went ahead and made ACTION register accesses 32b, else there would
> be two reads of 32-bts, of which the second is useless.

could you post here decompiled ERST table?

> 
> >   
> >> +static void erst_reg_write(void *opaque, hwaddr addr,
> >> +    uint64_t val, unsigned size)
> >> +{
> >> +    ERSTDeviceState *s = (ERSTDeviceState *)opaque;
> >> +
> >> +    /*
> >> +     * NOTE: All actions/operations/side effects happen on the WRITE,
> >> +     * by design. The READs simply return the reg_value contents.
> >> +     */
> >> +    trace_acpi_erst_reg_write(addr, val, size);
> >> +
> >> +    switch (addr) {
> >> +    case ERST_CSR_VALUE + 0:
> >> +    case ERST_CSR_VALUE + 4:
> >> +        s->reg_value = erst_wr_reg64(addr, s->reg_value, val, size);
> >> +        break;
> >> +    case ERST_CSR_ACTION + 0:
> >> +/*  case ERST_CSR_ACTION+4: as coded, not really a 64b register */
> >> +        switch (val) {
> >> +        case ACPI_ERST_ACTION_BEGIN_WRITE_OPERATION:
> >> +        case ACPI_ERST_ACTION_BEGIN_READ_OPERATION:
> >> +        case ACPI_ERST_ACTION_BEGIN_CLEAR_OPERATION:
> >> +        case ACPI_ERST_ACTION_BEGIN_DUMMY_WRITE_OPERATION:
> >> +        case ACPI_ERST_ACTION_END_OPERATION:
> >> +            s->operation = val;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_SET_RECORD_OFFSET:
> >> +            s->record_offset = s->reg_value;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_EXECUTE_OPERATION:
> >> +            if ((uint8_t)s->reg_value == ERST_EXECUTE_OPERATION_MAGIC) {
> >> +                s->busy_status = 1;
> >> +                switch (s->operation) {
> >> +                case ACPI_ERST_ACTION_BEGIN_WRITE_OPERATION:
> >> +                    s->command_status = write_erst_record(s);
> >> +                    break;
> >> +                case ACPI_ERST_ACTION_BEGIN_READ_OPERATION:
> >> +                    s->command_status = read_erst_record(s);
> >> +                    break;
> >> +                case ACPI_ERST_ACTION_BEGIN_CLEAR_OPERATION:
> >> +                    s->command_status = clear_erst_record(s);
> >> +                    break;
> >> +                case ACPI_ERST_ACTION_BEGIN_DUMMY_WRITE_OPERATION:
> >> +                    s->command_status = ACPI_ERST_STATUS_SUCCESS;
> >> +                    break;
> >> +                case ACPI_ERST_ACTION_END_OPERATION:
> >> +                    s->command_status = ACPI_ERST_STATUS_SUCCESS;
> >> +                    break;
> >> +                default:
> >> +                    s->command_status = ACPI_ERST_STATUS_FAILED;
> >> +                    break;
> >> +                }
> >> +                s->record_identifier = ERST_INVALID_RECORD_ID;
> >> +                s->busy_status = 0;
> >> +            }
> >> +            break;
> >> +        case ACPI_ERST_ACTION_CHECK_BUSY_STATUS:
> >> +            s->reg_value = s->busy_status;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_COMMAND_STATUS:
> >> +            s->reg_value = s->command_status;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_RECORD_IDENTIFIER:
> >> +            s->command_status = next_erst_record(s, &s->reg_value);
> >> +            break;
> >> +        case ACPI_ERST_ACTION_SET_RECORD_IDENTIFIER:
> >> +            s->record_identifier = s->reg_value;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_RECORD_COUNT:
> >> +            s->reg_value = s->record_count;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_ERROR_LOG_ADDRESS_RANGE:
> >> +            s->reg_value = s->bar1;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_ERROR_LOG_ADDRESS_LENGTH:
> >> +            s->reg_value = ERST_RECORD_SIZE;
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_ERROR_LOG_ADDRESS_RANGE_ATTRIBUTES:
> >> +            s->reg_value = 0x0; /* intentional, not NVRAM mode */
> >> +            break;
> >> +        case ACPI_ERST_ACTION_GET_EXECUTE_OPERATION_TIMINGS:
> >> +            /*
> >> +             * 100UL is max, 10UL is nominal  
> > 100/10 of what, also add reference to spec/table it comes from
> > and explain in comment why theses values were chosen  
> I've changed the comment and style to be similar to build_amd_iommu().
> These are merely sane non-zero max/min times.
> 
> >   
> >> +             */
> >> +            s->reg_value = ((100UL << 32) | (10UL << 0));
> >> +            break;
> >> +        case ACPI_ERST_ACTION_RESERVED:  
> > not necessary, it will be handled by 'default:'  
> Corrected
> 
> >   
> >> +        default:
> >> +            /*
> >> +             * Unknown action/command, NOP
> >> +             */
> >> +            break;
> >> +        }
> >> +        break;
> >> +    default:
> >> +        /*
> >> +         * This should not happen, but if it does, NOP
> >> +         */
> >> +        break;
> >> +    }
> >> +}
> >> +
> >> +static uint64_t erst_reg_read(void *opaque, hwaddr addr,
> >> +                                unsigned size)
> >> +{
> >> +    ERSTDeviceState *s = (ERSTDeviceState *)opaque;
> >> +    uint64_t val = 0;
> >> +
> >> +    switch (addr) {
> >> +    case ERST_CSR_ACTION + 0:
> >> +    case ERST_CSR_ACTION + 4:
> >> +        val = erst_rd_reg64(addr, s->reg_action, size);
> >> +        break;
> >> +    case ERST_CSR_VALUE + 0:
> >> +    case ERST_CSR_VALUE + 4:
> >> +        val = erst_rd_reg64(addr, s->reg_value, size);
> >> +        break;
> >> +    default:
> >> +        break;
> >> +    }
> >> +    trace_acpi_erst_reg_read(addr, val, size);
> >> +    return val;
> >> +}
> >> +
> >> +static const MemoryRegionOps erst_reg_ops = {
> >> +    .read = erst_reg_read,
> >> +    .write = erst_reg_write,
> >> +    .endianness = DEVICE_NATIVE_ENDIAN,
> >> +};
> >> +
> >> +static void erst_mem_write(void *opaque, hwaddr addr,
> >> +    uint64_t val, unsigned size)
> >> +{
> >> +    ERSTDeviceState *s = (ERSTDeviceState *)opaque;  
> >   
> >> +    uint8_t *ptr = &s->record[addr - 0];
> >> +    trace_acpi_erst_mem_write(addr, val, size);
> >> +    switch (size) {
> >> +    default:
> >> +    case sizeof(uint8_t):
> >> +        *(uint8_t *)ptr = (uint8_t)val;
> >> +        break;
> >> +    case sizeof(uint16_t):
> >> +        *(uint16_t *)ptr = (uint16_t)val;
> >> +        break;
> >> +    case sizeof(uint32_t):
> >> +        *(uint32_t *)ptr = (uint32_t)val;
> >> +        break;
> >> +    case sizeof(uint64_t):
> >> +        *(uint64_t *)ptr = (uint64_t)val;
> >> +        break;
> >> +    }
> >> +}
> >> +
> >> +static uint64_t erst_mem_read(void *opaque, hwaddr addr,
> >> +                                unsigned size)
> >> +{
> >> +    ERSTDeviceState *s = (ERSTDeviceState *)opaque;
> >> +    uint8_t *ptr = &s->record[addr - 0];
> >> +    uint64_t val = 0;
> >> +    switch (size) {
> >> +    default:
> >> +    case sizeof(uint8_t):
> >> +        val = *(uint8_t *)ptr;
> >> +        break;
> >> +    case sizeof(uint16_t):
> >> +        val = *(uint16_t *)ptr;
> >> +        break;
> >> +    case sizeof(uint32_t):
> >> +        val = *(uint32_t *)ptr;
> >> +        break;
> >> +    case sizeof(uint64_t):
> >> +        val = *(uint64_t *)ptr;
> >> +        break;
> >> +    }
> >> +    trace_acpi_erst_mem_read(addr, val, size);
> >> +    return val;
> >> +}
> >> +
> >> +static const MemoryRegionOps erst_mem_ops = {
> >> +    .read = erst_mem_read,
> >> +    .write = erst_mem_write,
> >> +    .endianness = DEVICE_NATIVE_ENDIAN,
> >> +};
> >> +
> >> +/*******************************************************************/
> >> +/*******************************************************************/
> >> +
> >> +static const VMStateDescription erst_vmstate  = {
> >> +    .name = "acpi-erst",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT8(operation, ERSTDeviceState),
> >> +        VMSTATE_UINT8(busy_status, ERSTDeviceState),
> >> +        VMSTATE_UINT8(command_status, ERSTDeviceState),
> >> +        VMSTATE_UINT32(record_offset, ERSTDeviceState),
> >> +        VMSTATE_UINT32(record_count, ERSTDeviceState),
> >> +        VMSTATE_UINT64(reg_action, ERSTDeviceState),
> >> +        VMSTATE_UINT64(reg_value, ERSTDeviceState),
> >> +        VMSTATE_UINT64(record_identifier, ERSTDeviceState),
> >> +        VMSTATE_UINT8_ARRAY(record, ERSTDeviceState, ERST_RECORD_SIZE),
> >> +        VMSTATE_UINT8_ARRAY(tmp_record, ERSTDeviceState, 
> >> ERST_RECORD_SIZE),
> >> +        VMSTATE_END_OF_LIST()
> >> +    }
> >> +};
> >> +
> >> +static void erst_realizefn(PCIDevice *pci_dev, Error **errp)
> >> +{
> >> +    ERSTDeviceState *s = ACPIERST(pci_dev);
> >> +    unsigned index = 0;
> >> +    bool share;
> >> +
> >> +    trace_acpi_erst_realizefn_in();
> >> +
> >> +    if (!s->hostmem) {
> >> +        error_setg(errp, "'" ACPI_ERST_MEMDEV_PROP "' property is not 
> >> set");
> >> +        return;
> >> +    } else if (host_memory_backend_is_mapped(s->hostmem)) {
> >> +        error_setg(errp, "can't use already busy memdev: %s",
> >> +                   
> >> object_get_canonical_path_component(OBJECT(s->hostmem)));
> >> +        return;
> >> +    }
> >> +
> >> +    share = object_property_get_bool(OBJECT(s->hostmem), "share", 
> >> &error_fatal);  
> > s/&error_fatal/errp/  
> Corrected
> 
> >   
> >> +    if (!share) {
> >> +        error_setg(errp, "ACPI ERST requires hostmem property share=on: 
> >> %s",
> >> +                   
> >> object_get_canonical_path_component(OBJECT(s->hostmem)));
> >> +    }  
> > This limits possible to use backends to file|memfd only, so
> > I wonder if really need this limitation, what if user doesn't
> > care about preserving it across QEMU restarts. (i.e. usecase
> > where storage is used as a means to troubleshoot guest crash
> > i.e. QEMU is not restarted in between)
> > 
> > Maybe instead of enforcing we should just document that if user
> > wishes to preserve content they should use file|memfd backend with
> > share=on option.  
> 
> I've removed this check. It is documented the way it is intended to be used.
> 
> >   
> >> +
> >> +    s->hostmem_mr = host_memory_backend_get_memory(s->hostmem);
> >> +
> >> +    /* HostMemoryBackend size will be multiple of PAGE_SIZE */
> >> +    s->prop_size = object_property_get_int(OBJECT(s->hostmem), "size", 
> >> &error_fatal);  
> > s/&error_fatal/errp/  
> Corrected
> 
> >   
> >> +
> >> +    /* Convert prop_size to integer multiple of ERST_RECORD_SIZE */
> >> +    s->prop_size -= (s->prop_size % ERST_RECORD_SIZE);  
> > 
> > pls, no fixups on behalf of user, if size is not what it should be
> > error out with suggestion how to fix it.  
> Removed
> 
> >   
> >> +
> >> +    /*
> >> +     * MemoryBackend initializes contents to zero, but we actually
> >> +     * want contents initialized to 0xFF, ERST_INVALID_RECORD_ID.
> >> +     */
> >> +    if (copy_from_nvram_by_index(s, 0) == ACPI_ERST_STATUS_SUCCESS) {
> >> +        if (s->tmp_record[0] == 0x00) {
> >> +            memset(s->tmp_record, 0xFF, ERST_RECORD_SIZE);  
> > this doesn't scale,
> > (set backend size to more than host physical RAM, put it on slow storage 
> > and have fun.)  
> of course, which is why i think we need to have an upper bound (my early
> submissions did).
> 
> > 
> > Is it possible to use 0 as invalid record id or change storage format
> > so you would not have to rewrite whole file at startup (maybe some sort  
> no
> 
> > of metadata header/records book-keeping table before actual records.
> > And initialize file only if header is invalid.)  
> I have to scan the backend storage anyway in order to initialize the record
> count, so I've combined that scan with a test to see if the backend storage
> needs to be initialized.


if you add a records table at the start of backend,
then you won't need to read/write whole file.
It would be enough to read/initialize header only and access
actual records only when necessary. Header could look something like:

|erst magic string|
|slot size|
|slots nr|
+++++++++++++++++ slots header ++++++++++++
|is_empty, record offset from file start, maybe something else that would speed 
up access|
...
|last record descriptor|
++++++++++ actual records +++++++++++++
|slot 0|
...
|slot n|

> >> +            index = 0;
> >> +            while (copy_to_nvram_by_index(s, index) == 
> >> ACPI_ERST_STATUS_SUCCESS) {
> >> +                ++index;
> >> +            }  
> > also back&forth copying here is not really necessary.  
> corrected
> 
> >   
> >> +        }
> >> +    }
> >> +
> >> +    /* Initialize record_count */
> >> +    get_erst_record_count(s);  
> > why not put it into reset?  
> It is initialized once, then subsequent write/clear operations update
> the counter as needed.

ok

> >   
> >> +
> >> +    /* BAR 0: Programming registers */
> >> +    memory_region_init_io(&s->iomem, OBJECT(pci_dev), &erst_reg_ops, s,
> >> +                          TYPE_ACPI_ERST, ERST_REG_SIZE);
> >> +    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, 
> >> &s->iomem);
> >> +  
> >   
> >> +    /* BAR 1: Exchange buffer memory */
> >> +    memory_region_init_io(&s->nvmem, OBJECT(pci_dev), &erst_mem_ops, s,
> >> +                          TYPE_ACPI_ERST, ERST_RECORD_SIZE);
> >> +    pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_MEMORY, 
> >> &s->nvmem);  
> > 
> > **)
> > instead of using mmio for buffer where each write causes
> > guest exit to QEMU, map memory region directly to guest.
> > see ivshmem_bar2, the only difference with ivshmem, you'd
> > create memory region manually (for example you can use
> > memory_region_init_resizeable_ram)
> > 
> > this way you can speedup access and drop erst_mem_ops and
> > [tmp_]record intermediate buffers.
> > 
> > Instead of [tmp_]record you can copy record content
> > directly between buffer and backend memory regions.  
> 
> I've changed the exchange buffer into a MemoryBackend object and
> eliminated the erst_mem_ops.
> 
> >   
> >> +    /*
> >> +     * The vmstate_register_ram_global() puts the memory in
> >> +     * migration stream, where it is written back to the memory
> >> +     * upon reaching the destination, which causes the backing
> >> +     * file to be updated (with share=on).
> >> +     */
> >> +    vmstate_register_ram_global(s->hostmem_mr);
> >> +
> >> +    trace_acpi_erst_realizefn_out(s->prop_size);
> >> +}
> >> +
> >> +static void erst_reset(DeviceState *dev)
> >> +{
> >> +    ERSTDeviceState *s = ACPIERST(dev);
> >> +
> >> +    trace_acpi_erst_reset_in(s->record_count);
> >> +    s->operation = 0;
> >> +    s->busy_status = 0;
> >> +    s->command_status = ACPI_ERST_STATUS_SUCCESS;  
> >   
> >> +    /* indicate empty/no-more until further notice */  
> > pls rephrase, I'm not sure what it's trying to say  
> Eliminated; I don't know why I was trying to say there either
> >   
> >> +    s->record_identifier = ERST_INVALID_RECORD_ID;
> >> +    s->record_offset = 0;
> >> +    s->next_record_index = 0;  
> >   
> >> +    /* NOTE: record_count and nvram are initialized elsewhere */
> >> +    trace_acpi_erst_reset_out(s->record_count);
> >> +}
> >> +
> >> +static Property erst_properties[] = {
> >> +    DEFINE_PROP_LINK(ACPI_ERST_MEMDEV_PROP, ERSTDeviceState, hostmem,
> >> +                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
> >> +    DEFINE_PROP_END_OF_LIST(),
> >> +};
> >> +
> >> +static void erst_class_init(ObjectClass *klass, void *data)
> >> +{
> >> +    DeviceClass *dc = DEVICE_CLASS(klass);
> >> +    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
> >> +
> >> +    trace_acpi_erst_class_init_in();
> >> +    k->realize = erst_realizefn;
> >> +    k->vendor_id = PCI_VENDOR_ID_REDHAT;
> >> +    k->device_id = PCI_DEVICE_ID_REDHAT_ACPI_ERST;
> >> +    k->revision = 0x00;
> >> +    k->class_id = PCI_CLASS_OTHERS;
> >> +    dc->reset = erst_reset;
> >> +    dc->vmsd = &erst_vmstate;
> >> +    dc->user_creatable = true;
> >> +    device_class_set_props(dc, erst_properties);
> >> +    dc->desc = "ACPI Error Record Serialization Table (ERST) device";
> >> +    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> >> +    trace_acpi_erst_class_init_out();
> >> +}
> >> +
> >> +static const TypeInfo erst_type_info = {
> >> +    .name          = TYPE_ACPI_ERST,
> >> +    .parent        = TYPE_PCI_DEVICE,
> >> +    .class_init    = erst_class_init,
> >> +    .instance_size = sizeof(ERSTDeviceState),
> >> +    .interfaces = (InterfaceInfo[]) {
> >> +        { INTERFACE_CONVENTIONAL_PCI_DEVICE },  
> > what is this for here?
> >   
> >> +        { }
> >> +    }
> >> +};
> >> +
> >> +static void erst_register_types(void)
> >> +{
> >> +    type_register_static(&erst_type_info);
> >> +}
> >> +
> >> +type_init(erst_register_types)
> >> diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
> >> index dd69577..262a8ee 100644
> >> --- a/hw/acpi/meson.build
> >> +++ b/hw/acpi/meson.build
> >> @@ -4,6 +4,7 @@ acpi_ss.add(files(
> >>     'aml-build.c',
> >>     'bios-linker-loader.c',
> >>     'utils.c',
> >> +  'erst.c',
> >>   ))
> >>   acpi_ss.add(when: 'CONFIG_ACPI_CPU_HOTPLUG', if_true: files('cpu.c'))
> >>   acpi_ss.add(when: 'CONFIG_ACPI_CPU_HOTPLUG', if_true: 
> >> files('cpu_hotplug.c'))  
> >   
> 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]