qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 12/25] spapr: introduce a XIVE interrupt present


From: Cédric Le Goater
Subject: Re: [Qemu-devel] [PATCH 12/25] spapr: introduce a XIVE interrupt presenter model
Date: Wed, 29 Nov 2017 10:55:34 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.4.0

On 11/29/2017 06:11 AM, David Gibson wrote:
> On Thu, Nov 23, 2017 at 02:29:42PM +0100, Cédric Le Goater wrote:
>> The XIVE interrupt presenter exposes a set of rings, also called
>> Thread Interrupt Management Areas (TIMA), to handle priority
>> management and interrupt acknowledgment among other things. There is
>> one ring per level of privilege, four in all. The one we are
>> interested in for the sPAPR machine is the OS ring.
>>
>> The TIMA is mapped at the same address for each CPU. 'current_cpu' is
>> used to retrieve the targeted interrupt presenter object holding the
>> cache data of the registers the model use.
>>
>> Signed-off-by: Cédric Le Goater <address@hidden>
>> ---
>>  hw/intc/spapr_xive.c        | 271 
>> ++++++++++++++++++++++++++++++++++++++++++++
>>  hw/intc/xive-internal.h     |  89 +++++++++++++++
>>  include/hw/ppc/spapr_xive.h |  11 ++
>>  3 files changed, 371 insertions(+)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index b1e3f8710cff..554b25e0884c 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -23,9 +23,166 @@
>>  #include "sysemu/dma.h"
>>  #include "monitor/monitor.h"
>>  #include "hw/ppc/spapr_xive.h"
>> +#include "hw/ppc/xics.h"
>>  
>>  #include "xive-internal.h"
>>  
>> +struct sPAPRXiveICP {
> 
> I'd really prefer to avoid calling anything in xive "icp" to avoid
> confusion with xics.

OK. 

The specs refers to the whole as an IVPE : Interrupt Virtualization 
Presentation Engine. In our model, we use the TIMA cached values of 
the OS ring and the qemu_irq for the CPU line. 

Would 'sPAPRXivePresenter' be fine ?  


>> +    DeviceState parent_obj;
>> +
>> +    CPUState  *cs;
>> +    uint8_t   tima[TM_RING_COUNT * 0x10];
> 
> What does the 0x10 represent?  #define for clarity, maybe.

yes.

> Do we need to model the whole range as memory, or just the relevant
> pieces with read/write meaning?

Yes. we could limit the TIMA and MMIO region to what sPAPR only needs : 
the OS ring. 

> 
>> +    uint8_t   *tima_os;
>> +    qemu_irq  output;
>> +};
>> +
>> +static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp)
>> +{
>> +    return 0;
>> +}
>> +
>> +static void spapr_xive_icp_set_cppr(sPAPRXiveICP *icp, uint8_t cppr)
>> +{
>> +    if (cppr > XIVE_PRIORITY_MAX) {
>> +        cppr = 0xff;
>> +    }
>> +
>> +    icp->tima_os[TM_CPPR] = cppr;
>> +}
>> +
>> +/*
>> + * Thread Interrupt Management Area MMIO
>> + */
>> +static uint64_t spapr_xive_tm_read_special(sPAPRXiveICP *icp, hwaddr offset,
>> +                                     unsigned size)
>> +{
>> +    uint64_t ret = -1;
>> +
>> +    if (offset == TM_SPC_ACK_OS_REG && size == 2) {
>> +        ret = spapr_xive_icp_accept(icp);
>> +    } else {
>> +        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%"
>> +                      HWADDR_PRIx" size %d\n", offset, size);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static uint64_t spapr_xive_tm_read(void *opaque, hwaddr offset, unsigned 
>> size)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
> 
> So, strictly speaking this could be handled by setting each of the
> CPUs address spaces separately, to something with their own TIMA
> superimposed on address_space_memory. 

Ah. I didn't know we could do that.

> What you have might be more practical though.

well, you will see at the end of the patchset how cpu->intc is assigned.

>> +    sPAPRXiveICP *icp = SPAPR_XIVE_ICP(cpu->intc);
>> +    uint64_t ret = -1;
>> +    int i;
>> +
>> +    if (offset >= TM_SPC_ACK_EBB) {
>> +        return spapr_xive_tm_read_special(icp, offset, size);
>> +    }
>> +
>> +    if ((offset & 0xf0) == TM_QW1_OS) {
>> +        switch (size) {
>> +        case 1:
>> +        case 2:
>> +        case 4:
>> +        case 8:
>> +            if (QEMU_IS_ALIGNED(offset, size)) {
> 
> Hm, the MR subsystem doesn't already split unaligned accesses?

euh. yes, I might be doing a little too much.

>> +                ret = 0;
>> +                for (i = 0; i < size; i++) {
>> +                    ret |= icp->tima[offset + i] << (8 * i);
>> +                }
>> +            } else {
>> +                qemu_log_mask(LOG_GUEST_ERROR,
>> +                              "XIVE: invalid TIMA read alignment @%"
>> +                              HWADDR_PRIx" size %d\n", offset, size);
>> +            }
>> +            break;
>> +        default:
>> +            g_assert_not_reached();
>> +        }
>> +    } else {
>> +        qemu_log_mask(LOG_UNIMP, "XIVE: does handle non-OS TIMA ring @%"
>> +                      HWADDR_PRIx"\n", offset);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static bool spapr_xive_tm_is_readonly(uint8_t offset)
>> +{
>> +    /* Let's be optimistic and prepare ground for HV mode support */
>> +    switch (offset) {
>> +    case TM_QW1_OS + TM_CPPR:
>> +        return false;
>> +    default:
>> +        return true;
>> +    }
>> +}
>> +
>> +static void spapr_xive_tm_write_special(sPAPRXiveICP *icp, hwaddr offset,
>> +                                  uint64_t value, unsigned size)
>> +{
>> +    /* TODO: support TM_SPC_SET_OS_PENDING */
>> +
>> +    /* TODO: support TM_SPC_ACK_OS_EL */
>> +}
>> +
>> +static void spapr_xive_tm_write(void *opaque, hwaddr offset,
>> +                           uint64_t value, unsigned size)
>> +{
>> +    PowerPCCPU *cpu = POWERPC_CPU(current_cpu);
>> +    sPAPRXiveICP *icp = SPAPR_XIVE_ICP(cpu->intc);
>> +    int i;
>> +
>> +    if (offset >= TM_SPC_ACK_EBB) {
>> +        spapr_xive_tm_write_special(icp, offset, value, size);
>> +        return;
>> +    }
>> +
>> +    if ((offset & 0xf0) == TM_QW1_OS) {
>> +        switch (size) {
>> +        case 1:
>> +            if (offset == TM_QW1_OS + TM_CPPR) {
>> +                spapr_xive_icp_set_cppr(icp, value & 0xff);
>> +            }
>> +            break;
>> +        case 4:
>> +        case 8:
>> +            if (QEMU_IS_ALIGNED(offset, size)) {
>> +                for (i = 0; i < size; i++) {
>> +                    if (!spapr_xive_tm_is_readonly(offset + i)) {
>> +                        icp->tima[offset + i] = (value >> (8 * i)) & 0xff;
>> +                    }
>> +                }
>> +            } else {
>> +                qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA write @%"
>> +                              HWADDR_PRIx" size %d\n", offset, size);
>> +            }
>> +            break;
>> +        default:
>> +            qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA write @%"
>> +                          HWADDR_PRIx" size %d\n", offset, size);
>> +        }
>> +    } else {
>> +        qemu_log_mask(LOG_UNIMP, "XIVE: does handle non-OS TIMA ring @%"
>> +                      HWADDR_PRIx"\n", offset);
> 
> The many qemu_log()s worry me a little.  They're not ratelimited, so
> the guest could in principle chew through the host's log space.
> 
> IIUC these are very unlikely to be hit in practice, so maybe
> tracepoints would be more suitable.

ok.

>> +    }
>> +}
>> +
>> +
>> +static const MemoryRegionOps spapr_xive_tm_ops = {
>> +    .read = spapr_xive_tm_read,
>> +    .write = spapr_xive_tm_write,
>> +    .endianness = DEVICE_BIG_ENDIAN,
>> +    .valid = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +    .impl = {
>> +        .min_access_size = 1,
>> +        .max_access_size = 8,
>> +    },
>> +};
>> +
>>  static void spapr_xive_irq(sPAPRXive *xive, int lisn)
>>  {
>>  
>> @@ -287,6 +444,11 @@ static void spapr_xive_source_set_irq(void *opaque, int 
>> lisn, int val)
>>  #define VC_BAR_SIZE      0x08000000000ull
>>  #define ESB_SHIFT        16 /* One 64k page. OPAL has two */
>>  
>> +/* Thread Interrupt Management Area MMIO */
>> +#define TM_BAR_DEFAULT   0x30203180000ull
>> +#define TM_SHIFT         16
>> +#define TM_BAR_SIZE      (TM_RING_COUNT * (1 << TM_SHIFT))
>> +
>>  static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset,
>>                                              unsigned size)
>>  {
>> @@ -392,6 +554,14 @@ static void spapr_xive_realize(DeviceState *dev, Error 
>> **errp)
>>                            (1ull << xive->esb_shift) * xive->nr_irqs);
>>      memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem);
>>  
>> +    /* TM BAR. Same address for each chip */
>> +    xive->tm_base = (P9_MMIO_BASE | TM_BAR_DEFAULT);
>> +    xive->tm_shift = TM_SHIFT;
> 
> Any reason for this to be a variable?

no, we could just use TM_SHIFT. I will look into it.

>> +
>> +    memory_region_init_io(&xive->tm_iomem, OBJECT(xive), &spapr_xive_tm_ops,
>> +                          xive, "xive.tm", TM_BAR_SIZE);
>> +    sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_iomem);
>> +
>>      qemu_register_reset(spapr_xive_reset, dev);
>>  }
>>  
>> @@ -448,9 +618,110 @@ static const TypeInfo spapr_xive_info = {
>>      .class_init = spapr_xive_class_init,
>>  };
>>  
>> +void spapr_xive_icp_pic_print_info(sPAPRXiveICP *xicp, Monitor *mon)
>> +{
>> +    int cpu_index = xicp->cs ? xicp->cs->cpu_index : -1;
>> +
>> +    monitor_printf(mon, "CPU %d CPPR=%02x IPB=%02x PIPR=%02x NSR=%02x\n",
>> +                   cpu_index, xicp->tima_os[TM_CPPR], xicp->tima_os[TM_IPB],
>> +                   xicp->tima_os[TM_PIPR], xicp->tima_os[TM_NSR]);
>> +}
>> +
>> +static void spapr_xive_icp_reset(void *dev)
>> +{
>> +    sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>> +
>> +    memset(xicp->tima, 0, sizeof(xicp->tima));
>> +}
>> +
>> +static void spapr_xive_icp_realize(DeviceState *dev, Error **errp)
>> +{
>> +    sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev);
>> +    PowerPCCPU *cpu;
>> +    CPUPPCState *env;
>> +    Object *obj;
>> +    Error *err = NULL;
>> +
>> +    obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err);
>> +    if (!obj) {
>> +        error_propagate(errp, err);
>> +        error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: ");
>> +        return;
>> +    }
>> +
>> +    cpu = POWERPC_CPU(obj);
>> +    xicp->cs = CPU(obj);
>> +
>> +    env = &cpu->env;
>> +    switch (PPC_INPUT(env)) {
>> +    case PPC_FLAGS_INPUT_POWER7:
>> +        xicp->output = env->irq_inputs[POWER7_INPUT_INT];
>> +        break;
>> +
>> +    case PPC_FLAGS_INPUT_970:
>> +        xicp->output = env->irq_inputs[PPC970_INPUT_INT];
>> +        break;
> 
> I really don't think we need to implement XIVE for 970.

Indeed. This is a left over from a copy/paste of the ICPState 
realize routine.

>> +
>> +    default:
>> +        error_setg(errp, "XIVE interrupt controller does not support "
>> +                   "this CPU bus model");
>> +        return;
>> +    }
>> +
>> +    qemu_register_reset(spapr_xive_icp_reset, dev);
>> +}
>> +
>> +static void spapr_xive_icp_unrealize(DeviceState *dev, Error **errp)
>> +{
>> +    qemu_unregister_reset(spapr_xive_icp_reset, dev);
>> +}
>> +
>> +static void spapr_xive_icp_init(Object *obj)
>> +{
>> +    sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(obj);
>> +
>> +    xicp->tima_os = &xicp->tima[TM_QW1_OS];
> 
> This is a fixed offset, so why store it as a pointer.  For the PAPR
> guest case, do we even need to model the other rings?

No we don't. I will simplify.

Thanks,

C. 

> 
>> +}
>> +
>> +static bool vmstate_spapr_xive_icp_needed(void *opaque)
>> +{
>> +    /* TODO check machine XIVE support */
>> +    return true;
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_xive_icp = {
>> +    .name = TYPE_SPAPR_XIVE_ICP,
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = vmstate_spapr_xive_icp_needed,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_BUFFER(tima, sPAPRXiveICP),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>> +static void spapr_xive_icp_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    dc->realize = spapr_xive_icp_realize;
>> +    dc->unrealize = spapr_xive_icp_unrealize;
>> +    dc->desc = "sPAPR XIVE Interrupt Presenter";
>> +    dc->vmsd = &vmstate_spapr_xive_icp;
>> +}
>> +
>> +static const TypeInfo xive_icp_info = {
>> +    .name          = TYPE_SPAPR_XIVE_ICP,
>> +    .parent        = TYPE_DEVICE,
>> +    .instance_size = sizeof(sPAPRXiveICP),
>> +    .instance_init = spapr_xive_icp_init,
>> +    .class_init    = spapr_xive_icp_class_init,
>> +};
>> +
>>  static void spapr_xive_register_types(void)
>>  {
>>      type_register_static(&spapr_xive_info);
>> +    type_register_static(&xive_icp_info);
>>  }
>>  
>>  type_init(spapr_xive_register_types)
>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
>> index bea88d82992c..7d329f203a9b 100644
>> --- a/hw/intc/xive-internal.h
>> +++ b/hw/intc/xive-internal.h
>> @@ -24,6 +24,93 @@
>>  #define PPC_BITMASK32(bs, be)   ((PPC_BIT32(bs) - PPC_BIT32(be)) | \
>>                                   PPC_BIT32(bs))
>>  
>> +/*
>> + * Thread Management (aka "TM") registers
> 
> Because "TM" didn't stand for enough things already :/.
> 
>> + */
>> +
>> +/* Number of Thread Management Interrupt Areas */
>> +#define TM_RING_COUNT 4
>> +
>> +/* TM register offsets */
>> +#define TM_QW0_USER             0x000 /* All rings */
>> +#define TM_QW1_OS               0x010 /* Ring 0..2 */
>> +#define TM_QW2_HV_POOL          0x020 /* Ring 0..1 */
>> +#define TM_QW3_HV_PHYS          0x030 /* Ring 0..1 */
>> +
>> +/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
>> +#define TM_NSR                  0x0  /*  +   +   -   +  */
>> +#define TM_CPPR                 0x1  /*  -   +   -   +  */
>> +#define TM_IPB                  0x2  /*  -   +   +   +  */
>> +#define TM_LSMFB                0x3  /*  -   +   +   +  */
>> +#define TM_ACK_CNT              0x4  /*  -   +   -   -  */
>> +#define TM_INC                  0x5  /*  -   +   -   +  */
>> +#define TM_AGE                  0x6  /*  -   +   -   +  */
>> +#define TM_PIPR                 0x7  /*  -   +   -   +  */
>> +
>> +#define TM_WORD0                0x0
>> +#define TM_WORD1                0x4
>> +
>> +/*
>> + * QW word 2 contains the valid bit at the top and other fields
>> + * depending on the QW.
>> + */
>> +#define TM_WORD2                0x8
>> +#define   TM_QW0W2_VU           PPC_BIT32(0)
>> +#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
>> +#define   TM_QW1W2_VO           PPC_BIT32(0)
>> +#define   TM_QW1W2_OS_CAM       PPC_BITMASK32(8, 31)
>> +#define   TM_QW2W2_VP           PPC_BIT32(0)
>> +#define   TM_QW2W2_POOL_CAM     PPC_BITMASK32(8, 31)
>> +#define   TM_QW3W2_VT           PPC_BIT32(0)
>> +#define   TM_QW3W2_LP           PPC_BIT32(6)
>> +#define   TM_QW3W2_LE           PPC_BIT32(7)
>> +#define   TM_QW3W2_T            PPC_BIT32(31)
>> +
>> +/*
>> + * In addition to normal loads to "peek" and writes (only when invalid)
>> + * using 4 and 8 bytes accesses, the above registers support these
>> + * "special" byte operations:
>> + *
>> + *   - Byte load from QW0[NSR] - User level NSR (EBB)
>> + *   - Byte store to QW0[NSR] - User level NSR (EBB)
>> + *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
>> + *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
>> + *                                    otherwise VT||0000000
>> + *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
>> + *
>> + * Then we have all these "special" CI ops at these offset that trigger
>> + * all sorts of side effects:
>> + */
>> +#define TM_SPC_ACK_EBB          0x800   /* Load8 ack EBB to reg*/
>> +#define TM_SPC_ACK_OS_REG       0x810   /* Load16 ack OS irq to reg */
>> +#define TM_SPC_PUSH_USR_CTX     0x808   /* Store32 Push/Validate user 
>> context */
>> +#define TM_SPC_PULL_USR_CTX     0x808   /* Load32 Pull/Invalidate user
>> +                                         * context */
>> +#define TM_SPC_SET_OS_PENDING   0x812   /* Store8 Set OS irq pending bit */
>> +#define TM_SPC_PULL_OS_CTX      0x818   /* Load32/Load64 Pull/Invalidate OS
>> +                                         * context to reg */
>> +#define TM_SPC_PULL_POOL_CTX    0x828   /* Load32/Load64 Pull/Invalidate 
>> Pool
>> +                                         * context to reg*/
>> +#define TM_SPC_ACK_HV_REG       0x830   /* Load16 ack HV irq to reg */
>> +#define TM_SPC_PULL_USR_CTX_OL  0xc08   /* Store8 Pull/Inval usr ctx to odd
>> +                                         * line */
>> +#define TM_SPC_ACK_OS_EL        0xc10   /* Store8 ack OS irq to even line */
>> +#define TM_SPC_ACK_HV_POOL_EL   0xc20   /* Store8 ack HV evt pool to even
>> +                                         * line */
>> +#define TM_SPC_ACK_HV_EL        0xc30   /* Store8 ack HV irq to even line */
>> +/* XXX more... */
>> +
>> +/* NSR fields for the various QW ack types */
>> +#define TM_QW0_NSR_EB           PPC_BIT8(0)
>> +#define TM_QW1_NSR_EO           PPC_BIT8(0)
>> +#define TM_QW3_NSR_HE           PPC_BITMASK8(0, 1)
>> +#define  TM_QW3_NSR_HE_NONE     0
>> +#define  TM_QW3_NSR_HE_POOL     1
>> +#define  TM_QW3_NSR_HE_PHYS     2
>> +#define  TM_QW3_NSR_HE_LSI      3
>> +#define TM_QW3_NSR_I            PPC_BIT8(2)
>> +#define TM_QW3_NSR_GRP_LVL      PPC_BIT8(3, 7)
>> +
>>  /* IVE/EAS
>>   *
>>   * One per interrupt source. Targets that interrupt to a given EQ
>> @@ -44,6 +131,8 @@ typedef struct XiveIVE {
>>  #define IVE_EQ_DATA     PPC_BITMASK(33, 63)      /* Data written to the EQ 
>> */
>>  } XiveIVE;
>>  
>> +#define XIVE_PRIORITY_MAX  7
>> +
>>  void spapr_xive_reset(void *dev);
>>  XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn);
>>  
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index 7a308fb4db2b..6e8a189e723f 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -23,10 +23,15 @@
>>  
>>  typedef struct sPAPRXive sPAPRXive;
>>  typedef struct XiveIVE XiveIVE;
>> +typedef struct sPAPRXiveICP sPAPRXiveICP;
>>  
>>  #define TYPE_SPAPR_XIVE "spapr-xive"
>>  #define SPAPR_XIVE(obj) OBJECT_CHECK(sPAPRXive, (obj), TYPE_SPAPR_XIVE)
>>  
>> +#define TYPE_SPAPR_XIVE_ICP "spapr-xive-icp"
>> +#define SPAPR_XIVE_ICP(obj) \
>> +    OBJECT_CHECK(sPAPRXiveICP, (obj), TYPE_SPAPR_XIVE_ICP)
>> +
>>  struct sPAPRXive {
>>      SysBusDevice parent;
>>  
>> @@ -57,6 +62,11 @@ struct sPAPRXive {
>>      hwaddr       esb_base;
>>      MemoryRegion esb_mr;
>>      MemoryRegion esb_iomem;
>> +
>> +    /* TIMA memory region */
>> +    uint32_t     tm_shift;
>> +    hwaddr       tm_base;
>> +    MemoryRegion tm_iomem;
>>  };
>>  
>>  static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn)
>> @@ -67,5 +77,6 @@ static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, 
>> int lisn)
>>  bool spapr_xive_irq_set(sPAPRXive *xive, uint32_t lisn, bool lsi);
>>  bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn);
>>  void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon);
>> +void spapr_xive_icp_pic_print_info(sPAPRXiveICP *xicp, Monitor *mon);
>>  
>>  #endif /* PPC_SPAPR_XIVE_H */
> 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]