qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dy


From: Alex Bennée
Subject: Re: [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!)
Date: Thu, 04 Aug 2022 17:51:27 +0100
User-agent: mu4e 1.7.27; emacs 28.1.91

Cédric Le Goater <clg@kaod.org> writes:

> Hello Alex,
>
> Thanks for putting some time into this problem,
>
> On 8/4/22 11:20, Alex Bennée wrote:
>> Investigating why some BMC models are so slow compared to a plain ARM
>> virt machines I did some profiling of:
>>    ./qemu-system-arm -M romulus-bmc -nic user \
>>      -drive
>>      file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \
>>      -nographic -serial mon:stdio
>> And saw that object_dynamic_cast was dominating the profile times.
>> We
>> have a number of cases in the CPU hot path and more importantly for
>> this model in the SSI bus. As the class is static once the object is
>> created we just cache it and use it instead of the dynamic case
>> macros.
>> [AJB: I suspect a proper fix for this is for QOM to support a cached
>> class lookup, abortive macro attempt #if 0'd in this patch].
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: Cédric Le Goater <clg@kaod.org>
>
>
> Here are some results,
>
> * romulus-bmc, OpenBmc login prompt
>
>   without : 82s
>   with    : 56s

Looks like I lucked out picking the lowest hanging fruit.

>
> * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt
>
>   without : 30s
>   with    : 22s
>
> * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt
>
>   without : 5.5s
>   with    : 4.1s
>
> There is definitely an improvement in all scenarios.
>
> Applying a similar technique on AspeedSMCClass, I could gain
> another ~10% and boot the ast2500-evb,execute-in-place=true
> machine, up to the U-boot 2019.04 prompt, in less then 20s.

There are some fundamentals to XIP which means they will be slower if
each instruction is being sucked through io_readx/device emulation
although I'm not sure what the exact mechanism is because surely a ROM
can just be mapped into the address space and run from there?

> However, newer u-boot are still quite slow to boot when executing
> from the flash device.

For any of those machines? Whats the next command line for me to dig
into?

>
> Thanks,
>
> C.
>
>> ---
>>   include/hw/core/cpu.h |  2 ++
>>   include/hw/ssi/ssi.h  |  3 +++
>>   include/qom/object.h  | 29 +++++++++++++++++++++++++++++
>>   accel/tcg/cputlb.c    | 12 ++++++++----
>>   hw/ssi/ssi.c          | 10 +++++++---
>>   5 files changed, 49 insertions(+), 7 deletions(-)
>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>> index 500503da13..70027a772e 100644
>> --- a/include/hw/core/cpu.h
>> +++ b/include/hw/core/cpu.h
>> @@ -317,6 +317,8 @@ struct qemu_work_item;
>>   struct CPUState {
>>       /*< private >*/
>>       DeviceState parent_obj;
>> +    /* cache to avoid expensive CPU_GET_CLASS */
>> +    CPUClass *cc;
>>       /*< public >*/
>>         int nr_cores;
>> diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h
>> index f411858ab0..6950f86810 100644
>> --- a/include/hw/ssi/ssi.h
>> +++ b/include/hw/ssi/ssi.h
>> @@ -59,6 +59,9 @@ struct SSIPeripheralClass {
>>   struct SSIPeripheral {
>>       DeviceState parent_obj;
>>   +    /* cache the class */
>> +    SSIPeripheralClass *spc;
>> +
>>       /* Chip select state */
>>       bool cs;
>>   };
>> diff --git a/include/qom/object.h b/include/qom/object.h
>> index ef7258a5e1..2202dbfa43 100644
>> --- a/include/qom/object.h
>> +++ b/include/qom/object.h
>> @@ -198,6 +198,35 @@ struct Object
>>       OBJ_NAME##_CLASS(const void *klass) \
>>       { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); }
>>   +#if 0
>> +/**
>> + * DECLARE_CACHED_CLASS_CHECKER:
>> + * @InstanceType: instance struct name
>> + * @ClassType: class struct name
>> + * @OBJ_NAME: the object name in uppercase with underscore separators
>> + * @TYPENAME: type name
>> + *
>> + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of
>> + * class in the parent object instance. This is useful for very hot
>> + * path code at the expense of an extra indirection and check. As per
>> + * the original direct usage of this macro should be avoided if the
>> + * complete OBJECT_DECLARE_TYPE macro has been used.
>> + *
>> + * This macro will provide the class type cast functions for a
>> + * QOM type.
>> + */
>> +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, 
>> TYPENAME) \
>> +    DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \
>> +    static inline G_GNUC_UNUSED ClassType * \
>> +    OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \
>> +    { \
>> +        InstanceType *p = (InstanceType *) obj; \
>> +        p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\
>> +        return p->cc;                                                   \
>> +    }
>> +
>> +#endif
>> +
>>   /**
>>    * DECLARE_OBJ_CHECKERS:
>>    * @InstanceType: instance struct name
>> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
>> index a46f3a654d..882315f7dd 100644
>> --- a/accel/tcg/cputlb.c
>> +++ b/accel/tcg/cputlb.c
>> @@ -1303,8 +1303,9 @@ static inline ram_addr_t 
>> qemu_ram_addr_from_host_nofail(void *ptr)
>>   static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
>>                        MMUAccessType access_type, int mmu_idx, uintptr_t 
>> retaddr)
>>   {
>> -    CPUClass *cc = CPU_GET_CLASS(cpu);
>> +    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>>       bool ok;
>> +    cpu->cc = cc;
>>         /*
>>        * This is not a probe, so only valid return is success; failure
>> @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, 
>> vaddr addr,
>>                                           MMUAccessType access_type,
>>                                           int mmu_idx, uintptr_t retaddr)
>>   {
>> -    CPUClass *cc = CPU_GET_CLASS(cpu);
>> +    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>> +    cpu->cc = cc;
>>         cc->tcg_ops->do_unaligned_access(cpu, addr, access_type,
>> mmu_idx, retaddr);
>>   }
>> @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState 
>> *cpu, hwaddr physaddr,
>>                                             MemTxResult response,
>>                                             uintptr_t retaddr)
>>   {
>> -    CPUClass *cc = CPU_GET_CLASS(cpu);
>> +    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>> +    cpu->cc = cc;
>>         if (!cpu->ignore_memory_transaction_failures &&
>>           cc->tcg_ops->do_transaction_failed) {
>> @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, 
>> target_ulong addr,
>>       if (!tlb_hit_page(tlb_addr, page_addr)) {
>>           if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
>>               CPUState *cs = env_cpu(env);
>> -            CPUClass *cc = CPU_GET_CLASS(cs);
>> +            CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs);
>> +            cs->cc = cc;
>>                 if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size,
>> access_type,
>>                                          mmu_idx, nonfault, retaddr)) {
>> diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c
>> index 003931fb50..f749feb6e3 100644
>> --- a/hw/ssi/ssi.c
>> +++ b/hw/ssi/ssi.c
>> @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
>>       bool cs = !!level;
>>       assert(n == 0);
>>       if (s->cs != cs) {
>> -        SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s);
>> +        /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */
>> +        SSIPeripheralClass *ssc = s->spc;
>>           if (ssc->set_cs) {
>>               ssc->set_cs(s, cs);
>>           }
>> @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
>>     static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev,
>> uint32_t val)
>>   {
>> -    SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev);
>> +    /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */
>> +    SSIPeripheralClass *ssc = dev->spc;
>>         if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) ||
>>               (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) ||
>> @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error 
>> **errp)
>>               ssc->cs_polarity != SSI_CS_NONE) {
>>           qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1);
>>       }
>> +    s->spc = ssc;
>>         ssc->realize(s, errp);
>>   }
>> @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val)
>>         QTAILQ_FOREACH(kid, &b->children, sibling) {
>>           SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child);
>> -        ssc = SSI_PERIPHERAL_GET_CLASS(peripheral);
>> +        /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */
>> +        ssc = peripheral->spc;
>>           r |= ssc->transfer_raw(peripheral, val);
>>       }
>>   


-- 
Alex Bennée



reply via email to

[Prev in Thread] Current Thread [Next in Thread]