lightning
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Idea : load/stores with pre-decrement / post-increment


From: Paul Cercueil
Subject: Re: Idea : load/stores with pre-decrement / post-increment
Date: Thu, 21 Dec 2023 11:23:40 +0100

Hi Paulo,

Le mardi 19 décembre 2023 à 18:07 -0300, Paulo César Pereira de Andrade
a écrit :
> [snip]
> 
>   Still very early work in progress, only in "it compiles" state, but
> you might have different ideas or extra suggestions for the
> concept/design.
> 
>   Sample session:
> 
> $ cat ldstxbar.tst
> .disasm
> .data 32
> .code
>     prolog
>     ldxbr_c %r0 %r1 1
>     ldxar_c %r0 %r1 1
>     ldxbr_uc %r0 %r1 1
>     ldxar_uc %r0 %r1 1
>     ldxbr_s %r0 %r1 2
>     ldxar_s %r0 %r1 2
>     ldxbr_us %r0 %r1 2
>     ldxar_us %r0 %r1 2
>     ldxbr_i %r0 %r1 4
>     ldxar_i %r0 %r1 4
> #if __WORDSIZE == 64
>     ldxbr_ui %r0 %r1 4
>     ldxar_ui %r0 %r1 4
>     ldxbr_l %r0 %r1 8
>     ldxar_l %r0 %r1 8
> #endif
>     ldxbr_f %f0 %r1 4
>     ldxar_f %f0 %r1 4
>     ldxbr_d %f0 %r1 8
>     ldxar_d %f0 %r1 8
>     stxbr_c %r0 %r1 1
>     stxar_c %r0 %r1 1
>     stxbr_s %r0 %r1 2
>     stxar_s %r0 %r1 2
>     stxbr_i %r0 %r1 4
>     stxar_i %r0 %r1 4
> #if __WORDSIZE == 64
>     stxbr_l %r0 %r1 8
>     stxar_l %r0 %r1 8
> #endif
>     stxbr_f %r0 %f0 4
>     stxar_f %r0 %f0 4
>     stxbr_d %r0 %f0 8
>     stxar_d %r0 %f0 8
>     ret
>     epilog
> 
>  ./lightning ldstxbar.tst
> L0: %rbx %r13 %r14 %r15 %r12 /* prolog */
>     ldxbr_c %rax %r10 0x1
>     ldxar_c %rax %r10 0x1
>     ldxbr_uc %rax %r10 0x1
>     ldxar_uc %rax %r10 0x1
>     ldxbr_s %rax %r10 0x2
>     ldxar_s %rax %r10 0x2
>     ldxbr_us %rax %r10 0x2
>     ldxar_us %rax %r10 0x2
>     ldxbr_i %rax %r10 0x4
>     ldxar_i %rax %r10 0x4
>     ldxbr_ui %rax %r10 0x4
>     ldxar_ui %rax %r10 0x4
>     ldxbr_l %rax %r10 0x8
>     ldxar_l %rax %r10 0x8
>     ldxbr_f %xmm8 %r10 0x4
>     ldxar_f %xmm8 %r10 0x4
>     ldxbr_d %xmm8 %r10 0x8
>     ldxar_d %xmm8 %r10 0x8
>     stxbr_c %rax %r10 0x1
>     stxar_c %rax %r10 0x1
>     stxbr_s %rax %r10 0x2
>     stxar_s %rax %r10 0x2
>     stxbr_i %rax %r10 0x4
>     stxar_i %rax %r10 0x4
>     stxbr_l %rax %r10 0x8
>     stxar_l %rax %r10 0x8
>     stxbr_f %rax %xmm8 0x4
>     stxar_f %rax %xmm8 0x4
>     stxbr_d %rax %xmm8 0x8
>     stxar_d %rax %xmm8 0x8
>     prepare
>     pushargi_l 0x61c388
>      \__ movi %rdi 0x61c388
>     finishi 0x7f744c699390
>      \__ calli 0x7f744c699390
> L1: %rbx %r13 %r14 %r15 %r12
>     ret
> L2: %rax %xmm0 /* epilog */
>   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
> - - - -
> L0: %rbx %r13 %r14 %r15 %r12 /* prolog */
>     0x7f744cc95000    sub    $0x18,%rsp
>     0x7f744cc95004    mov    %rbp,(%rsp)
>     0x7f744cc95008    mov    %rsp,%rbp
>     ldxbr_c %rax %r10 0x1
>     0x7f744cc9500b    add    $0x1,%r10
>     0x7f744cc9500f    movsbq (%r10),%rax
>     ldxar_c %rax %r10 0x1
>     0x7f744cc95013    movsbq (%r10),%rax
>     0x7f744cc95017    add    $0x1,%r10
>     ldxbr_uc %rax %r10 0x1
>     0x7f744cc9501b    add    $0x1,%r10
>     0x7f744cc9501f    movzbq (%r10),%rax
>     ldxar_uc %rax %r10 0x1
>     0x7f744cc95023    movzbq (%r10),%rax
>     0x7f744cc95027    add    $0x1,%r10
>     ldxbr_s %rax %r10 0x2
>     0x7f744cc9502b    add    $0x2,%r10
>     0x7f744cc9502f    movswq (%r10),%rax
>     ldxar_s %rax %r10 0x2
>     0x7f744cc95033    movswq (%r10),%rax
>     0x7f744cc95037    add    $0x2,%r10
>     ldxbr_us %rax %r10 0x2
>     0x7f744cc9503b    add    $0x2,%r10
>     0x7f744cc9503f    movzwq (%r10),%rax
>     ldxar_us %rax %r10 0x2
>     0x7f744cc95043    movzwq (%r10),%rax
>     0x7f744cc95047    add    $0x2,%r10
>     ldxbr_i %rax %r10 0x4
>     0x7f744cc9504b    add    $0x4,%r10
>     0x7f744cc9504f    movslq (%r10),%rax
>     ldxar_i %rax %r10 0x4
>     0x7f744cc95052    movslq (%r10),%rax
>     0x7f744cc95055    add    $0x4,%r10
>     ldxbr_ui %rax %r10 0x4
>     0x7f744cc95059    add    $0x4,%r10
>     0x7f744cc9505d    movsxd (%r10),%eax
>     ldxar_ui %rax %r10 0x4
>     0x7f744cc95060    movsxd (%r10),%eax
>     0x7f744cc95063    add    $0x4,%r10
>     ldxbr_l %rax %r10 0x8
>     0x7f744cc95067    add    $0x8,%r10
>     0x7f744cc9506b    mov    (%r10),%rax
>     ldxar_l %rax %r10 0x8
>     0x7f744cc9506e    mov    (%r10),%rax
>     0x7f744cc95071    add    $0x8,%r10
>     ldxbr_f %xmm8 %r10 0x4
>     0x7f744cc95075    add    $0x4,%r10
>     0x7f744cc95079    movss  (%r10),%xmm8
>     ldxar_f %xmm8 %r10 0x4
>     0x7f744cc9507e    movss  (%r10),%xmm8
>     0x7f744cc95083    add    $0x4,%r10
>     ldxbr_d %xmm8 %r10 0x8
>     0x7f744cc95087    add    $0x8,%r10
>     0x7f744cc9508b    movsd  (%r10),%xmm8
>     ldxar_d %xmm8 %r10 0x8
>     0x7f744cc95090    movsd  (%r10),%xmm8
>     0x7f744cc95095    add    $0x8,%r10
>     stxbr_c %rax %r10 0x1
>     0x7f744cc95099    add    $0x1,%rax
>     0x7f744cc9509d    mov    %r10b,(%rax)
>     stxar_c %rax %r10 0x1
>     0x7f744cc950a0    mov    %r10b,(%rax)
>     0x7f744cc950a3    add    $0x1,%rax
>     stxbr_s %rax %r10 0x2
>     0x7f744cc950a7    add    $0x2,%rax
>     0x7f744cc950ab    mov    %r10w,(%rax)
>     stxar_s %rax %r10 0x2
>     0x7f744cc950af    mov    %r10w,(%rax)
>     0x7f744cc950b3    add    $0x2,%rax
>     stxbr_i %rax %r10 0x4
>     0x7f744cc950b7    add    $0x4,%rax
>     0x7f744cc950bb    mov    %r10d,(%rax)
>     stxar_i %rax %r10 0x4
>     0x7f744cc950be    mov    %r10d,(%rax)
>     0x7f744cc950c1    add    $0x4,%rax
>     stxbr_l %rax %r10 0x8
>     0x7f744cc950c5    add    $0x8,%rax
>     0x7f744cc950c9    mov    %r10,(%rax)
>     stxar_l %rax %r10 0x8
>     0x7f744cc950cc    mov    %r10,(%rax)
>     0x7f744cc950cf    add    $0x8,%rax
>     stxbr_f %rax %xmm8 0x4
>     0x7f744cc950d3    add    $0x4,%rax
>     0x7f744cc950d7    movsd  %xmm8,(%rax)
>     stxar_f %rax %xmm8 0x4
>     0x7f744cc950dc    movsd  %xmm8,(%rax)
>     0x7f744cc950e1    add    $0x4,%rax
>     stxbr_d %rax %xmm8 0x8
>     0x7f744cc950e5    add    $0x8,%rax
>     0x7f744cc950e9    movsd  %xmm8,(%rax)
>     stxar_d %rax %xmm8 0x8
>     0x7f744cc950ee    movsd  %xmm8,(%rax)
>     0x7f744cc950f3    add    $0x8,%rax
>     prepare
>     pushargi_l 0x61c388
>      \__ movi %rdi 0x61c388
>     0x7f744cc950f7    mov    $0x61c388,%edi
>     finishi 0x7f744c699390
>      \__ calli 0x7f744c699390
>     0x7f744cc950fc    call   0x7f744c699390
> L1: %rbx %r13 %r14 %r15 %r12
>     ret
> L2: %rax %xmm0 /* epilog */
>     0x7f744cc95101    mov    %rbp,%rsp
>     0x7f744cc95104    mov    (%rsp),%rbp
>     0x7f744cc95108    add    $0x18,%rsp
>     0x7f744cc9510c    ret
> 
>   So, the idea is the pattern:
> 
> jit_ldxbr_T(R0, R1, DISP), jit_ldxar_T(R0, R1, DISP)
> jit_stxbr_T(R0, R1, DISP) and jit_stxar_T(R0, R1, DISP)
> 
> where the fallback/generic version does an addi of DISP to the base
> register (b)efore or (a)fter the load/store, and otherwise is a normal
> jit_ldr_T or jit_str_T.

Do we need DISP? In your examples above it's always equal to sizeof(T)
(or, I guess, to the negative sizeof(T) as well), and that would be my
assumption. I'm not against it, but it sounds a bit error-prone, as
well as redundant, since the suffix (_c, _s, _i, ...) already tells you
the increment/decrement value. Unless you want to accept arbitrary
increment/decrement values (as e.g. ARM supports), but such usage
wouldn't be very typical.

On the other hand... I like that it supports all cases with just 4 new
instructions.

Cheers,
-Paul

reply via email to

[Prev in Thread] Current Thread [Next in Thread]