[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Idea : load/stores with pre-decrement / post-increment
From: Paul Cercueil
Subject: Re: Idea : load/stores with pre-decrement / post-increment
Date: Thu, 21 Dec 2023 11:23:40 +0100
Hi Paulo,
Le mardi 19 décembre 2023 à 18:07 -0300, Paulo César Pereira de Andrade
a écrit :
> [snip]
>
> Still very early work in progress, only in "it compiles" state, but
> you might have
> different ideas or extra suggestions to the concept/design.
>
> Sample session:
>
> $ cat ldstxbar.tst
> .disasm
> .data 32
> .code
> prolog
> ldxbr_c %r0 %r1 1
> ldxar_c %r0 %r1 1
> ldxbr_uc %r0 %r1 1
> ldxar_uc %r0 %r1 1
> ldxbr_s %r0 %r1 2
> ldxar_s %r0 %r1 2
> ldxbr_us %r0 %r1 2
> ldxar_us %r0 %r1 2
> ldxbr_i %r0 %r1 4
> ldxar_i %r0 %r1 4
> #if __WORDSIZE == 64
> ldxbr_ui %r0 %r1 4
> ldxar_ui %r0 %r1 4
> ldxbr_l %r0 %r1 8
> ldxar_l %r0 %r1 8
> #endif
> ldxbr_f %f0 %r1 4
> ldxar_f %f0 %r1 4
> ldxbr_d %f0 %r1 8
> ldxar_d %f0 %r1 8
> stxbr_c %r0 %r1 1
> stxar_c %r0 %r1 1
> stxbr_s %r0 %r1 2
> stxar_s %r0 %r1 2
> stxbr_i %r0 %r1 4
> stxar_i %r0 %r1 4
> #if __WORDSIZE == 64
> stxbr_l %r0 %r1 8
> stxar_l %r0 %r1 8
> #endif
> stxbr_f %r0 %f0 4
> stxar_f %r0 %f0 4
> stxbr_d %r0 %f0 8
> stxar_d %r0 %f0 8
> ret
> epilog
>
> ./lightning ldstxbar.tst
> L0: %rbx %r13 %r14 %r15 %r12 /* prolog */
> ldxbr_c %rax %r10 0x1
> ldxar_c %rax %r10 0x1
> ldxbr_uc %rax %r10 0x1
> ldxar_uc %rax %r10 0x1
> ldxbr_s %rax %r10 0x2
> ldxar_s %rax %r10 0x2
> ldxbr_us %rax %r10 0x2
> ldxar_us %rax %r10 0x2
> ldxbr_i %rax %r10 0x4
> ldxar_i %rax %r10 0x4
> ldxbr_ui %rax %r10 0x4
> ldxar_ui %rax %r10 0x4
> ldxbr_l %rax %r10 0x8
> ldxar_l %rax %r10 0x8
> ldxbr_f %xmm8 %r10 0x4
> ldxar_f %xmm8 %r10 0x4
> ldxbr_d %xmm8 %r10 0x8
> ldxar_d %xmm8 %r10 0x8
> stxbr_c %rax %r10 0x1
> stxar_c %rax %r10 0x1
> stxbr_s %rax %r10 0x2
> stxar_s %rax %r10 0x2
> stxbr_i %rax %r10 0x4
> stxar_i %rax %r10 0x4
> stxbr_l %rax %r10 0x8
> stxar_l %rax %r10 0x8
> stxbr_f %rax %xmm8 0x4
> stxar_f %rax %xmm8 0x4
> stxbr_d %rax %xmm8 0x8
> stxar_d %rax %xmm8 0x8
> prepare
> pushargi_l 0x61c388
> \__ movi %rdi 0x61c388
> finishi 0x7f744c699390
> \__ calli 0x7f744c699390
> L1: %rbx %r13 %r14 %r15 %r12
> ret
> L2: %rax %xmm0 /* epilog */
> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
> - - - -
> L0: %rbx %r13 %r14 %r15 %r12 /* prolog */
> 0x7f744cc95000 sub $0x18,%rsp
> 0x7f744cc95004 mov %rbp,(%rsp)
> 0x7f744cc95008 mov %rsp,%rbp
> ldxbr_c %rax %r10 0x1
> 0x7f744cc9500b add $0x1,%r10
> 0x7f744cc9500f movsbq (%r10),%rax
> ldxar_c %rax %r10 0x1
> 0x7f744cc95013 movsbq (%r10),%rax
> 0x7f744cc95017 add $0x1,%r10
> ldxbr_uc %rax %r10 0x1
> 0x7f744cc9501b add $0x1,%r10
> 0x7f744cc9501f movzbq (%r10),%rax
> ldxar_uc %rax %r10 0x1
> 0x7f744cc95023 movzbq (%r10),%rax
> 0x7f744cc95027 add $0x1,%r10
> ldxbr_s %rax %r10 0x2
> 0x7f744cc9502b add $0x2,%r10
> 0x7f744cc9502f movswq (%r10),%rax
> ldxar_s %rax %r10 0x2
> 0x7f744cc95033 movswq (%r10),%rax
> 0x7f744cc95037 add $0x2,%r10
> ldxbr_us %rax %r10 0x2
> 0x7f744cc9503b add $0x2,%r10
> 0x7f744cc9503f movzwq (%r10),%rax
> ldxar_us %rax %r10 0x2
> 0x7f744cc95043 movzwq (%r10),%rax
> 0x7f744cc95047 add $0x2,%r10
> ldxbr_i %rax %r10 0x4
> 0x7f744cc9504b add $0x4,%r10
> 0x7f744cc9504f movslq (%r10),%rax
> ldxar_i %rax %r10 0x4
> 0x7f744cc95052 movslq (%r10),%rax
> 0x7f744cc95055 add $0x4,%r10
> ldxbr_ui %rax %r10 0x4
> 0x7f744cc95059 add $0x4,%r10
> 0x7f744cc9505d movsxd (%r10),%eax
> ldxar_ui %rax %r10 0x4
> 0x7f744cc95060 movsxd (%r10),%eax
> 0x7f744cc95063 add $0x4,%r10
> ldxbr_l %rax %r10 0x8
> 0x7f744cc95067 add $0x8,%r10
> 0x7f744cc9506b mov (%r10),%rax
> ldxar_l %rax %r10 0x8
> 0x7f744cc9506e mov (%r10),%rax
> 0x7f744cc95071 add $0x8,%r10
> ldxbr_f %xmm8 %r10 0x4
> 0x7f744cc95075 add $0x4,%r10
> 0x7f744cc95079 movss (%r10),%xmm8
> ldxar_f %xmm8 %r10 0x4
> 0x7f744cc9507e movss (%r10),%xmm8
> 0x7f744cc95083 add $0x4,%r10
> ldxbr_d %xmm8 %r10 0x8
> 0x7f744cc95087 add $0x8,%r10
> 0x7f744cc9508b movsd (%r10),%xmm8
> ldxar_d %xmm8 %r10 0x8
> 0x7f744cc95090 movsd (%r10),%xmm8
> 0x7f744cc95095 add $0x8,%r10
> stxbr_c %rax %r10 0x1
> 0x7f744cc95099 add $0x1,%rax
> 0x7f744cc9509d mov %r10b,(%rax)
> stxar_c %rax %r10 0x1
> 0x7f744cc950a0 mov %r10b,(%rax)
> 0x7f744cc950a3 add $0x1,%rax
> stxbr_s %rax %r10 0x2
> 0x7f744cc950a7 add $0x2,%rax
> 0x7f744cc950ab mov %r10w,(%rax)
> stxar_s %rax %r10 0x2
> 0x7f744cc950af mov %r10w,(%rax)
> 0x7f744cc950b3 add $0x2,%rax
> stxbr_i %rax %r10 0x4
> 0x7f744cc950b7 add $0x4,%rax
> 0x7f744cc950bb mov %r10d,(%rax)
> stxar_i %rax %r10 0x4
> 0x7f744cc950be mov %r10d,(%rax)
> 0x7f744cc950c1 add $0x4,%rax
> stxbr_l %rax %r10 0x8
> 0x7f744cc950c5 add $0x8,%rax
> 0x7f744cc950c9 mov %r10,(%rax)
> stxar_l %rax %r10 0x8
> 0x7f744cc950cc mov %r10,(%rax)
> 0x7f744cc950cf add $0x8,%rax
> stxbr_f %rax %xmm8 0x4
> 0x7f744cc950d3 add $0x4,%rax
> 0x7f744cc950d7 movsd %xmm8,(%rax)
> stxar_f %rax %xmm8 0x4
> 0x7f744cc950dc movsd %xmm8,(%rax)
> 0x7f744cc950e1 add $0x4,%rax
> stxbr_d %rax %xmm8 0x8
> 0x7f744cc950e5 add $0x8,%rax
> 0x7f744cc950e9 movsd %xmm8,(%rax)
> stxar_d %rax %xmm8 0x8
> 0x7f744cc950ee movsd %xmm8,(%rax)
> 0x7f744cc950f3 add $0x8,%rax
> prepare
> pushargi_l 0x61c388
> \__ movi %rdi 0x61c388
> 0x7f744cc950f7 mov $0x61c388,%edi
> finishi 0x7f744c699390
> \__ calli 0x7f744c699390
> 0x7f744cc950fc call 0x7f744c699390
> L1: %rbx %r13 %r14 %r15 %r12
> ret
> L2: %rax %xmm0 /* epilog */
> 0x7f744cc95101 mov %rbp,%rsp
> 0x7f744cc95104 mov (%rsp),%rbp
> 0x7f744cc95108 add $0x18,%rsp
> 0x7f744cc9510c ret
>
> So, the idea is the pattern:
>
> jit_ldxbr_T(R0, R1, DISP), jit_ldxar_T(R0, R1, DISP)
> jit_stxbr_T(R0, R1, DISP) and jit_stxar_T(R0, R1, DISP)
>
> where the fallback/generic version does addi of DISP in the base
> register (b)efore
> or (a)fter the load and otherwise is a normal jit_ldr_T or jit_str_T.
Do we need DISP? In your examples above it's always equal to sizeof(T)
(or, I guess, it could also be the negative, -sizeof(T)), and that would be
my assumption. I'm not against it, but it sounds a bit error-prone, as
well as redundant, since the suffix (_c, _s, _i, ...) already tells you the
increment/decrement value. Unless you want to accept arbitrary
increment/decrement values (as e.g. ARM supports), but such usage
wouldn't be very typical.
On the other hand... I like that it supports all cases with just 4 new
instructions.
Cheers,
-Paul
- Idea : load/stores with pre-decrement / post-increment, Paul Cercueil, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Marc Nieper-Wißkirchen, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Paul Cercueil, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/18
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/19
- Re: Idea : load/stores with pre-decrement / post-increment,
Paul Cercueil <=
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/21
- Re: Idea : load/stores with pre-decrement / post-increment, Paul Cercueil, 2023/12/21
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/21
- Re: Idea : load/stores with pre-decrement / post-increment, Paulo César Pereira de Andrade, 2023/12/22
Re: Idea : load/stores with pre-decrement / post-increment, Paul Cercueil, 2023/12/18