>From 69b2316a9f8c79d1f889d3b8039b5cd4c381bb95 Mon Sep 17 00:00:00 2001 From: Paulo Cesar Pereira de Andrade Date: Thu, 19 Aug 2010 21:29:13 -0300 Subject: [PATCH] Experimental x86_64 support for any number of integer and float arguments Tested only in calls to varargs C functions, and used the same call convention in some jit functions tested. Still need to ensure the varargs abi is the same as prototyped calls (the abi from x86-64.org specifies usage of SSEUP, that is, use top 64 bits of xmm registers for extra parameters). In this commit, registers xmm0 to xmm7 are used for float arguments, and stack if run out of registers. --- lightning/core-common.h | 2 +- lightning/i386/asm.h | 2 + lightning/i386/core-64.h | 141 ++++++++++++++++++++++++++++++++++++++-------- lightning/i386/fp-64.h | 52 ++++++++++++----- 4 files changed, 159 insertions(+), 38 deletions(-) diff --git a/lightning/core-common.h b/lightning/core-common.h index c9efa9d..370a529 100644 --- a/lightning/core-common.h +++ b/lightning/core-common.h @@ -462,7 +462,7 @@ typedef union jit_code { #define jit_getarg_s(reg, ofs) jit_extr_s_i ((reg), (ofs)) #define jit_getarg_uc(reg, ofs) jit_extr_uc_ui((reg), (ofs)) #define jit_getarg_ui(reg, ofs) jit_movr_ui ((reg), (ofs)) -#define jit_getarg_ul(reg, ofs) jit_extr_uc_ul((reg), (ofs)) +#define jit_getarg_ul(reg, ofs) jit_movr_ul ((reg), (ofs)) #define jit_getarg_us(reg, ofs) jit_extr_us_ul((reg), (ofs)) #else #define jit_getarg_c(reg, ofs) jit_ldxi_c((reg), JIT_AP, (ofs)); diff --git a/lightning/i386/asm.h b/lightning/i386/asm.h index 2dec4b9..7cc2ba0 100644 --- a/lightning/i386/asm.h +++ b/lightning/i386/asm.h @@ -94,6 +94,7 @@ typedef _uc jit_insn; #define _MM5 0x65 #define _MM6 0x66 #define _MM7 0x67 +#define _MM8 0x68 #define _XMM0 0x70 #define _XMM1 0x71 @@ -103,6 +104,7 @@ typedef _uc jit_insn; #define _XMM5 0x75 #define _XMM6 0x76 #define _XMM7 0x77 +#define _XMM8 0x78 #define _ST0 0 #define _ST1 1 diff --git a/lightning/i386/core-64.h 
b/lightning/i386/core-64.h index 4c0c5dc..3b100ee 100644 --- a/lightning/i386/core-64.h +++ b/lightning/i386/core-64.h @@ -36,11 +36,18 @@ /* Used to implement ldc, stc, ... */ #define JIT_CAN_16 0 -#define JIT_REXTMP _R9D +#define JIT_REXTMP _R11D + +/* Number of integer argument registers */ +#define JIT_A_NUM 6 + +/* Number of float argument registers - actually there are 16 as + * top 64 bits are also used to pass arguments */ +#define JIT_FA_NUM 8 #define JIT_R_NUM 3 #define JIT_R(i) ((i) == 0 ? _EAX : _R9D + (i)) -#define JIT_V_NUM 3 +#define JIT_V_NUM 5 #define JIT_V(i) ((i) == 0 ? _EBX : _R11D + (i)) struct jit_local_state { @@ -48,7 +55,10 @@ struct jit_local_state { int nextarg_getfp; int nextarg_putfp; int nextarg_geti; + int nextarg_puti; + int framesize; int argssize; + int fprssize; int alloca_offset; int alloca_slack; }; @@ -123,34 +133,112 @@ struct jit_local_state { #define jit_pushr_i(rs) PUSHQr(rs) #define jit_popr_i(rs) POPQr(rs) -/* A return address is 8 bytes, plus 4 registers = 32 byte, total = 40 bytes. +/* A return address is 8 bytes, plus 6 registers = 48 bytes, total = 56 bytes. The final push of EBX keeps the stack aligned to 16 bytes. 
*/ -#define jit_prolog(n) (_jitl.nextarg_getfp = _jitl.nextarg_geti = 0, _jitl.alloca_offset = 0, \ - PUSHQr(_EBX), PUSHQr(_R12), PUSHQr(_R13), PUSHQr(_EBP), MOVQrr(_ESP, _EBP), PUSHQr(_EBX)) +#define jit_prolog(n) \ + (_jitl.framesize = 56, \ + _jitl.nextarg_getfp = _jitl.nextarg_geti = 0, \ + _jitl.alloca_offset = 0, \ + PUSHQr(_EBX), \ + PUSHQr(_R12), \ + PUSHQr(_R13), \ + PUSHQr(_R14), \ + PUSHQr(_R15), \ + PUSHQr(_EBP), \ + MOVQrr(_ESP, _EBP), \ + PUSHQr(_EBX)) #define jit_calli(sub) (MOVQir((long) (sub), JIT_REXTMP), CALLsr(JIT_REXTMP)) #define jit_callr(reg) CALLsr((reg)) -/* Stack isn't used for arguments: */ -#define jit_prepare_i(ni) (_jitl.argssize = (ni)) - -#define jit_pusharg_i(rs) (--_jitl.argssize, MOVQrr(rs, jit_arg_reg_order[_jitl.argssize])) -#define jit_finish(sub) (MOVQir((long) (sub), JIT_REXTMP), \ - CALLsr(JIT_REXTMP)) +#define jit_prepare_i(ni) \ + (_jitl.argssize = _jitl.nextarg_puti = (ni), \ + _jitl.argssize = _jitl.nextarg_puti > JIT_A_NUM ? \ + _jitl.nextarg_puti - JIT_A_NUM : 0) + +#define jit_pusharg_i(rs) \ + (--_jitl.nextarg_puti, \ + _jitl.nextarg_puti >= JIT_A_NUM ? \ + PUSHQr(rs) : \ + MOVQrr(rs, jit_arg_reg_order[_jitl.nextarg_puti])) +#define jit_finish(sub) \ + (MOVBir(_jitl.fprssize < JIT_FA_NUM ? \ + _jitl.fprssize : JIT_FA_NUM, _AL), \ + jit_calli(sub), \ + (_jitl.argssize ? \ + ADDQir(sizeof(long) * _jitl.argssize, JIT_SP) : \ + 0), \ + _jitl.argssize = 0) #define jit_reg_is_arg(reg) ((reg) == _ECX || (reg) == _EDX) -#define jit_finishr(reg) ((jit_reg_is_arg((reg)) ? MOVQrr(reg, JIT_REXTMP) : (void)0), \ - CALLsr(jit_reg_is_arg((reg)) ? JIT_REXTMP : (reg))) +#define jit_finishr(reg) \ + (MOVBir(_jitl.fprssize < JIT_FA_NUM ? \ + _jitl.fprssize : JIT_FA_NUM, _AL), \ + (jit_reg_is_arg((reg)) ? \ + (MOVQrr(reg, JIT_REXTMP), jit_callr(JIT_REXTMP)) : \ + jit_callr(reg)), \ + (_jitl.argssize ? 
\ + ADDQir(sizeof(long) * _jitl.argssize, JIT_SP) : 0), \ + _jitl.argssize = 0) #define jit_retval_l(rd) ((void)jit_movr_l ((rd), _EAX)) -#define jit_arg_c() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_uc() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_s() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_us() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_i() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_ui() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_l() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_ul() (jit_arg_reg_order[_jitl.nextarg_geti++]) -#define jit_arg_p() (jit_arg_reg_order[_jitl.nextarg_geti++]) +#define jit_arg_i() \ + (_jitl.nextarg_geti < JIT_A_NUM ? \ + _jitl.nextarg_geti++ : \ + ((_jitl.framesize += sizeof(long)) - sizeof(long))) +#define jit_arg_c() jit_arg_i() +#define jit_arg_uc() jit_arg_i() +#define jit_arg_s() jit_arg_i() +#define jit_arg_us() jit_arg_i() +#define jit_arg_ui() jit_arg_i() +#define jit_arg_l() jit_arg_i() +#define jit_arg_ul() jit_arg_i() +#define jit_arg_p() jit_arg_i() + +#define jit_getarg_c(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_extr_c_i((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_c((reg), JIT_FP, ofs))) +#define jit_getarg_uc(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_extr_uc_ui((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_uc((reg), JIT_FP, ofs))) +#define jit_getarg_s(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_extr_s_i((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_s((reg), JIT_FP, ofs))) +#define jit_getarg_us(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_extr_us_ui((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_us((reg), JIT_FP, ofs))) +#define jit_getarg_i(reg, ofs) \ + ((ofs) < JIT_A_NUM ? 
\ + jit_movr_i((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_i((reg), JIT_FP, ofs))) +#define jit_getarg_ui(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_movr_ui((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_ui((reg), JIT_FP, ofs))) +#define jit_getarg_l(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_movr_l((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_l((reg), JIT_FP, ofs))) +#define jit_getarg_ul(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_movr_ul((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_ul((reg), JIT_FP, ofs))) +#define jit_getarg_p(reg, ofs) \ + ((ofs) < JIT_A_NUM ? \ + jit_movr_p((reg), jit_arg_reg_order[(ofs)]) : \ + (_jitl.framesize -= sizeof(long), \ + jit_ldxi_p((reg), JIT_FP, ofs))) static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX, _R8D, _R9D }; @@ -178,7 +266,14 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX, _R8D, _R9D }; #define jit_patch_long_at(jump_pc,v) (*_PSL((jump_pc) - sizeof(long)) = _jit_SL((jit_insn *)(v))) #define jit_patch_short_at(jump_pc,v) (*_PSI((jump_pc) - sizeof(int)) = _jit_SI((jit_insn *)(v) - (jump_pc))) #define jit_patch_at(jump_pc,v) (_jitl.long_jumps ? 
jit_patch_long_at((jump_pc)-3, v) : jit_patch_short_at(jump_pc, v)) -#define jit_ret() (LEAVE_(), POPQr(_R13), POPQr(_R12), POPQr(_EBX), RET_()) +#define jit_ret() \ + (LEAVE_(), \ + POPQr(_R15), \ + POPQr(_R14), \ + POPQr(_R13), \ + POPQr(_R12), \ + POPQr(_EBX), \ + RET_()) /* Memory */ diff --git a/lightning/i386/fp-64.h b/lightning/i386/fp-64.h index 9bb2681..a476fe6 100644 --- a/lightning/i386/fp-64.h +++ b/lightning/i386/fp-64.h @@ -35,10 +35,10 @@ #include -#define JIT_FPR_NUM 9 +#define JIT_FPR_NUM 7 #define JIT_FPRET _XMM0 -#define JIT_FPR(i) (_XMM7 + (i)) -#define JIT_FPTMP _XMM6 +#define JIT_FPR(i) (_XMM8 + (i)) +#define JIT_FPTMP _XMM8 /* Either use a temporary register that is finally AND/OR/XORed with RS = RD, or use RD as the temporary register and to the AND/OR/XOR with RS. */ @@ -290,16 +290,40 @@ union jit_double_imm { #define jit_ordr_d(d, s1, s2) (XORLrr ((d), (d)), UCOMISDrr ((s1), (s2)), SETNPr (jit_reg8((d)))) #define jit_unordr_d(d, s1, s2) (XORLrr ((d), (d)), UCOMISDrr ((s1), (s2)), SETPr (jit_reg8((d)))) -#define jit_prepare_f(num) (_jitl.nextarg_putfp = _XMM0 + (num)) -#define jit_prepare_d(num) (_jitl.nextarg_putfp = _XMM0 + (num)) - -#define jit_arg_f() (_XMM0 + _jitl.nextarg_getfp++) -#define jit_arg_d() (_XMM0 + _jitl.nextarg_getfp++) - -#define jit_getarg_f(rd, ofs) (jit_movr_f ((rd), (ofs))) -#define jit_getarg_d(rd, ofs) (jit_movr_d ((rd), (ofs))) - -#define jit_pusharg_f(rs) (--_jitl.nextarg_putfp, jit_movr_f (_jitl.nextarg_putfp, (rs))) -#define jit_pusharg_d(rs) (--_jitl.nextarg_putfp, jit_movr_d (_jitl.nextarg_putfp, (rs))) +#define jit_prepare_d(num) \ + ((_jitl.nextarg_putfp + (num) > JIT_FA_NUM ? \ + (_jitl.argssize += (_jitl.nextarg_putfp + (num)) - JIT_FA_NUM, \ + _jitl.fprssize = JIT_FA_NUM) : \ + (_jitl.fprssize += (num))), \ + _jitl.nextarg_putfp += (num)) +#define jit_prepare_f(num) jit_prepare_d(num) + +#define jit_arg_d() \ + (_jitl.nextarg_getfp < JIT_FA_NUM ? 
\ + _jitl.nextarg_getfp++ : \ + ((_jitl.framesize += sizeof(double)) - sizeof(double))) +#define jit_arg_f() jit_arg_d() + +#define jit_getarg_f(reg, ofs) \ + ((ofs) < JIT_FA_NUM ? \ + jit_movr_f((reg), _XMM0 + (ofs)) : \ + (_jitl.framesize -= sizeof(double), \ + jit_ldxi_f((reg), JIT_FP, (ofs)))) +#define jit_getarg_d(reg, ofs) \ + ((ofs) < JIT_FA_NUM ? \ + jit_movr_d((reg), _XMM0 + (ofs)) : \ + (_jitl.framesize -= sizeof(double), \ + jit_ldxi_d((reg), JIT_FP, (ofs)))) + +#define jit_pusharg_f(rs) \ + (--_jitl.nextarg_putfp, \ + _jitl.nextarg_putfp >= JIT_FA_NUM ? \ + (SUBQir(sizeof(double), JIT_SP), jit_str_d(JIT_SP,(rs))) : \ + jit_movr_f(_XMM0 + _jitl.nextarg_putfp, (rs))) +#define jit_pusharg_d(rs) \ + (--_jitl.nextarg_putfp, \ + _jitl.nextarg_putfp >= JIT_FA_NUM ? \ + (SUBQir(sizeof(double), JIT_SP), jit_str_d(JIT_SP,(rs))) : \ + jit_movr_d(_XMM0 + _jitl.nextarg_putfp, (rs))) #endif /* __lightning_fp_h */ -- 1.7.2.1