guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 45/437: add floating-point for x86-64


From: Andy Wingo
Subject: [Guile-commits] 45/437: add floating-point for x86-64
Date: Mon, 2 Jul 2018 05:13:42 -0400 (EDT)

wingo pushed a commit to branch lightning
in repository guile.

commit 58c4dcea4396193ec4ac18b54ff84dba0c12719c
Author: Paolo Bonzini <address@hidden>
Date:   Thu Nov 23 09:01:19 2006 +0000

    add floating-point for x86-64
    
    git-archimport-id: address@hidden/lightning--stable--1.2--patch-49
---
 ChangeLog                  |  14 +++
 NEWS                       |   5 +
 lightning/core-common.h    |  24 ++--
 lightning/i386/asm-32.h    |   7 +-
 lightning/i386/asm-64.h    |  36 ++++++
 lightning/i386/asm-i386.h  | 279 ++++++++++++++++++++++++++++++++++++++++++++-
 lightning/i386/core-64.h   |  46 +++++---
 lightning/i386/core-i386.h |   5 -
 lightning/i386/fp-32.h     |   5 +
 lightning/i386/fp-64.h     | 260 +++++++++++++++++++++++++++++++++++++++++-
 lightning/sparc/fp.h       |   4 +-
 11 files changed, 640 insertions(+), 45 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a749ceb..79dbe91 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2006-11-23  Paolo Bonzini  <address@hidden>
+
+       * lightning/core-common.h: Add casts in "*i_p" variants.
+       * lightning/i386/asm-32.h: Add _r1.
+       * lightning/i386/asm-64.h: Likewise, and add SSE instructions.
+       * lightning/i386/asm-i386.h: Merge SSE instructions from Gwenole.
Use short form for 16-bit AX instructions.  Remove _r1.
+       * lightning/i386/core-64.h: Add FP ABI support in its infancy.
+       * lightning/i386/core-i386.h: Move jit_arg_f and jit_arg_d...
+       * lightning/i386/core-32.h: ... and jit_prepare_f and jit_prepare_d...
+       * lightning/i386/fp-32.h: ... here.
+       * lightning/i386/fp-64.h: Write the code.
+       * lightning/sparc/fp.h: Fix jit_extr_{f_d,d_f} register order.
+       
 2006-11-22  Paolo Bonzini  <address@hidden>
 
        * lightning/i386/asm-i386.h: Move x86-64 instructions...
diff --git a/NEWS b/NEWS
index 5869867..be75675 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ NEWS FROM VERSION 1.2 TO 1.3
 
 o   Initial support for x86-64 back-end (mostly untested).
 
+o   lightning is more strict on casts from integer to pointer.
+    Be sure to use the _p variants when your immediates are
+    of pointer type.  This was done to ease 64-bit cleanliness
+    tests.
+
 o   Many bug fixes.
 
 o   JIT_FPRET is used as JIT_RET to move return values.
diff --git a/lightning/core-common.h b/lightning/core-common.h
index d9edaab..1a90c57 100644
--- a/lightning/core-common.h
+++ b/lightning/core-common.h
@@ -396,30 +396,30 @@ typedef union jit_code {
 #define jit_bmsi_ul(label, rs, is)     jit_bmsi_l((label), (rs), (is))
 
 #define jit_ltr_p(d, s1, s2)           jit_ltr_ul((d), (s1), (s2))
-#define jit_lti_p(d, rs, is)           jit_lti_ul((d), (rs), (is))
+#define jit_lti_p(d, rs, is)           jit_lti_ul((d), (rs), (long)(is))
 #define jit_ler_p(d, s1, s2)           jit_ler_ul((d), (s1), (s2))
-#define jit_lei_p(d, rs, is)           jit_lei_ul((d), (rs), (is))
+#define jit_lei_p(d, rs, is)           jit_lei_ul((d), (rs), (long)(is))
 #define jit_gtr_p(d, s1, s2)           jit_gtr_ul((d), (s1), (s2))
-#define jit_gti_p(d, rs, is)           jit_gti_ul((d), (rs), (is))
+#define jit_gti_p(d, rs, is)           jit_gti_ul((d), (rs), (long)(is))
 #define jit_ger_p(d, s1, s2)           jit_ger_ul((d), (s1), (s2))
-#define jit_gei_p(d, rs, is)           jit_gei_ul((d), (rs), (is))
+#define jit_gei_p(d, rs, is)           jit_gei_ul((d), (rs), (long)(is))
 #define jit_eqr_p(d, s1, s2)           jit_eqr_ul((d), (s1), (s2))
-#define jit_eqi_p(d, rs, is)           jit_eqi_ul((d), (rs), (is))
+#define jit_eqi_p(d, rs, is)           jit_eqi_ul((d), (rs), (long)(is))
 #define jit_ner_p(d, s1, s2)           jit_ner_ul((d), (s1), (s2))
-#define jit_nei_p(d, rs, is)           jit_nei_ul((d), (rs), (is))
+#define jit_nei_p(d, rs, is)           jit_nei_ul((d), (rs), (long)(is))
 
 #define jit_bltr_p(label, s1, s2)      jit_bltr_ul((label), (s1), (s2))
-#define jit_blti_p(label, rs, is)      jit_blti_ul((label), (rs), (is))
+#define jit_blti_p(label, rs, is)      jit_blti_ul((label), (rs), (long)(is))
 #define jit_bler_p(label, s1, s2)      jit_bler_ul((label), (s1), (s2))
-#define jit_blei_p(label, rs, is)      jit_blei_ul((label), (rs), (is))
+#define jit_blei_p(label, rs, is)      jit_blei_ul((label), (rs), (long)(is))
 #define jit_bgtr_p(label, s1, s2)      jit_bgtr_ul((label), (s1), (s2))
-#define jit_bgti_p(label, rs, is)      jit_bgti_ul((label), (rs), (is))
+#define jit_bgti_p(label, rs, is)      jit_bgti_ul((label), (rs), (long)(is))
 #define jit_bger_p(label, s1, s2)      jit_bger_ul((label), (s1), (s2))
-#define jit_bgei_p(label, rs, is)      jit_bgei_ul((label), (rs), (is))
+#define jit_bgei_p(label, rs, is)      jit_bgei_ul((label), (rs), (long)(is))
 #define jit_beqr_p(label, s1, s2)      jit_beqr_ul((label), (s1), (s2))
-#define jit_beqi_p(label, rs, is)      jit_beqi_ul((label), (rs), (is))
+#define jit_beqi_p(label, rs, is)      jit_beqi_ul((label), (rs), (long)(is))
 #define jit_bner_p(label, s1, s2)      jit_bner_ul((label), (s1), (s2))
-#define jit_bnei_p(label, rs, is)      jit_bnei_ul((label), (rs), (is))
+#define jit_bnei_p(label, rs, is)      jit_bnei_ul((label), (rs), (long)(is))
 
 #define jit_retval_ui(rd)              jit_retval_i((rd))
 #define jit_retval_uc(rd)              jit_retval_i((rd))
diff --git a/lightning/i386/asm-32.h b/lightning/i386/asm-32.h
index 1945a49..d336cb2 100644
--- a/lightning/i386/asm-32.h
+++ b/lightning/i386/asm-32.h
@@ -45,6 +45,7 @@
 
 #include "asm-i386.h"
 
+#define _r1(R)          ( ((R) & ~3) == _AL || ((R) & ~3) == _AH ? _rN(R) : 
JITFAIL( "8-bit register required"))
 #define _rA(R)          _r4(R)
 
 /* Use RIP-addressing in 64-bit mode, if possible */
@@ -58,14 +59,14 @@
 #define _m64only(X)            JITFAIL("invalid instruction in 32-bit mode")
 #define _m64(X)                        ((void)0)
 
-#define CALLsr(R)                      CALLLsr(R)
-#define JMPsr(R)                       JMPLsr(R)
-
 #define _AH            0x24
 #define _CH            0x25
 #define _DH            0x26
 #define _BH            0x27
 
+#define CALLsr(R)                      CALLLsr(R)
+#define JMPsr(R)                       JMPLsr(R)
+
 #define DECWr(RD)      (_d16(),        _Or             (0x48,_r2(RD)           
                                        ))
 #define DECLr(RD)                      _Or             (0x48,_r4(RD)           
                                        )
 #define INCWr(RD)      (_d16(),        _Or             (0x40,_r2(RD)           
                                        ))
diff --git a/lightning/i386/asm-64.h b/lightning/i386/asm-64.h
index 2280c83..9f5431c 100644
--- a/lightning/i386/asm-64.h
+++ b/lightning/i386/asm-64.h
@@ -127,6 +127,8 @@
 #define _R15           0x4F
 #define _RIP           -2
 
+#define _r1(R)          ( ((unsigned) _rC((R) - 16)) < (0x30 - 16)      ? 
_rN(R) : JITFAIL( "8-bit register required"))
+
 #if 0
 #define _r8(R)         ( (_rC(R) == 0x50)                      ? _rN(R) : 
JITFAIL("64-bit register required"))
 #else
@@ -335,6 +337,40 @@
 
 #define BSWAPQr(R)                     (_REXQrr(0, R),                 _OOr    
        (0x0fc8,_r8(R)                                                  ))
 
+
+
+#define __SSEQrr(OP,RS,RSA,RD,RDA)             (_REXQrr(RD, RS),               
_OO_Mrm         (0x0f00|(OP)    ,_b11,RDA(RD),RSA(RS)                           
))
+#define __SSEQmr(OP,MD,MB,MI,MS,RD,RDA)                (_REXQmr(MB, MI, RD),   
        _OO_r_X         (0x0f00|(OP)         ,RDA(RD)           ,MD,MB,MI,MS    
        ))
+#define __SSEQrm(OP,RS,RSA,MD,MB,MI,MS)                (_REXQrm(RS, MB, MI),   
        _OO_r_X         (0x0f00|(OP)         ,RSA(RS)           ,MD,MB,MI,MS    
        ))
+#define __SSEQ1rm(OP,RS,RSA,MD,MB,MI,MS)       (_REXQrm(RS, MB, MI),           
_OO_r_X         (0x0f01|(OP)         ,RSA(RS)           ,MD,MB,MI,MS            
))
+
+#define _SSEQrr(PX,OP,RS,RSA,RD,RDA)                                   
(_jit_B(PX), __SSEQrr(OP, RS, RSA, RD, RDA))
+#define _SSEQmr(PX,OP,MD,MB,MI,MS,RD,RDA)                              
(_jit_B(PX), __SSEQmr(OP, MD, MB, MI, MS, RD, RDA))
+#define _SSEQrm(PX,OP,RS,RSA,MD,MB,MI,MS)                              
(_jit_B(PX), __SSEQrm(OP, RS, RSA, MD, MB, MI, MS))
+#define _SSEQ1rm(PX,OP,RS,RSA,MD,MB,MI,MS)                             
(_jit_B(PX), __SSEQ1rm(OP, RS, RSA, MD, MB, MI, MS))
+
+#define CVTSS2SIQrr(RS, RD)             _SSEQrr(0xf3, X86_SSE_CVTSI, RS,_rX, 
RD,_r8)
+#define CVTSS2SIQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf3, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r8)
+#define CVTSD2SIQrr(RS, RD)             _SSEQrr(0xf2, X86_SSE_CVTSI, RS,_rX, 
RD,_r8)
+#define CVTSD2SIQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf2, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r8)
+
+#define CVTSI2SSQrr(RS, RD)             _SSEQrr(0xf3, X86_SSE_CVTIS, RS,_r8, 
RD,_rX)
+#define CVTSI2SSQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf3, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
+#define CVTSI2SDQrr(RS, RD)             _SSEQrr(0xf2, X86_SSE_CVTIS, RS,_r8, 
RD,_rX)
+#define CVTSI2SDQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf2, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
+
+#define MOVDQXrr(RS, RD)                _SSEQrr(0x66, 0x6e, RS,_r8, RD,_rX)
+#define MOVDQXmr(MD, MB, MI, MS, RD)    _SSEQmr(0x66, 0x6e, MD, MB, MI, MS, 
RD,_rX)
+
+#define MOVDXQrr(RS, RD)                _SSEQrr(0x66, 0x7e, RS,_rX, RD,_r8)
+#define MOVDXQrm(RS, MD, MB, MI, MS)    _SSEQrm(0x66, 0x7e, RS,_rX, MD, MB, 
MI, MS)
+#define MOVDQMrr(RS, RD)               __SSEQrr(      0x6e, RS,_r8, RD,_rM)
+#define MOVDQMmr(MD, MB, MI, MS, RD)   __SSEQmr(      0x6e, MD, MB, MI, MS, 
RD,_rM)
+#define MOVDMQrr(RS, RD)               __SSEQrr(      0x7e, RS,_rM, RD,_r8)
+#define MOVDMQrm(RS, MD, MB, MI, MS)   __SSEQrm(      0x7e, RS,_rM, MD, MB, 
MI, MS)
+
+
+
 #define CALLsr(R)                      CALLQsr(R)
 #define JMPsr(R)                       JMPQsr(R)
 
diff --git a/lightning/i386/asm-i386.h b/lightning/i386/asm-i386.h
index ffe870e..94d944c 100644
--- a/lightning/i386/asm-i386.h
+++ b/lightning/i386/asm-i386.h
@@ -129,7 +129,6 @@ typedef _uc         jit_insn;
 #define _rM(R)         _rN(R)
 #define _rX(R)         _rN(R)
 #else
-#define _r1(R)         ( ((unsigned) _rC((R) - 16)) < (0x30 - 16)      ? 
_rN(R) : JITFAIL( "8-bit register required"))
 #define _r2(R)         ( (_rC(R) == 0x30)                      ? _rN(R) : 
JITFAIL("16-bit register required"))
 #define _r4(R)         ( (_rC(R) == 0x40)                      ? _rN(R) : 
JITFAIL("32-bit register required"))
 #define _rM(R)         ( (_rC(R) == 0x60)                      ? _rN(R) : 
JITFAIL("MMX register required"))
@@ -314,7 +313,7 @@ enum {
 #define _ALUWrr(OP, RS, RD)            (_d16(), _REXLrr(RS, RD),       _O_Mrm  
        (((OP) << 3) + 1,_b11,_r2(RS),_r2(RD)                           ))
 #define _ALUWmr(OP, MD, MB, MI, MS, RD)        (_d16(), _REXLmr(MB, MI, RD),   
_O_r_X          (((OP) << 3) + 3     ,_r2(RD)           ,MD,MB,MI,MS            
))
 #define _ALUWrm(OP, RS, MD, MB, MI, MS)        (_d16(), _REXLrm(RS, MB, MI),   
_O_r_X          (((OP) << 3) + 1     ,_r2(RS)           ,MD,MB,MI,MS            
))
-#define _ALUWir(OP, IM, RD)            (!_s8P(IM) && (RD) == _AX ? \
+#define _ALUWir(OP, IM, RD)            ((RD) == _AX ? \
                                        (_d16(), _REXLrr(0, RD),        _O_W    
        (((OP) << 3) + 5                                        ,_su16(IM))) : \
                                        (_d16(), _REXLrr(0, RD),        
_Os_Mrm_sW      (0x81           ,_b11,OP     ,_r2(RD)                   
,_su16(IM))) )
 #define _ALUWim(OP, IM, MD, MB, MI, MS)        (_d16(), _REXLrm(0, MB, MI),    
_Os_r_X_sW      (0x81                ,OP                ,MD,MB,MI,MS    
,_su16(IM)))
@@ -1084,7 +1083,7 @@ enum {
 
 #define TESTWrr(RS, RD)                        (_d16(), _REXLrr(RS, RD),       
_O_Mrm          (0x85           ,_b11,_r2(RS),_r2(RD)                           
))
 #define TESTWrm(RS, MD, MB, MI, MS)    (_d16(), _REXLrm(RS, MB, MI),   _O_r_X  
        (0x85                ,_r2(RS)           ,MD,MB,MI,MS            ))
-#define TESTWir(IM, RD)                        (!_s8P(IM) && (RD) == _AX ? \
+#define TESTWir(IM, RD)                        ((RD) == _AX ? \
                                        (_d16(), _REXLrr(0, RD),        _O_W    
        (0xa9                                                   ,_u16(IM))) : \
                                        (_d16(), _REXLrr(0, RD),        
_O_Mrm_W        (0xf7           ,_b11,_b000  ,_r2(RD)                   
,_u16(IM))) )
 #define TESTWim(IM, MD, MB, MI, MS)    (_d16(), _REXLrm(0, MB, MI),    
_O_r_X_W        (0xf7                ,_b000             ,MD,MB,MI,MS    
,_u16(IM)))
@@ -1302,6 +1301,280 @@ enum {
                          JITFAIL(".align argument too large")))
 
 
+/* --- Media 128-bit instructions ------------------------------------------ */
+
+enum {
+  X86_SSE_MOV    = 0x10,
+  X86_SSE_MOVLP  = 0x12,
+  X86_SSE_MOVHP  = 0x16,
+  X86_SSE_MOVA   = 0x28,
+  X86_SSE_CVTIS  = 0x2a,
+  X86_SSE_CVTSI  = 0x2d,
+  X86_SSE_UCOMI  = 0x2e,
+  X86_SSE_COMI   = 0x2f,
+  X86_SSE_SQRT   = 0x51,
+  X86_SSE_RSQRT  = 0x52,
+  X86_SSE_RCP    = 0x53,
+  X86_SSE_AND    = 0x54,
+  X86_SSE_ANDN   = 0x55,
+  X86_SSE_OR     = 0x56,
+  X86_SSE_XOR    = 0x57,
+  X86_SSE_ADD    = 0x58,
+  X86_SSE_MUL    = 0x59,
+  X86_SSE_CVTSD  = 0x5a,
+  X86_SSE_CVTDT  = 0x5b,
+  X86_SSE_SUB    = 0x5c,
+  X86_SSE_MIN    = 0x5d,
+  X86_SSE_DIV    = 0x5e,
+  X86_SSE_MAX    = 0x5f,
+  X86_SSE_MOV2   = 0xd6
+};
+
+/*                                                                             
_format         Opcd            ,Mod ,r      ,m         ,mem=dsp+sib    ,imm... 
*/
+
+#define __SSELrr(OP,RS,RSA,RD,RDA)             (_REXLrr(RD, RS),               
_OO_Mrm         (0x0f00|(OP)    ,_b11,RDA(RD),RSA(RS)                           
))
+#define __SSELmr(OP,MD,MB,MI,MS,RD,RDA)                (_REXLmr(MB, MI, RD),   
        _OO_r_X         (0x0f00|(OP)         ,RDA(RD)           ,MD,MB,MI,MS    
        ))
+#define __SSELrm(OP,RS,RSA,MD,MB,MI,MS)                (_REXLrm(RS, MB, MI),   
        _OO_r_X         (0x0f00|(OP)         ,RSA(RS)           ,MD,MB,MI,MS    
        ))
+#define __SSEL1rm(OP,RS,RSA,MD,MB,MI,MS)       (_REXLrm(RS, MB, MI),           
_OO_r_X         (0x0f01|(OP)         ,RSA(RS)           ,MD,MB,MI,MS            
))
+
+#define _SSELrr(PX,OP,RS,RSA,RD,RDA)                                   
(_jit_B(PX), __SSELrr(OP, RS, RSA, RD, RDA))
+#define _SSELmr(PX,OP,MD,MB,MI,MS,RD,RDA)                              
(_jit_B(PX), __SSELmr(OP, MD, MB, MI, MS, RD, RDA))
+#define _SSELrm(PX,OP,RS,RSA,MD,MB,MI,MS)                              
(_jit_B(PX), __SSELrm(OP, RS, RSA, MD, MB, MI, MS))
+#define _SSEL1rm(PX,OP,RS,RSA,MD,MB,MI,MS)                             
(_jit_B(PX), __SSEL1rm(OP, RS, RSA, MD, MB, MI, MS))
+
+#define _SSEPSrr(OP,RS,RD)             __SSELrr (      OP, RS,_rX, RD,_rX)
+#define _SSEPSmr(OP,MD,MB,MI,MS,RD)    __SSELmr (      OP, MD, MB, MI, MS, 
RD,_rX)
+#define _SSEPSrm(OP,RS,MD,MB,MI,MS)    __SSELrm (      OP, RS,_rX, MD, MB, MI, 
MS)
+#define _SSEPS1rm(OP,RS,MD,MB,MI,MS)   __SSEL1rm(      OP, RS,_rX, MD, MB, MI, 
MS)
+
+#define _SSEPDrr(OP,RS,RD)              _SSELrr (0x66, OP, RS,_rX, RD,_rX)
+#define _SSEPDmr(OP,MD,MB,MI,MS,RD)     _SSELmr (0x66, OP, MD, MB, MI, MS, 
RD,_rX)
+#define _SSEPDrm(OP,RS,MD,MB,MI,MS)     _SSELrm (0x66, OP, RS,_rX, MD, MB, MI, 
MS)
+#define _SSEPD1rm(OP,RS,MD,MB,MI,MS)    _SSEL1rm(0x66, OP, RS,_rX, MD, MB, MI, 
MS)
+
+#define _SSESSrr(OP,RS,RD)              _SSELrr (0xf3, OP, RS,_rX, RD,_rX)
+#define _SSESSmr(OP,MD,MB,MI,MS,RD)     _SSELmr (0xf3, OP, MD, MB, MI, MS, 
RD,_rX)
+#define _SSESSrm(OP,RS,MD,MB,MI,MS)     _SSELrm (0xf3, OP, RS,_rX, MD, MB, MI, 
MS)
+#define _SSESS1rm(OP,RS,MD,MB,MI,MS)    _SSEL1rm(0xf3, OP, RS,_rX, MD, MB, MI, 
MS)
+
+#define _SSESDrr(OP,RS,RD)              _SSELrr (0xf2, OP, RS,_rX, RD,_rX)
+#define _SSESDmr(OP,MD,MB,MI,MS,RD)     _SSELmr (0xf2, OP, MD, MB, MI, MS, 
RD,_rX)
+#define _SSESDrm(OP,RS,MD,MB,MI,MS)     _SSELrm (0xf2, OP, RS,_rX, MD, MB, MI, 
MS)
+#define _SSESD1rm(OP,RS,MD,MB,MI,MS)    _SSEL1rm(0xf2, OP, RS,_rX, MD, MB, MI, 
MS)
+
+#define ADDPSrr(RS, RD)                        _SSEPSrr(X86_SSE_ADD, RS, RD)
+#define ADDPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
+#define ADDPDrr(RS, RD)                        _SSEPDrr(X86_SSE_ADD, RS, RD)
+#define ADDPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
+
+#define ADDSSrr(RS, RD)                        _SSESSrr(X86_SSE_ADD, RS, RD)
+#define ADDSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
+#define ADDSDrr(RS, RD)                        _SSESDrr(X86_SSE_ADD, RS, RD)
+#define ADDSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
+
+#define ANDNPSrr(RS, RD)               _SSEPSrr(X86_SSE_ANDN, RS, RD)
+#define ANDNPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_ANDN, MD, MB, MI, MS, 
RD)
+#define ANDNPDrr(RS, RD)               _SSEPDrr(X86_SSE_ANDN, RS, RD)
+#define ANDNPDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_ANDN, MD, MB, MI, MS, 
RD)
+
+#define ANDNSSrr                       ANDNPSrr
+#define ANDNSSmr                       ANDNPSrr
+#define ANDNSDrr                       ANDNPDrr
+#define ANDNSDmr                       ANDNPDrr
+
+#define ANDPSrr(RS, RD)                        _SSEPSrr(X86_SSE_AND, RS, RD)
+#define ANDPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_AND, MD, MB, MI, MS, 
RD)
+#define ANDPDrr(RS, RD)                        _SSEPDrr(X86_SSE_AND, RS, RD)
+#define ANDPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_AND, MD, MB, MI, MS, 
RD)
+
+#define ANDSSrr                                ANDPSrr
+#define ANDSSmr                                ANDPSrr
+#define ANDSDrr                                ANDPDrr
+#define ANDSDmr                                ANDPDrr
+
+#define DIVPSrr(RS, RD)                        _SSEPSrr(X86_SSE_DIV, RS, RD)
+#define DIVPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
+#define DIVPDrr(RS, RD)                        _SSEPDrr(X86_SSE_DIV, RS, RD)
+#define DIVPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
+
+#define DIVSSrr(RS, RD)                        _SSESSrr(X86_SSE_DIV, RS, RD)
+#define DIVSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
+#define DIVSDrr(RS, RD)                        _SSESDrr(X86_SSE_DIV, RS, RD)
+#define DIVSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
+
+#define MAXPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MAX, RS, RD)
+#define MAXPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
+#define MAXPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MAX, RS, RD)
+#define MAXPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
+
+#define MAXSSrr(RS, RD)                        _SSESSrr(X86_SSE_MAX, RS, RD)
+#define MAXSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
+#define MAXSDrr(RS, RD)                        _SSESDrr(X86_SSE_MAX, RS, RD)
+#define MAXSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
+
+#define MINPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MIN, RS, RD)
+#define MINPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
+#define MINPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MIN, RS, RD)
+#define MINPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
+
+#define MINSSrr(RS, RD)                        _SSESSrr(X86_SSE_MIN, RS, RD)
+#define MINSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
+#define MINSDrr(RS, RD)                        _SSESDrr(X86_SSE_MIN, RS, RD)
+#define MINSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
+
+#define MULPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MUL, RS, RD)
+#define MULPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
+#define MULPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MUL, RS, RD)
+#define MULPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
+
+#define MULSSrr(RS, RD)                        _SSESSrr(X86_SSE_MUL, RS, RD)
+#define MULSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
+#define MULSDrr(RS, RD)                        _SSESDrr(X86_SSE_MUL, RS, RD)
+#define MULSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
+
+#define ORPSrr(RS, RD)                 _SSEPSrr(X86_SSE_OR, RS, RD)
+#define ORPSmr(MD, MB, MI, MS, RD)     _SSEPSmr(X86_SSE_OR, MD, MB, MI, MS, RD)
+#define ORPDrr(RS, RD)                 _SSEPDrr(X86_SSE_OR, RS, RD)
+#define ORPDmr(MD, MB, MI, MS, RD)     _SSEPDmr(X86_SSE_OR, MD, MB, MI, MS, RD)
+
+#define ORSSrr                         ORPSrr
+#define ORSSmr                         ORPSrr
+#define ORSDrr                         ORPDrr
+#define ORSDmr                         ORPDrr
+
+#define RCPPSrr(RS, RD)                        _SSEPSrr(X86_SSE_RCP, RS, RD)
+#define RCPPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_RCP, MD, MB, MI, MS, 
RD)
+#define RCPSSrr(RS, RD)                        _SSESSrr(X86_SSE_RCP, RS, RD)
+#define RCPSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_RCP, MD, MB, MI, MS, 
RD)
+
+#define RSQRTPSrr(RS, RD)              _SSEPSrr(X86_SSE_RSQRT, RS, RD)
+#define RSQRTPSmr(MD, MB, MI, MS, RD)  _SSEPSmr(X86_SSE_RSQRT, MD, MB, MI, MS, 
RD)
+#define RSQRTSSrr(RS, RD)              _SSESSrr(X86_SSE_RSQRT, RS, RD)
+#define RSQRTSSmr(MD, MB, MI, MS, RD)  _SSESSmr(X86_SSE_RSQRT, MD, MB, MI, MS, 
RD)
+
+#define SQRTPSrr(RS, RD)               _SSEPSrr(X86_SSE_SQRT, RS, RD)
+#define SQRTPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
+#define SQRTPDrr(RS, RD)               _SSEPDrr(X86_SSE_SQRT, RS, RD)
+#define SQRTPDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
+
+#define SQRTSSrr(RS, RD)               _SSESSrr(X86_SSE_SQRT, RS, RD)
+#define SQRTSSmr(MD, MB, MI, MS, RD)   _SSESSmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
+#define SQRTSDrr(RS, RD)               _SSESDrr(X86_SSE_SQRT, RS, RD)
+#define SQRTSDmr(MD, MB, MI, MS, RD)   _SSESDmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
+
+#define SUBPSrr(RS, RD)                        _SSEPSrr(X86_SSE_SUB, RS, RD)
+#define SUBPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
+#define SUBPDrr(RS, RD)                        _SSEPDrr(X86_SSE_SUB, RS, RD)
+#define SUBPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
+
+#define SUBSSrr(RS, RD)                        _SSESSrr(X86_SSE_SUB, RS, RD)
+#define SUBSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
+#define SUBSDrr(RS, RD)                        _SSESDrr(X86_SSE_SUB, RS, RD)
+#define SUBSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
+
+#define XORPSrr(RS, RD)                        _SSEPSrr(X86_SSE_XOR, RS, RD)
+#define XORPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_XOR, MD, MB, MI, MS, 
RD)
+#define XORPDrr(RS, RD)                        _SSEPDrr(X86_SSE_XOR, RS, RD)
+#define XORPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_XOR, MD, MB, MI, MS, 
RD)
+
+#define XORSSrr                                XORPSrr
+#define XORSSmr                                XORPSrr
+#define XORSDrr                                XORPDrr
+#define XORSDmr                                XORPDrr
+
+/* No prefixes here.  */
+#define COMISSrr(RS, RD)               _SSEPSrr(X86_SSE_COMI, RS, RD)
+#define COMISSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_COMI, MD, MB, MI, MS, 
RD)
+#define COMISDrr(RS, RD)               _SSEPDrr(X86_SSE_COMI, RS, RD)
+#define COMISDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_COMI, MD, MB, MI, MS, 
RD)
+
+/* No prefixes here.  */
+#define UCOMISSrr(RS, RD)              _SSEPSrr(X86_SSE_UCOMI, RS, RD)
+#define UCOMISSmr(MD, MB, MI, MS, RD)  _SSEPSmr(X86_SSE_UCOMI, MD, MB, MI, MS, 
RD)
+#define UCOMISDrr(RS, RD)              _SSEPDrr(X86_SSE_UCOMI, RS, RD)
+#define UCOMISDmr(MD, MB, MI, MS, RD)  _SSEPDmr(X86_SSE_UCOMI, MD, MB, MI, MS, 
RD)
+
+#define MOVSSrr(RS, RD)                        _SSESSrr (X86_SSE_MOV, RS, RD)
+#define MOVSSmr(MD, MB, MI, MS, RD)    _SSESSmr (X86_SSE_MOV, MD, MB, MI, MS, 
RD)
+#define MOVSSrm(RS, MD, MB, MI, MS)    _SSESS1rm(X86_SSE_MOV, RS, MD, MB, MI, 
MS)
+
+#define MOVSDrr(RS, RD)                        _SSESDrr (X86_SSE_MOV, RS, RD)
+#define MOVSDmr(MD, MB, MI, MS, RD)    _SSESDmr (X86_SSE_MOV, MD, MB, MI, MS, 
RD)
+#define MOVSDrm(RS, MD, MB, MI, MS)    _SSESD1rm(X86_SSE_MOV, RS, MD, MB, MI, 
MS)
+
+#define MOVAPSrr(RS, RD)               _SSEPSrr (X86_SSE_MOVA, RS, RD)
+#define MOVAPSmr(MD, MB, MI, MS, RD)   _SSEPSmr (X86_SSE_MOVA, MD, MB, MI, MS, 
RD)
+#define MOVAPSrm(RS, MD, MB, MI, MS)   _SSEPS1rm(X86_SSE_MOVA, RS, MD, MB, MI, 
MS)
+
+#define MOVAPDrr(RS, RD)               _SSEPDrr (X86_SSE_MOVA, RS, RD)
+#define MOVAPDmr(MD, MB, MI, MS, RD)   _SSEPDmr (X86_SSE_MOVA, MD, MB, MI, MS, 
RD)
+#define MOVAPDrm(RS, MD, MB, MI, MS)   _SSEPD1rm(X86_SSE_MOVA, RS, MD, MB, MI, 
MS)
+
+#define CVTPS2PIrr(RS, RD)             __SSELrr(      X86_SSE_CVTSI, RS,_rX, 
RD,_rM)
+#define CVTPS2PImr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTSI, MD, MB, 
MI, MS, RD,_rM)
+#define CVTPD2PIrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTSI, RS,_rX, 
RD,_rM)
+#define CVTPD2PImr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTSI, MD, MB, 
MI, MS, RD,_rM)
+
+#define CVTPI2PSrr(RS, RD)             __SSELrr(      X86_SSE_CVTIS, RS,_rM, 
RD,_rX)
+#define CVTPI2PSmr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTIS, MD, MB, 
MI, MS, RD,_rX)
+#define CVTPI2PDrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTIS, RS,_rM, 
RD,_rX)
+#define CVTPI2PDmr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTIS, MD, MB, 
MI, MS, RD,_rX)
+
+#define CVTPS2PDrr(RS, RD)             __SSELrr(      X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
+#define CVTPS2PDmr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
+#define CVTPD2PSrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
+#define CVTPD2PSmr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
+
+#define CVTSS2SDrr(RS, RD)              _SSELrr(0xf3, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
+#define CVTSS2SDmr(MD, MB, MI, MS, RD)  _SSELmr(0xf3, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
+#define CVTSD2SSrr(RS, RD)              _SSELrr(0xf2, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
+#define CVTSD2SSmr(MD, MB, MI, MS, RD)  _SSELmr(0xf2, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
+
+#define CVTSS2SILrr(RS, RD)             _SSELrr(0xf3, X86_SSE_CVTSI, RS,_rX, 
RD,_r4)
+#define CVTSS2SILmr(MD, MB, MI, MS, RD)         _SSELmr(0xf3, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r4)
+#define CVTSD2SILrr(RS, RD)             _SSELrr(0xf2, X86_SSE_CVTSI, RS,_rX, 
RD,_r4)
+#define CVTSD2SILmr(MD, MB, MI, MS, RD)         _SSELmr(0xf2, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r4)
+
+#define CVTSI2SSLrr(RS, RD)             _SSELrr(0xf3, X86_SSE_CVTIS, RS,_r4, 
RD,_rX)
+#define CVTSI2SSLmr(MD, MB, MI, MS, RD)         _SSELmr(0xf3, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
+#define CVTSI2SDLrr(RS, RD)             _SSELrr(0xf2, X86_SSE_CVTIS, RS,_r4, 
RD,_rX)
+#define CVTSI2SDLmr(MD, MB, MI, MS, RD)         _SSELmr(0xf2, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
+
+#define MOVDLXrr(RS, RD)                _SSELrr(0x66, 0x6e, RS,_r4, RD,_rX)
+#define MOVDLXmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x6e, MD, MB, MI, MS, 
RD,_rX)
+
+#define MOVDXLrr(RS, RD)                _SSELrr(0x66, 0x7e, RS,_rX, RD,_r4)
+#define MOVDXLrm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x7e, RS,_rX, MD, MB, 
MI, MS)
+
+#define MOVDLMrr(RS, RD)               __SSELrr(      0x6e, RS,_r4, RD,_rM)
+#define MOVDLMmr(MD, MB, MI, MS, RD)   __SSELmr(      0x6e, MD, MB, MI, MS, 
RD,_rM)
+
+#define MOVDMLrr(RS, RD)               __SSELrr(      0x7e, RS,_rM, RD,_r4)
+#define MOVDMLrm(RS, MD, MB, MI, MS)   __SSELrm(      0x7e, RS,_rM, MD, MB, 
MI, MS)
+
+#define MOVDQ2Qrr(RS, RD)               _SSELrr(0xf2, X86_SSE_MOV2, RS,_rX, 
RD,_rM)
+#define MOVQ2DQrr(RS, RD)               _SSELrr(0xf3, X86_SSE_MOV2, RS,_rM, 
RD,_rX)
+#define MOVHLPSrr(RS, RD)              __SSELrr(      X86_SSE_MOVLP, RS,_rX, 
RD,_rX)
+#define MOVLHPSrr(RS, RD)              __SSELrr(      X86_SSE_MOVHP, RS,_rX, 
RD,_rX)
+
+#define MOVDQArr(RS, RD)                _SSELrr(0x66, 0x6f, RS,_rX, RD,_rX)
+#define MOVDQAmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x6f, MD, MB, MI, MS, 
RD,_rX)
+#define MOVDQArm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x7f, RS,_rX, MD, MB, 
MI, MS)
+
+#define MOVDQUrr(RS, RD)                _SSELrr(0xf3, 0x6f, RS,_rX, RD,_rX)
+#define MOVDQUmr(MD, MB, MI, MS, RD)    _SSELmr(0xf3, 0x6f, MD, MB, MI, MS, 
RD,_rX)
+#define MOVDQUrm(RS, MD, MB, MI, MS)    _SSELrm(0xf3, 0x7f, RS,_rX, MD, MB, 
MI, MS)
+
+#define MOVHPDmr(MD, MB, MI, MS, RD)    _SSELmr (0x66, X86_SSE_MOVHP, MD, MB, 
MI, MS, RD,_rX)
+#define MOVHPDrm(RS, MD, MB, MI, MS)    _SSEL1rm(0x66, X86_SSE_MOVHP, RS,_rX, 
MD, MB, MI, MS)
+#define MOVHPSmr(MD, MB, MI, MS, RD)   __SSELmr (      X86_SSE_MOVHP, MD, MB, 
MI, MS, RD,_rX)
+#define MOVHPSrm(RS, MD, MB, MI, MS)   __SSEL1rm(      X86_SSE_MOVHP, RS,_rX, 
MD, MB, MI, MS)
+
+#define MOVLPDmr(MD, MB, MI, MS, RD)    _SSELmr (0x66, X86_SSE_MOVLP, MD, MB, 
MI, MS, RD,_rX)
+#define MOVLPDrm(RS, MD, MB, MI, MS)    _SSEL1rm(0x66, X86_SSE_MOVLP, RS,_rX, 
MD, MB, MI, MS)
+#define MOVLPSmr(MD, MB, MI, MS, RD)   __SSELmr (      X86_SSE_MOVLP, MD, MB, 
MI, MS, RD,_rX)
+#define MOVLPSrm(RS, MD, MB, MI, MS)   __SSEL1rm(      X86_SSE_MOVLP, RS,_rX, 
MD, MB, MI, MS)
+
 /*** References:                                                               
                */
 /*                                                                             
                */
 /* [1] "Intel Architecture Software Developer's Manual Volume 1: Basic 
Architecture",          */
diff --git a/lightning/i386/core-64.h b/lightning/i386/core-64.h
index e095091..922cd26 100644
--- a/lightning/i386/core-64.h
+++ b/lightning/i386/core-64.h
@@ -43,6 +43,8 @@
 
 struct jit_local_state {
   int   long_jumps;
+  int   nextarg_getfp;
+  int   nextarg_putfp;
   int   nextarg_geti;
   int  argssize;
   int   alloca_offset;
@@ -63,13 +65,19 @@ struct jit_local_state {
   jit_allocai_internal ((n), (_jitl.alloca_slack - (n)) & 15)
 
 /* 3-parameter operation */
-#define jit_qopr_(d, s1, s2, op1d, op2d)                               \
-       ( (s2 == d) ? op1d :                                            \
-         (  ((s1 == d) ? (void)0 : (void)MOVQrr(s1, d)), op2d )        \
+#define jit_qopr_(d, s1, s2, op1d, op2d)                                       
\
+       ( ((s2) == (d)) ? op1d :                                                
\
+         (  (((s1) == (d)) ? (void)0 : (void)MOVQrr((s1), (d))), op2d )        
\
        )
 
-/* 3-parameter operation, with immediate */
-#define jit_qop_(d, s1, op2d)                          \
+/* 3-parameter operation, with immediate. TODO: fix the case where immediate
+   does not fit! */
+#define jit_qop_small(d, s1, op2d)                                     \
+       (((s1) == (d)) ? op2d : (MOVQrr((s1), (d)), op2d))
+#define jit_qop_(d, s1, is, op2d, op2i)                                        
\
+       (_s32P((long)(is))                                              \
+        ? jit_qop_small ((d), (s1), (op2d))                            \
+        : (MOVQrr ((is), JIT_REXTMP), jit_qop_small ((d), (s1), (op2i))))
 
 #define jit_bra_qr(s1, s2, op)         (CMPQrr(s2, s1), op, _jit.x.pc)
 #define _jit_bra_l(rs, is, op)         (CMPQir(is, rs), op, _jit.x.pc)
@@ -88,21 +96,19 @@ struct jit_local_state {
 
 #define jit_addi_l(d, rs, is)  jit_opi_((d), (rs),       ADDQir((is), (d)),    
                LEAQmr((is), (rs), 0, 0, (d))  )
 #define jit_addr_l(d, s1, s2)  jit_opo_((d), (s1), (s2), ADDQrr((s2), (d)), 
ADDQrr((s1), (d)), LEAQmr(0, (s1), (s2), 1, (d))  )
-#define jit_andi_l(d, rs, is)  jit_qop_ ((d), (rs),       ANDQir((is), (d))    
               )
+#define jit_andi_l(d, rs, is)  jit_qop_ ((d), (rs), (is), ANDQir((is), (d)), 
ANDQrr(JIT_REXTMP, (d)))
 #define jit_andr_l(d, s1, s2)  jit_qopr_((d), (s1), (s2), ANDQrr((s1), (d)), 
ANDQrr((s2), (d)) )
 #define jit_orr_l(d, s1, s2)   jit_qopr_((d), (s1), (s2),  ORQrr((s1), (d)),  
ORQrr((s2), (d)) )
 #define jit_subr_l(d, s1, s2)  jit_qopr_((d), (s1), (s2), (SUBQrr((s1), (d)), 
NEGQr(d)),       SUBQrr((s2), (d))              )
 #define jit_xorr_l(d, s1, s2)  jit_qopr_((d), (s1), (s2), XORQrr((s1), (d)), 
XORQrr((s2), (d)) )
 
 /* These can sometimes use byte or word versions! */
-#define jit_ori_i(d, rs, is)   jit_op_ ((d), (rs),        jit_reduce(OR, (is), 
(d))           )
-#define jit_xori_i(d, rs, is)  jit_op_ ((d), (rs),        jit_reduce(XOR, 
(is), (d))          )
-#define jit_ori_l(d, rs, is)   jit_qop_ ((d), (rs),        jit_reduceQ(OR, 
(is), (d))         )
-#define jit_xori_l(d, rs, is)  jit_qop_ ((d), (rs),        jit_reduceQ(XOR, 
(is), (d))        )
-
-#define jit_lshi_l(d, rs, is)  ((is) <= 3 ?   LEAQmr(0, 0, (rs), 1 << (is), 
(d))   :   jit_qop_ ((d), (rs), SHLQir((is), (d)) ))
-#define jit_rshi_l(d, rs, is)                                                  
        jit_qop_ ((d), (rs), SARQir((is), (d))  )
-#define jit_rshi_ul(d, rs, is)                                                 
        jit_qop_ ((d), (rs), SHRQir((is), (d))  )
+#define jit_ori_l(d, rs, is)   jit_qop_ ((d), (rs),        jit_reduceQ(OR, 
(is), (d)), ORQrr(JIT_REXTMP, (d))         )
+#define jit_xori_l(d, rs, is)  jit_qop_ ((d), (rs),        jit_reduceQ(XOR, 
(is), (d)), XORQrr(JIT_REXTMP, (d))        )
+
+#define jit_lshi_l(d, rs, is)  ((is) <= 3 ?   LEAQmr(0, 0, (rs), 1 << (is), 
(d))   :   jit_qop_small ((d), (rs), SHLQir((is), (d)) ))
+#define jit_rshi_l(d, rs, is)                                                  
        jit_qop_small ((d), (rs), SARQir((is), (d))  )
+#define jit_rshi_ul(d, rs, is)                                                 
        jit_qop_small ((d), (rs), SHRQir((is), (d))  )
 #define jit_lshr_l(d, r1, r2)  jit_replace((r1), (r2), _ECX,                   
        jit_qop_ ((d), (r1), SHLQrr(_CL,  (d)) ))
 #define jit_rshr_l(d, r1, r2)  jit_replace((r1), (r2), _ECX,                   
        jit_qop_ ((d), (r1), SARQrr(_CL,  (d)) ))
 #define jit_rshr_ul(d, r1, r2) jit_replace((r1), (r2), _ECX,                   
        jit_qop_ ((d), (r1), SHRQrr(_CL,  (d)) ))
@@ -112,7 +118,7 @@ struct jit_local_state {
 #define jit_popr_i(rs)         POPQr(rs)
 
 #define jit_base_prolog() (PUSHQr(_EBX), PUSHQr(_R12), PUSHQr(_R13), 
PUSHQr(_EBP), MOVQrr(_ESP, _EBP))
-#define jit_prolog(n) (_jitl.nextarg_geti = 0, _jitl.alloca_offset = 0, 
jit_base_prolog())
+#define jit_prolog(n) (_jitl.nextarg_getfp = _jitl.nextarg_geti = 0, 
_jitl.alloca_offset = 0, jit_base_prolog())
 
 /* Stack isn't used for arguments: */
 #define jit_prepare_i(ni)      (_jitl.argssize = 0)
@@ -181,17 +187,19 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX 
};
 #define jit_ret() ((_jitl.alloca_offset < 0 ? LEAVE_() : POPQr(_EBP)), 
POPQr(_R13), POPQr(_R12), POPQr(_EBX), RET_())
 
 #define _jit_ldi_l(d, is)              MOVQmr((is), 0,    0,    0,  (d))
+#define _jit_ldxi_l(d, rs, is)         MOVQmr((is), (rs), 0,    0,  (d))
 #define jit_ldr_l(d, rs)               MOVQmr(0,    (rs), 0,    0,  (d))
 #define jit_ldxr_l(d, s1, s2)          MOVQmr(0,    (s1), (s2), 1,  (d))
-#define jit_ldxi_l(d, rs, is)          MOVQmr((is), (rs), 0,    0,  (d))
 
 #define _jit_sti_l(id, rs)             MOVQrm((rs), (id), 0,    0,    0)
+#define _jit_stxi_l(id, rd, rs)                MOVQrm((rs), (id), (rd), 0,    
0)
 #define jit_str_l(rd, rs)              MOVQrm((rs), 0,    (rd), 0,    0)
 #define jit_stxr_l(d1, d2, rs)         MOVQrm((rs), 0,    (d1), (d2), 1)
-#define jit_stxi_l(id, rd, rs)         MOVQrm((rs), (id), (rd), 0,    0)
 
-#define jit_ldi_l(d, is)               (_u32P((long)(is)) ? _jit_ldi_l((d), 
(is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldr_l(JIT_REXTMP)))
-#define jit_sti_l(id, rs)              (_u32P((long)(id)) ? _jit_sti_l(id, rs) 
: (jit_movi_l(JIT_REXTMP, id), jit_str_l (JIT_REXTMP, (rs))))
+#define jit_ldi_l(d, is)               (_u32P((long)(is)) ? _jit_ldi_l((d), 
(is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldr_l((d), JIT_REXTMP)))
+#define jit_sti_l(id, rs)              (_u32P((long)(id)) ? _jit_sti_l((id), 
(rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_str_l (JIT_REXTMP, (rs))))
+#define jit_ldxi_l(d, rs, is)          (_u32P((long)(is)) ? _jit_ldxi_l((d), 
(rs), (is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldxr_l((d), (rs), JIT_REXTMP)))
+#define jit_stxi_l(id, rd, rs)         (_u32P((long)(id)) ? _jit_stxi_l((id), 
(rd), (rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_stxr_l (JIT_REXTMP, (rd), 
(rs))))
 
 /* Memory */
 
diff --git a/lightning/i386/core-i386.h b/lightning/i386/core-i386.h
index 2a9a8c2..24d12b5 100644
--- a/lightning/i386/core-i386.h
+++ b/lightning/i386/core-i386.h
@@ -244,13 +244,8 @@
 #define jit_rshr_ui(d, r1, r2) jit_replace((r1), (r2), _ECX,                   
        jit_op_ ((d), (r1), SHRLrr(_CL,  (d)) ))
 
 /* Stack */
-#define jit_prepare_f(nf)      (_jitl.argssize += (nf))
-#define jit_prepare_d(nd)      (_jitl.argssize += 2 * (nd))
 #define jit_retval_i(rd)       ((void)jit_movr_i ((rd), _EAX))
 
-#define        jit_arg_f()             ((_jitl.framesize += sizeof(float)) - 
sizeof(float))
-#define        jit_arg_d()             ((_jitl.framesize += sizeof(double)) - 
sizeof(double))
-
 /* Unary */
 #define jit_negr_i(d, rs)      jit_opi_((d), (rs), NEGLr(d), (XORLrr((d), 
(d)), SUBLrr((rs), (d))) )
 
diff --git a/lightning/i386/fp-32.h b/lightning/i386/fp-32.h
index 31a1d3d..1ee56db 100644
--- a/lightning/i386/fp-32.h
+++ b/lightning/i386/fp-32.h
@@ -346,4 +346,9 @@ union jit_double_imm {
                         _OO(0xd9f1))                   /* fyl2x */
 #endif
 
+#define jit_prepare_f(nf)       (_jitl.argssize += (nf))
+#define jit_prepare_d(nd)       (_jitl.argssize += 2 * (nd))
+#define jit_arg_f()             ((_jitl.framesize += sizeof(float)) - 
sizeof(float))
+#define jit_arg_d()             ((_jitl.framesize += sizeof(double)) - 
sizeof(double))
+
 #endif /* __lightning_asm_h */
diff --git a/lightning/i386/fp-64.h b/lightning/i386/fp-64.h
index 19e73dc..74cdfec 100644
--- a/lightning/i386/fp-64.h
+++ b/lightning/i386/fp-64.h
@@ -33,6 +33,264 @@
 #ifndef __lightning_fp_h
 #define __lightning_fp_h
 
-#warning SSE math not yet supported
+#include <float.h>
+
+#define JIT_FPR_NUM    9
+#define JIT_FPRET      _XMM0
+#define JIT_FPR(i)     (_XMM7 + (i))
+#define JIT_FPTMP      _XMM6
+
+/* Either use a temporary register that is finally AND/OR/XORed with RS = RD,
+   or use RD as the temporary register and do the AND/OR/XOR with RS.  */
+#define jit_unop_tmp(rd, rs, op)               \
+       ( (rs) == (rd)                          \
+        ? op((rd), JIT_FPTMP, JIT_FPTMP)       \
+        : op((rd), (rd), (rs)))
+
+#define jit_unop_f(rd, rs, op)                                         \
+       ((rs) == (rd) ? op((rd)) : (MOVSSrr ((rs), (rd)), op((rd))))
+
+#define jit_unop_d(rd, rs, op)                                 \
+       ((rs) == (rd) ? op((rd)) : (MOVSDrr ((rs), (rd)), op((rd))))
+
+#define jit_3opc_f(rd, s1, s2, op)                             \
+       ( (s1) == (rd) ? op((s2), (rd))                         \
+         : ((s2) == (rd) ? op((s1), (rd))                      \
+            : (MOVSSrr ((s1), (rd)), op((s2), (rd)))))
+
+#define jit_3opc_d(rd, s1, s2, op)                             \
+       ( (s1) == (rd) ? op((s2), (rd))                         \
+         : ((s2) == (rd) ? op((s1), (rd))                      \
+            : (MOVSDrr ((s1), (rd)), op((s2), (rd)))))
+
+#define jit_3op_f(rd, s1, s2, op)                              \
+       ( (s1) == (rd) ? op((s2), (rd))                         \
+         : ((s2) == (rd)                                       \
+            ? (MOVSSrr ((rd), JIT_FPTMP), MOVSSrr ((s1), (rd)), op(JIT_FPTMP, 
(rd)))   \
+            : (MOVSSrr ((s1), (rd)), op((s2), (rd)))))
+
+#define jit_3op_d(rd, s1, s2, op)                              \
+       ( (s1) == (rd) ? op((s2), (rd))                         \
+         : ((s2) == (rd)                                       \
+            ? (MOVSDrr ((rd), JIT_FPTMP), MOVSDrr ((s1), (rd)), op(JIT_FPTMP, 
(rd)))   \
+            : (MOVSDrr ((s1), (rd)), op((s2), (rd)))))
+
+#define jit_addr_f(rd,s1,s2)   jit_3opc_f((rd), (s1), (s2), ADDSSrr)
+#define jit_subr_f(rd,s1,s2)   jit_3op_f((rd), (s1), (s2), SUBSSrr)
+#define jit_mulr_f(rd,s1,s2)   jit_3opc_f((rd), (s1), (s2), MULSSrr)
+#define jit_divr_f(rd,s1,s2)   jit_3op_f((rd), (s1), (s2), DIVSSrr)
+
+#define jit_addr_d(rd,s1,s2)   jit_3opc_d((rd), (s1), (s2), ADDSDrr)
+#define jit_subr_d(rd,s1,s2)   jit_3op_d((rd), (s1), (s2), SUBSDrr)
+#define jit_mulr_d(rd,s1,s2)   jit_3opc_d((rd), (s1), (s2), MULSDrr)
+#define jit_divr_d(rd,s1,s2)   jit_3op_d((rd), (s1), (s2), DIVSDrr)
+
+#define jit_movr_f(rd,rs)      MOVSSrr((rs), (rd))
+#define jit_movr_d(rd,rs)      MOVSDrr((rs), (rd))
+
+/* either pcmpeqd %xmm7, %xmm7 / psrld $1, %xmm7 / andps %xmm7, %RD (if RS = 
RD)
+       or pcmpeqd %RD, %RD / psrld $1, %RD / andps %RS, %RD (if RS != RD) */
+#define _jit_abs_f(rd,cnst,rs)                                         \
+       (PCMPEQDrr((cnst), (cnst)), PSRLDir (1, (cnst)), ANDPSrr ((rs), (rd)))
+#define _jit_neg_f(rd,cnst,rs)                                         \
+       (PCMPEQDrr((cnst), (cnst)), PSLLDir (31, (cnst)), XORPSrr ((rs), (rd)))
+#define jit_abs_f(rd,rs)       jit_unop_tmp ((rd), (rs), _jit_abs_f)
+#define jit_neg_f(rd,rs)       jit_unop_tmp ((rd), (rs), _jit_neg_f)
+
+#define _jit_abs_d(rd,cnst,rs)                                         \
+       (PCMPEQDrr((cnst), (cnst)), PSRLQir (1, (cnst)), ANDPDrr ((rs), (rd)))
+#define _jit_neg_d(rd,cnst,rs)                                         \
+       (PCMPEQDrr((cnst), (cnst)), PSLLQir (63, (cnst)), XORPDrr ((rs), (rd)))
+#define jit_abs_d(rd,rs)       jit_unop_tmp ((rd), (rs), _jit_abs_d)
+#define jit_neg_d(rd,rs)       jit_unop_tmp ((rd), (rs), _jit_neg_d)
+
+#define jit_sqrt_d(rd,rs)      SQRTSDrr((rs), (rd))
+#define jit_sqrt_f(rd,rs)      SQRTSSrr((rs), (rd))
+
+#define _jit_ldi_f(d, is)               MOVSSmr((is), 0,    0,    0,  (d))
+#define _jit_ldxi_f(d, rs, is)          MOVSSmr((is), (rs), 0,    0,  (d))
+#define jit_ldr_f(d, rs)                MOVSSmr(0,    (rs), 0,    0,  (d))
+#define jit_ldxr_f(d, s1, s2)           MOVSSmr(0,    (s1), (s2), 1,  (d))
+
+#define _jit_sti_f(id, rs)              MOVSSrm((rs), (id), 0,    0,    0)
+#define _jit_stxi_f(id, rd, rs)         MOVSSrm((rs), (id), (rd), 0,    0)
+#define jit_str_f(rd, rs)               MOVSSrm((rs), 0,    (rd), 0,    0)
+#define jit_stxr_f(d1, d2, rs)          MOVSSrm((rs), 0,    (d1), (d2), 1)
+
+#define jit_ldi_f(d, is)                (_u32P((long)(is)) ? _jit_ldi_f((d), 
(is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldr_f((d), JIT_REXTMP)))
+#define jit_sti_f(id, rs)               (_u32P((long)(id)) ? _jit_sti_f((id), 
(rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_str_f (JIT_REXTMP, (rs))))
+#define jit_ldxi_f(d, rs, is)           (_u32P((long)(is)) ? _jit_ldxi_f((d), 
(rs), (is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldxr_f((d), (rs), JIT_REXTMP)))
+#define jit_stxi_f(id, rd, rs)          (_u32P((long)(id)) ? _jit_stxi_f((id), 
(rd), (rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_stxr_f (JIT_REXTMP, (rd), 
(rs))))
+
+#define _jit_ldi_d(d, is)               MOVSDmr((is), 0,    0,    0,  (d))
+#define _jit_ldxi_d(d, rs, is)          MOVSDmr((is), (rs), 0,    0,  (d))
+#define jit_ldr_d(d, rs)                MOVSDmr(0,    (rs), 0,    0,  (d))
+#define jit_ldxr_d(d, s1, s2)           MOVSDmr(0,    (s1), (s2), 1,  (d))
+
+#define _jit_sti_d(id, rs)              MOVSDrm((rs), (id), 0,    0,    0)
+#define _jit_stxi_d(id, rd, rs)         MOVSDrm((rs), (id), (rd), 0,    0)
+#define jit_str_d(rd, rs)               MOVSDrm((rs), 0,    (rd), 0,    0)
+#define jit_stxr_d(d1, d2, rs)          MOVSDrm((rs), 0,    (d1), (d2), 1)
+
+#define jit_ldi_d(d, is)                (_u32P((long)(is)) ? _jit_ldi_d((d), 
(is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldr_d((d), JIT_REXTMP)))
+#define jit_sti_d(id, rs)               (_u32P((long)(id)) ? _jit_sti_d((id), 
(rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_str_d (JIT_REXTMP, (rs))))
+#define jit_ldxi_d(d, rs, is)           (_u32P((long)(is)) ? _jit_ldxi_d((d), 
(rs), (is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldxr_d((d), (rs), JIT_REXTMP)))
+#define jit_stxi_d(id, rd, rs)          (_u32P((long)(id)) ? _jit_stxi_d((id), 
(rd), (rs)) : (jit_movi_l(JIT_REXTMP, (id)), jit_stxr_d (JIT_REXTMP, (rd), 
(rs))))
+
+
+#define jit_movi_f(rd,immf)                     \
+ ((immf) == 0.0 ? XORSSrr ((rd), (rd)) :                                       
      \
+        (_O (0x50),                            \
+         MOVLim (0x12345678L, 0, _ESP, 0, 0),                                 \
+         *((float *) (_jit.x.uc_pc - 4)) = (float) immf, \
+        jit_ldr_f((rd), _ESP),                 \
+        ADDLir(4, _ESP)))
+
+union jit_double_imm {
+  double d;
+  long l;
+};
+
+#define jit_movi_d(rd,immd)                                                    
       \
+ ((immd) == 0.0 ? XORSDrr ((rd), (rd)) :                                       
      \
+        (_O (0x50),                                                            
       \
+         MOVQir (0x123456789abcdef0L, _EAX),                                   
      \
+         ((union jit_double_imm *) (_jit.x.uc_pc - 8))->d = (double) immd,     
       \
+         _O (0x50), jit_ldr_d((rd), _ESP),                                     
       \
+         _O (0x58), _O (0x58)))
+
+#define jit_extr_i_d(rd, rs)   CVTSI2SDLrr((rs), (rd))
+#define jit_extr_i_f(rd, rs)   CVTSI2SSLrr((rs), (rd))
+#define jit_extr_l_d(rd, rs)   CVTSI2SDQrr((rs), (rd))
+#define jit_extr_l_f(rd, rs)   CVTSI2SSQrr((rs), (rd))
+#define jit_roundr_d_i(rd, rs) CVTSD2SILrr((rs), (rd))
+#define jit_roundr_f_i(rd, rs) CVTSS2SILrr((rs), (rd))
+#define jit_roundr_d_l(rd, rs) CVTSD2SIQrr((rs), (rd))
+#define jit_roundr_f_l(rd, rs) CVTSS2SIQrr((rs), (rd))
+
+
+#define jit_ceilr_f_i(rd, rs) do {                     \
+       jit_roundr_f_i ((rd), (rs));            \
+       jit_extr_i_f (JIT_FPTMP, (rd));                 \
+       UCOMISSrr ((rs), JIT_FPTMP);                    \
+       ADCLir (0, (rd));                               \
+  } while (0)
+
+#define jit_ceilr_d_i(rd, rs) do {                     \
+       jit_roundr_d_i ((rd), (rs));            \
+       jit_extr_i_d (JIT_FPTMP, (rd));                 \
+       UCOMISDrr ((rs), JIT_FPTMP);                    \
+       ADCLir (0, (rd));                               \
+  } while (0)
+
+#define jit_truncr_f_i(rd, rs) do {                    \
+       jit_roundr_f_i ((rd), (rs));                    \
+       jit_extr_i_f (JIT_FPTMP, (rd));                 \
+       TESTLrr ((rd), (rd));                           \
+       JSm (_jit.x.pc + 9);                            \
+       UCOMISSrr (JIT_FPTMP, (rs));                    \
+       SBBLir (0, (rd));                               \
+       JMPSm (_jit.x.pc + 7);                          \
+       UCOMISSrr ((rs), JIT_FPTMP);                    \
+       ADCLir (0, (rd));                               \
+  } while (0)
+
+#define jit_truncr_d_i(rd, rs) do {                    \
+       jit_roundr_d_i ((rd), (rs));            \
+       jit_extr_i_d (JIT_FPTMP, (rd));                 \
+       TESTLrr ((rd), (rd));                           \
+       JSm (_jit.x.pc + 9);                            \
+       UCOMISDrr (JIT_FPTMP, (rs));                    \
+       SBBLir (0, (rd));                               \
+       JMPSm (_jit.x.pc + 7);                          \
+       UCOMISDrr ((rs), JIT_FPTMP);                    \
+       ADCLir (0, (rd));                               \
+  } while (0)
+
+#define jit_floorr_f_i(rd, rs) do {                    \
+       jit_roundr_f_i ((rd), (rs));            \
+       jit_extr_i_f (JIT_FPTMP, (rd));                 \
+       UCOMISSrr (JIT_FPTMP, (rs));                    \
+       SBBLir (0, (rd));                               \
+  } while (0)
+
+#define jit_floorr_d_i(rd, rs) do {                    \
+       jit_roundr_d_i ((rd), (rs));            \
+       jit_extr_i_d (JIT_FPTMP, (rd));                 \
+       UCOMISDrr (JIT_FPTMP, (rs));                    \
+       SBBLir (0, (rd));                               \
+  } while (0)
+
+#define jit_bltr_f(d, s1, s2)            (UCOMISSrr ((s1), (s2)), JAm ((d)))
+#define jit_bler_f(d, s1, s2)            (UCOMISSrr ((s1), (s2)), JAEm ((d)))
+#define jit_beqr_f(d, s1, s2)            (UCOMISSrr ((s1), (s2)), _OO 
(0x7a06), JEm ((d)))
+#define jit_bner_f(d, s1, s2)            (UCOMISSrr ((s1), (s2)), _OO 
(0x7a03), _OO (0x7405), JMPm (((d)))) /* JP to JMP, JZ past JMP */
+#define jit_bger_f(d, s1, s2)            (UCOMISSrr ((s2), (s1)), JAEm ((d)))
+#define jit_bgtr_f(d, s1, s2)            (UCOMISSrr ((s2), (s1)), JAm ((d)))
+#define jit_bunltr_f(d, s1, s2)          (UCOMISSrr ((s2), (s1)), JNAEm ((d)))
+#define jit_bunler_f(d, s1, s2)          (UCOMISSrr ((s2), (s1)), JNAm ((d)))
+#define jit_buneqr_f(d, s1, s2)          (UCOMISSrr ((s1), (s2)), JEm ((d)))
+#define jit_bltgtr_f(d, s1, s2)          (UCOMISSrr ((s1), (s2)), JNEm ((d)))
+#define jit_bunger_f(d, s1, s2)          (UCOMISSrr ((s1), (s2)), JNAm ((d)))
+#define jit_bungtr_f(d, s1, s2)          (UCOMISSrr ((s1), (s2)), JNAEm ((d)))
+#define jit_bordr_f(d, s1, s2)           (UCOMISSrr ((s1), (s2)), JNPm ((d)))
+#define jit_bunordr_f(d, s1, s2)         (UCOMISSrr ((s1), (s2)), JPm ((d)))
+
+#define jit_bltr_d(d, s1, s2)            (UCOMISDrr ((s1), (s2)), JAm ((d)))
+#define jit_bler_d(d, s1, s2)            (UCOMISDrr ((s1), (s2)), JAEm ((d)))
+#define jit_beqr_d(d, s1, s2)            (UCOMISDrr ((s1), (s2)), _OO 
(0x7a06), JEm ((d)))
+#define jit_bner_d(d, s1, s2)            (UCOMISDrr ((s1), (s2)), _OO 
(0x7a03), _OO (0x7405), JMPm (((d)))) /* JP to JMP, JZ past JMP */ 
+#define jit_bger_d(d, s1, s2)            (UCOMISDrr ((s2), (s1)), JAEm ((d)))
+#define jit_bgtr_d(d, s1, s2)            (UCOMISDrr ((s2), (s1)), JAm ((d)))
+#define jit_bunltr_d(d, s1, s2)          (UCOMISDrr ((s2), (s1)), JNAEm ((d)))
+#define jit_bunler_d(d, s1, s2)          (UCOMISDrr ((s2), (s1)), JNAm ((d)))
+#define jit_buneqr_d(d, s1, s2)          (UCOMISDrr ((s1), (s2)), JEm ((d)))
+#define jit_bltgtr_d(d, s1, s2)          (UCOMISDrr ((s1), (s2)), JNEm ((d)))
+#define jit_bunger_d(d, s1, s2)          (UCOMISDrr ((s1), (s2)), JNAm ((d)))
+#define jit_bungtr_d(d, s1, s2)          (UCOMISDrr ((s1), (s2)), JNAEm ((d)))
+#define jit_bordr_d(d, s1, s2)           (UCOMISDrr ((s1), (s2)), JNPm ((d)))
+#define jit_bunordr_d(d, s1, s2)         (UCOMISDrr ((s1), (s2)), JPm ((d)))
+
+#define jit_ltr_f(d, s1, s2)            (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETAr (jit_reg8((d))))
+#define jit_ler_f(d, s1, s2)            (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETAEr (jit_reg8((d))))
+#define jit_eqr_f(d, s1, s2)            (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), _OO(0x7a03), SETEr (jit_reg8((d))))
+#define jit_ner_f(d, s1, s2)            (UCOMISSrr ((s1), (s2)), MOVLir (1, 
(d)), _OO(0x7a03), SETNEr (jit_reg8((d))))
+#define jit_ger_f(d, s1, s2)            (XORLrr ((d), (d)), UCOMISSrr ((s2), 
(s1)), SETAEr (jit_reg8((d))))
+#define jit_gtr_f(d, s1, s2)            (XORLrr ((d), (d)), UCOMISSrr ((s2), 
(s1)), SETAr (jit_reg8((d))))
+#define jit_unltr_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s2), 
(s1)), SETNAEr (jit_reg8((d))))
+#define jit_unler_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s2), 
(s1)), SETNAr (jit_reg8((d))))
+#define jit_uneqr_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETEr (jit_reg8((d))))
+#define jit_ltgtr_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETNEr (jit_reg8((d))))
+#define jit_unger_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETNAr (jit_reg8((d))))
+#define jit_ungtr_f(d, s1, s2)          (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETNAEr (jit_reg8((d))))
+#define jit_ordr_f(d, s1, s2)           (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETNPr (jit_reg8((d))))
+#define jit_unordr_f(d, s1, s2)         (XORLrr ((d), (d)), UCOMISSrr ((s1), 
(s2)), SETPr (jit_reg8((d))))
+
+#define jit_ltr_d(d, s1, s2)            (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETAr (jit_reg8((d))))
+#define jit_ler_d(d, s1, s2)            (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETAEr (jit_reg8((d))))
+#define jit_eqr_d(d, s1, s2)            (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), _OO(0x7a03), SETEr (jit_reg8((d))))
+#define jit_ner_d(d, s1, s2)            (UCOMISDrr ((s1), (s2)), MOVLir (1, 
(d)), _OO(0x7a03), SETNEr (jit_reg8((d))))
+#define jit_ger_d(d, s1, s2)            (XORLrr ((d), (d)), UCOMISDrr ((s2), 
(s1)), SETAEr (jit_reg8((d))))
+#define jit_gtr_d(d, s1, s2)            (XORLrr ((d), (d)), UCOMISDrr ((s2), 
(s1)), SETAr (jit_reg8((d))))
+#define jit_unltr_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s2), 
(s1)), SETNAEr (jit_reg8((d))))
+#define jit_unler_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s2), 
(s1)), SETNAr (jit_reg8((d))))
+#define jit_uneqr_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETEr (jit_reg8((d))))
+#define jit_ltgtr_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETNEr (jit_reg8((d))))
+#define jit_unger_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETNAr (jit_reg8((d))))
+#define jit_ungtr_d(d, s1, s2)          (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETNAEr (jit_reg8((d))))
+#define jit_ordr_d(d, s1, s2)           (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETNPr (jit_reg8((d))))
+#define jit_unordr_d(d, s1, s2)         (XORLrr ((d), (d)), UCOMISDrr ((s1), 
(s2)), SETPr (jit_reg8((d))))
+
+#define jit_prepare_f(num)              (_jitl.nextarg_putfp = _XMM0 + (num))
+#define jit_prepare_d(num)              (_jitl.nextarg_putfp = _XMM0 + (num))
+
+#define jit_arg_f()                     (_XMM0 + _jitl.nextarg_getfp++)
+#define jit_arg_d()                     (_XMM0 + _jitl.nextarg_getfp++)
+
+#define jit_getarg_f(rd, ofs)           (jit_movr_f ((rd), (ofs)))
+#define jit_getarg_d(rd, ofs)           (jit_movr_d ((rd), (ofs)))
+
+#define jit_pusharg_f(rs)               (--_jitl.nextarg_putfp, jit_movr_f 
(_jitl.nextarg_putfp, (rs)))
+#define jit_pusharg_d(rs)               (--_jitl.nextarg_putfp, jit_movr_d 
(_jitl.nextarg_putfp, (rs)))
 
 #endif /* __lightning_fp_h */
diff --git a/lightning/sparc/fp.h b/lightning/sparc/fp.h
index 5a34e7d..a11f2eb 100644
--- a/lightning/sparc/fp.h
+++ b/lightning/sparc/fp.h
@@ -60,8 +60,8 @@
 #define jit_abs_f(rd,rs)       FABSDrr((rs), (rd))
 #define jit_negr_f(rd,rs)      FNEGDrr((rs), (rd))
 #define jit_sqrt_f(rd,rs)      FSQRTDrr((rs), (rd))
-#define jit_extr_f_d(rs, rd)   FSTODrr((rs), (rd))
-#define jit_extr_d_f(rs, rd)   FDTOSrr((rs), (rd))
+#define jit_extr_f_d(rd, rs)   FSTODrr((rs), (rd))
+#define jit_extr_d_f(rd, rs)   FDTOSrr((rs), (rd))
 
 #define jit_movi_f(rd,immf)                      \
     do {                                         \



reply via email to

[Prev in Thread] Current Thread [Next in Thread]