guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 40/437: checkin rewrite of i386 assembler


From: Andy Wingo
Subject: [Guile-commits] 40/437: checkin rewrite of i386 assembler
Date: Mon, 2 Jul 2018 05:13:41 -0400 (EDT)

wingo pushed a commit to branch lightning
in repository guile.

commit 2534af6d2d8e5075cb58c7f36fec5aee2bc0bef3
Author: Paolo Bonzini <address@hidden>
Date:   Mon Nov 20 15:52:55 2006 +0000

    checkin rewrite of i386 assembler
    
    2006-11-20  Paolo Bonzini  <address@hidden>
    
        * lightning/i386/asm-i386.h: Check in rewrite from aranym.
        * lightning/i386/asm-32.h: Adjust.
        * lightning/i386/asm-64.h: Adjust.
        * lightning/i386/fp-32.h: Adjust.
    
        * lightning/i386/core-32.h: Adjust.  Add jit_{ld,ldx,st,stx}i*.
        * lightning/i386/core-64.h: Adjust.  Add jit_{ld,ldx,st,stx}i*.
        * lightning/i386/core-i386.h: Adjust. Remove these patterns.
    
    git-archimport-id: address@hidden/lightning--stable--1.2--patch-43
---
 AUTHORS                    |    3 +-
 ChangeLog                  |   11 +
 lightning/i386/asm-32.h    |   33 +-
 lightning/i386/asm-64.h    |  185 ++--
 lightning/i386/asm-i386.h  | 2100 ++++++++++++++++++++++++--------------------
 lightning/i386/core-32.h   |   34 +-
 lightning/i386/core-64.h   |  103 ++-
 lightning/i386/core-i386.h |  227 +++--
 lightning/i386/fp-32.h     |    4 +-
 tests/ldxi.c               |    2 +-
 10 files changed, 1455 insertions(+), 1247 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index cec7e89..2272038 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,5 +1,6 @@
 Paolo Bonzini <address@hidden>
-i386 and PPC assemblers by Ian Piumarta <address@hidden>
+PPC assembler by Ian Piumarta <address@hidden>
+i386 assembler by Ian Piumarta <address@hidden> and Milan Jurik
 x86-64 backend by Matthew Flatt <address@hidden>
 Major PPC contributions by Laurent Michel <address@hidden>
 Major SPARC contributions by Ludovic Courtes <address@hidden>
diff --git a/ChangeLog b/ChangeLog
index 2e6dbd4..aa98807 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2006-11-20  Paolo Bonzini  <address@hidden>
 
+       * lightning/i386/asm-i386.h: Check in rewrite from aranym.
+       * lightning/i386/asm-32.h: Adjust.
+       * lightning/i386/asm-64.h: Adjust.
+       * lightning/i386/fp-32.h: Adjust.
+
+       * lightning/i386/core-32.h: Adjust.  Add jit_{ld,ldx,st,stx}i*.
+       * lightning/i386/core-64.h: Adjust.  Add jit_{ld,ldx,st,stx}i*.
+       * lightning/i386/core-i386.h: Adjust. Remove these patterns.
+
+2006-11-20  Paolo Bonzini  <address@hidden>
+
        * lightning/i386/asm-i386.h: Merge 64-bit cleanliness changes from mzscheme.
        Add SSE.
        * lightning/i386/asm-64.h: Likewise.
diff --git a/lightning/i386/asm-32.h b/lightning/i386/asm-32.h
index 2854a48..c695091 100644
--- a/lightning/i386/asm-32.h
+++ b/lightning/i386/asm-32.h
@@ -34,6 +34,8 @@
 #ifndef __lightning_asm_h
 #define __lightning_asm_h
 
+#ifndef LIGHTNING_DEBUG
+
 /*     OPCODE  + i             = immediate operand
  *             + r             = register operand
  *             + m             = memory operand (disp,base,index,scale)
@@ -42,20 +44,31 @@
 
 #include "asm-i386.h"
 
-#ifndef LIGHTNING_DEBUG
+#define _rA(R)          _r4(R)
 
-#define _r_D(  R, D      )     (_Mrm(_b00,_rN(R),_b101 )                       
     ,_jit_I((long)(D)))
+/* Use RIP-addressing in 64-bit mode, if possible */
+#define _r_X(   R, D,B,I,S,O)  (_r0P(I) ? (_r0P(B)    ? _r_D   (R,D                ) : \
+                                          (_rsp12P(B) ? _r_DBIS(R,D,_ESP,_ESP,1)   : \
+                                                        _r_DB  (R,D,     B       )))  : \
+                                (_r0P(B)              ? _r_4IS (R,D,            I,S)   : \
+                                (!_rspP(I)            ? _r_DBIS(R,D,     B,     I,S)   : \
+                                                        JITFAIL("illegal index register: %esp"))))
+#define _m32only(X)            (X)
+#define _m64only(X)            JITFAIL("invalid instruction in 32-bit mode")
+#define _m64(X)                        ((void)0)
 
-#define CALLm(D,B,I,S)                 ((_r0P(B) && _r0P(I)) ? _O_D32  (0xe8   
                ,(long)(D)              ) : \
-                                                               
JITFAIL("illegal mode in direct jump"))
+#define CALLsr(R)                      CALLLsr(R)
+#define JMPsr(R)                       JMPLsr(R)
 
-#define JCCim(CC,D,B,I,S)              ((_r0P(B) && _r0P(I)) ? _OO_D32 
(0x0f80|(CC)            ,(long)(D)              ) : \
-                                                               
JITFAIL("illegal mode in conditional jump"))
+#define _AH            0x24
+#define _CH            0x25
+#define _DH            0x26
+#define _BH            0x27
 
-#define JMPm(D,B,I,S)                  ((_r0P(B) && _r0P(I)) ? _O_D32  (0xe9   
                ,(long)(D)              ) : \
-                                                               
JITFAIL("illegal mode in direct jump"))
+#define DECWr(RD)      (_d16(),        _Or             (0x48,_r2(RD)                                                   ))
+#define DECLr(RD)                      _Or             (0x48,_r4(RD)                                                   )
+#define INCWr(RD)      (_d16(),        _Or             (0x40,_r2(RD)                                                   ))
+#define INCLr(RD)                      _Or             (0x40,_r4(RD)                                                   )
 
 #endif
 #endif /* __lightning_asm_h */
-
-
diff --git a/lightning/i386/asm-64.h b/lightning/i386/asm-64.h
index 9f0f979..e3a6d78 100644
--- a/lightning/i386/asm-64.h
+++ b/lightning/i386/asm-64.h
@@ -34,6 +34,8 @@
 #ifndef __lightning_asm_h
 #define __lightning_asm_h
 
+#ifndef LIGHTNING_DEBUG
+
 #include "asm-i386.h"
 
 /*     OPCODE  + i             = immediate operand
@@ -43,101 +45,102 @@
  */
 
 
-#ifndef LIGHTNING_DEBUG
-#define _R12            0x4C
-#define _R13            0x4D
-#define JIT_CALLTMPSTART 0x48
-#define JIT_REXTMP       0x4B
-
-#define _r_8B( R, D,B    )     (_qMrm(_b10,_rN(R),_r8(B))                      
     ,_jit_I((long)(D)))
-#define _r_8IS( R, D,I,S)      (_qMrm(_b00,_rN(R),_b100 
),_SIB(_SCL(S),_r8(I),_b101 ),_jit_I((long)(D)))
-#define _r_8BIS(R, D,B,I,S)    (_qMrm(_b10,_rN(R),_b100 
),_SIB(_SCL(S),_r8(I),_r8(B)),_jit_I((long)(D)))
-
-#define _qMrm(Md,R,M)  _jit_B((_M(Md)<<6)|(_r((R & 0x7))<<3)|_m((M & 0x7)))
-#define _r_D(  R, D      )     (_Mrm(_b00,_rN(R),_b100 ),_SIB(0,_b100,_b101)   
     ,_jit_I((long)(D)))
-#define _r_Q(  R, D      )     (_qMrm(_b00,_rN(R),_b100 ),_SIB(0,_b100,_b101)  
      ,_jit_I((long)(D)))
-
-#define  _REX(R,X,B)                              ( 
_jit_B(0x48|((R&0x8)>>1)|((X&0x8)>>2)|((B&0x8)>>3)) )
-#define  _qO(       OP, R,X,B                  )  ( _REX(R,X,B), _jit_B(  OP   
) )
-#define  _qOr(      OP,R                       )  ( _REX(0,0,R), _jit_B( 
(OP)|_r(R&0x7))                                 )
-#define  _qOs(      OP, B, R, M                )  ( _REX(0, M, R), _Os(OP, B) )
-#define         _qOr_Q(     OP,R                   ,Q  )  (       _qOr     (  
OP,R)                          ,_jit_L(Q)          )
-#define         _qO_Mrm(    OP  ,MO,R,M                )  (        _qO     (  
OP,R,0,M),_qMrm(MO,R,M       )             )
-#define         _qO_Mrm_L(  OP  ,MO,R,M            ,L  )  (       _qO      (  
OP,R,0,M),_qMrm(MO,R,M       ) ,_jit_I(L)          )
-#define         _qOs_Mrm_sL(OP  ,MO,R,M            ,L  )  (       _qOs     (  
OP,L,R,M),_qMrm(MO,R,M       ),_sL(L)      )
-#define         _qO_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (       _qO      (  
OP,R,0,MS),_qr_X(R,MD,MB,MI,MS)            )
-#define         _qO_r_XB(   OP     ,R  ,MD,MB,MI,MS    )  (       _qO      (  
OP,R,0,MB),_qr_X(R,MD,MB,MI,MS)            )
-
-
-#define ADDQrr(RS, RD)                 _qO_Mrm         (0x01           
,_b11,_r8(RS),_r8(RD)                           )
-#define ADDQir(IM, RD)                 _qOs_Mrm_sL     (0x81           
,_b11,_b000  ,_r8(RD)                   ,IM     )
-
-#define ANDQrr(RS, RD)                 _qO_Mrm         (0x21           
,_b11,_r8(RS),_r8(RD)                           )
-#define ANDQir(IM, RD)                 _qOs_Mrm_sL     (0x81           
,_b11,_b100  ,_r8(RD)                   ,IM     )
-
-#define CALLm(D,B,I,S)                 (MOVQir((D), JIT_REXTMP), 
CALLQsr(JIT_REXTMP))
-
-#define CALLsr(R)                      _O_Mrm  (0xff   ,_b11,_b010,_r4(R)      
                )
-#define CALLQsr(R)                       _qO_Mrm (0xff ,_b11,_b010,_r8(R))
-
-#define CMPQrr(RS, RD)                 _qO_Mrm         (0x39           
,_b11,_r8(RS),_r8(RD)                           )
-#define CMPQir(IM, RD)                 _qO_Mrm_L       (0x81           
,_b11,_b111  ,_r8(RD)                   ,IM     )
-
-#define JCCim(CC,D,B,I,S) (!_jitl.long_jumps \
-                               ? _OO_D32(0x0f80|(CC), (long)(D) ) \
-                               : (_O_D8(0x71^(CC), _jit_UL(_jit.x.pc) + 13), 
JMPm((long)D, 0, 0, 0)))
-
-#define JMPm(D,B,I,S) (!_jitl.long_jumps \
-                       ? _O_D32(0xe9, (long)(D)) \
-                       : (MOVQir((D), JIT_REXTMP), 
_qO_Mrm(0xff,_b11,_b100,_r8(JIT_REXTMP))))
-
-#define LEAQmr(MD, MB, MI, MS, RD)     _qO_r_X         (0x8d                
,_r8(RD)           ,MD,MB,MI,MS            )
-#define MOVQmr(MD, MB, MI, MS, RD)     _qO_r_X         (0x8b                
,_r8(RD)           ,MD,MB,MI,MS            )
-#define MOVQrm(RS, MD, MB, MI, MS)     _qO_r_X         (0x89                
,_r8(RS)           ,MD,MB,MI,MS            )
-#define MOVQrQm(RS, MD, MB, MI, MS)    _qO_r_XB        (0x89                
,_r8(RS)           ,MD,MB,MI,MS            )
-#define MOVQir(IM,  R)                 _qOr_Q          (0xb8,_r8(R)            
        ,IM     )
-
-#define MOVQrr(RS, RD)                 _qO_Mrm         (0x89           
,_b11,_r8(RS),_r8(RD)                           )
-
-#define NEGQr(RD)                      _qO_Mrm         (0xf7           
,_b11,_b011  ,_r8(RD)                           )
-
-#define ORQrr(RS, RD)                  _qO_Mrm         (0x09           
,_b11,_r8(RS),_r8(RD)                           )
-#define ORQir(IM, RD)                  _qOs_Mrm_sL     (0x81           
,_b11,_b001  ,_r8(RD)                   ,IM     )
-
-#define POPQr(RD)                      _qOr            (0x58,_r8(RD)           
                                        )
-
-#define PUSHQr(R)                      _qOr            (0x50,_r8(R)            
                                        )
-#define SALQir SHLQir
-#define SALQim SHLQim
-#define SALQrr SHLQrr
-#define SALQrm SHLQrm
-
-#define SARQir(IM,RD)          (((IM)==1) ?    _qO_Mrm         (0xd1   
,_b11,_b111,_r8(RD)                             ) : \
-                                               _qO_Mrm_B       (0xc1   
,_b11,_b111,_r4(RD)                     ,_u8(IM) ) )
-#define SARQrr(RS,RD)          (((RS)==_CL) ?  _qO_Mrm         (0xd3   
,_b11,_b111,_r8(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define SHLQir(IM,RD)          (((IM)==1) ?    _qO_Mrm         (0xd1   
,_b11,_b100,_r8(RD)                             ) : \
-                                               _qO_Mrm_B       (0xc1   
,_b11,_b100,_r8(RD)                     ,_u8(IM) ) )
-#define SHLQrr(RS,RD)          (((RS)==_CL) ?  _qO_Mrm         (0xd3   
,_b11,_b100,_r8(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define SHRQir(IM,RD)          (((IM)==1) ?    _qO_Mrm         (0xd1   
,_b11,_b101,_r8(RD)                             ) : \
-                                               _qO_Mrm_B       (0xc1   
,_b11,_b101,_r8(RD)                     ,_u8(IM) ) )
-#define SHRQrr(RS,RD)          (((RS)==_CL) ?  _qO_Mrm         (0xd3   
,_b11,_b101,_r8(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
+#define _rA(R)          _r8(R)
+
+/* Use RIP-addressing in 64-bit mode, if possible */
+#if 0
+#define _x86_RIP_addressing_possible(D,O)      (X86_RIP_RELATIVE_ADDR && \
+                                               ((unsigned long)x86_get_target() + 4 + (O) - (D) <= 0xffffffff))
+
+#define _r_X(   R, D,B,I,S,O)  (_r0P(I) ? (_r0P(B)    ? (!X86_TARGET_64BIT ? _r_D(R,D) : \
+                                                        (_x86_RIP_addressing_possible(D, O) ? \
+                                                         _r_D(R, (D) - ((unsigned long)x86_get_target() + 4 + (O))) : \
+                                                         _r_DSIB(R,D))) : \
+                                                        _r_DSIB(R,D                ))  : \
+                                          (_rIP(B)    ? _r_D   (R,D                )   : \
+                                          (_rsp12P(B) ? _r_DBIS(R,D,_RSP,_RSP,1)   : \
+                                                        _r_DB  (R,D,     B       ))))  : \
+                                (_r0P(B)              ? _r_4IS (R,D,            I,S)   : \
+                                (!_rspP(I)            ? _r_DBIS(R,D,     B,     I,S)   : \
+                                                        JITFAIL("illegal index register: %esp"))))
+#else
+#define _r_X(   R, D,B,I,S,O)  (_r0P(I) ? (_r0P(B)    ? _r_DSIB(R,D                )   : \
+                                          (_rIP(B)    ? _r_D   (R,D                )   : \
+                                          (_rsp12P(B) ? _r_DBIS(R,D,_RSP,_RSP,1)   : \
+                                                        _r_DB  (R,D,     B       ))))  : \
+                                (_r0P(B)              ? _r_4IS (R,D,            I,S)   : \
+                                (!_rspP(I)            ? _r_DBIS(R,D,     B,     I,S)   : \
+                                                        JITFAIL("illegal index register: %esp"))))
+#endif
 
 
-#define SUBQrr(RS, RD)                 _qO_Mrm         (0x29           
,_b11,_r8(RS),_r8(RD)                           )
-#define SUBQir(IM, RD)                 _qOs_Mrm_sL     (0x81           
,_b11,_b101  ,_r8(RD)                   ,IM     )
+#define _m32only(X)            (JITFAIL("invalid instruction in 64-bit mode"))
+#define _m64only(X)            (X)
+#define _m64(X)                        (X)
+
+#define CALLsr(R)                      CALLQsr(R)
+#define JMPsr(R)                       JMPQsr(R)
+
+#define _SPL           0x14
+#define _BPL           0x15
+#define _SIL           0x16
+#define _DIL           0x17
+#define _R8B           0x18
+#define _R9B           0x19
+#define _R10B          0x1A
+#define _R11B          0x1B
+#define _R12B          0x1C
+#define _R13B          0x1D
+#define _R14B          0x1E
+#define _R15B          0x1F
+
+#define _R8W           0x38
+#define _R9W           0x39
+#define _R10W          0x3A
+#define _R11W          0x3B
+#define _R12W          0x3C
+#define _R13W          0x3D
+#define _R14W          0x3E
+#define _R15W          0x3F
+#define _R8D           0x48
+#define _R9D           0x49
+#define _R10D          0x4A
+#define _R11D          0x4B
+#define _R12D          0x4C
+#define _R13D          0x4D
+#define _R14D          0x4E
+#define _R15D          0x4F
+
+#define _RAX           0x50
+#define _RCX           0x51
+#define _RDX           0x52
+#define _RBX           0x53
+#define _RSP           0x54
+#define _RBP           0x55
+#define _RSI           0x56
+#define _RDI           0x57
+#define _R8            0x48
+#define _R9            0x49
+#define _R10           0x4A
+#define _R11           0x4B
+#define _R12           0x4C
+#define _R13           0x4D
+#define _R14           0x4E
+#define _R15           0x4F
+#define _RIP           -2
+
+#if 0
+#define _r8(R)         ( (_rC(R) == 0x50)                      ? _rN(R) : JITFAIL("64-bit register required"))
+#else
+#define _r8(R)         ( (_rC(R) == 0x50)                      ? _rN(R) : _r4(R))
+#endif
 
-#define TESTQrr(RS, RD)                        _qO_Mrm         (0x85           
,_b11,_r8(RS),_r8(RD)                           )
-#define TESTQir(IM, RD)                        _qO_Mrm_L       (0xf7           
,_b11,_b000  ,_r8(RD)                   ,IM     )
+#define _r1e8lP(R)     ((int)(R) >= _SPL && (int)(R) <= _DIL)
 
-#define XORQrr(RS, RD)                 _qO_Mrm         (0x31           
,_b11,_r8(RS),_r8(RD)                           )
-#define XORQir(IM, RD)                 _qOs_Mrm_sL     (0x81           
,_b11,_b110  ,_r8(RD)                   ,IM     )
+#define DECWr(RD)      (_d16(), _REXLrr(0, RD),        _O_Mrm          (0xff           ,_b11,_b001  ,_r2(RD)                           ))
+#define DECLr(RD)      (_REXLrr(0, RD),                _O_Mrm          (0xff           ,_b11,_b001  ,_r4(RD)                           ))
+#define INCWr(RD)      (_d16(), _REXLrr(0, RD),        _O_Mrm          (0xff           ,_b11,_b000  ,_r2(RD)                           ))
+#define INCLr(RD)      (_REXLrr(0, RD),                _O_Mrm          (0xff           ,_b11,_b000  ,_r4(RD)                           ))
 
 #endif
 #endif /* __lightning_asm_h */
diff --git a/lightning/i386/asm-i386.h b/lightning/i386/asm-i386.h
index c04d2c1..e22a2cf 100644
--- a/lightning/i386/asm-i386.h
+++ b/lightning/i386/asm-i386.h
@@ -40,7 +40,6 @@
  *             + sr/sm         = a star preceding a register or memory
  */
 
-
 typedef _uc            jit_insn;
 
 #ifndef LIGHTNING_DEBUG
@@ -60,24 +59,21 @@ typedef _uc         jit_insn;
 
 /*** REGISTERS ***/    /* [size,,number] */
 
+#define _NOREG         0
 
 #define _AL            0x10
 #define _CL            0x11
 #define _DL            0x12
 #define _BL            0x13
-#define _AH            0x14
-#define _CH            0x15
-#define _DH            0x16
-#define _BH            0x17
-
-#define _AX            0x20
-#define _CX            0x21
-#define _DX            0x22
-#define _BX            0x23
-#define _SP            0x24
-#define _BP            0x25
-#define _SI            0x26
-#define _DI            0x27
+
+#define _AX            0x30
+#define _CX            0x31
+#define _DX            0x32
+#define _BX            0x33
+#define _SP            0x34
+#define _BP            0x35
+#define _SI            0x36
+#define _DI            0x37
 
 #define _EAX           0x40
 #define _ECX           0x41
@@ -88,6 +84,24 @@ typedef _uc          jit_insn;
 #define _ESI           0x46
 #define _EDI           0x47
 
+#define _MM0           0x60
+#define _MM1           0x61
+#define _MM2           0x62
+#define _MM3           0x63
+#define _MM4           0x64
+#define _MM5           0x65
+#define _MM6           0x66
+#define _MM7           0x67
+
+#define _XMM0          0x70
+#define _XMM1          0x71
+#define _XMM2          0x72
+#define _XMM3          0x73
+#define _XMM4          0x74
+#define _XMM5          0x75
+#define _XMM6          0x76
+#define _XMM7          0x77
+
 #define _ST0           0
 #define _ST1           1
 #define _ST2           2
@@ -97,21 +111,33 @@ typedef _uc                jit_insn;
 #define _ST6           6
 #define _ST7           7
 
-#define _rS(R)         ((R)>>4)
-#define _rN(R)         ((R)&0x7)
-#define _r0P(R)                ((R)==0)
+#define _r0P(R)                ((int)(R) == (int)_NOREG)
+#define _rIP(R)                ((int)(R) == (int)_RIP)
 
-#ifndef _ASM_SAFETY
+#define _rC(R)         ((R) & 0xf0)
+#define _rR(R)         ((R) & 0x0f)
+#define _rN(R)         ((R) & 0x07)
+#define _rXP(R)                ((R) > 0 && _rR(R) > 7)
+
+#if !defined(_ASM_SAFETY)
 #define _r1(R)         _rN(R)
 #define _r2(R)         _rN(R)
 #define _r4(R)         _rN(R)
+#define _r8(R)         _rN(R)
+#define _rM(R)         _rN(R)
+#define _rX(R)         _rN(R)
 #else
-#define _r1(R)         ((_rS(R)==1) ? _rN(R) : JITFAIL( "8-bit register 
required"))
-#define _r2(R)         ((_rS(R)==2) ? _rN(R) : JITFAIL("16-bit register 
required"))
-#define _r4(R)         ((_rS(R)==4) ? _rN(R) : JITFAIL("32-bit register 
required"))
+#define _r1(R)         ( ((unsigned) _rC((R) - 16)) < (0x30 - 16)      ? _rN(R) : JITFAIL( "8-bit register required"))
+#define _r2(R)         ( (_rC(R) == 0x30)                      ? _rN(R) : JITFAIL("16-bit register required"))
+#define _r4(R)         ( (_rC(R) == 0x40)                      ? _rN(R) : JITFAIL("32-bit register required"))
+#define _rM(R)         ( (_rC(R) == 0x60)                      ? _rN(R) : JITFAIL("MMX register required"))
+#define _rX(R)         ( (_rC(R) == 0x70)                      ? _rN(R) : JITFAIL("SSE register required"))
 #endif
 
-#define _r8(R)         _r4(R)
+#define _rbpP(R)       (_rR(R) == _rR(_EBP))
+#define _rspP(R)       (_rR(R) == _rR(_ESP))
+#define _rbp13P(R)     (_rN(R) == _rN(_EBP))
+#define _rsp12P(R)     (_rN(R) == _rN(_ESP))
 
 /*** ASSEMBLER ***/
 
@@ -149,26 +175,23 @@ typedef _uc               jit_insn;
 
 /* memory subformats - urgh! */
 
-#define _r_0B( R,   B    )     (_Mrm(_b00,_rN(R),_r4(B))                       
           )
-#define _r_0BIS(R,   B,I,S)    (_Mrm(_b00,_rN(R),_b100 
),_SIB(_SCL(S),_r4(I),_r4(B))      )
-#define _r_1B( R, D,B    )     (_Mrm(_b01,_rN(R),_r4(B))                       
     ,_jit_B((long)(D)))
-#define _r_1BIS(R, D,B,I,S)    (_Mrm(_b01,_rN(R),_b100 
),_SIB(_SCL(S),_r4(I),_r4(B)),_jit_B((long)(D)))
-#define _r_4B( R, D,B    )     (_Mrm(_b10,_rN(R),_r4(B))                       
     ,_jit_I((long)(D)))
-#define _r_4IS( R, D,I,S)      (_Mrm(_b00,_rN(R),_b100 
),_SIB(_SCL(S),_r4(I),_b101 ),_jit_I((long)(D)))
-#define _r_4BIS(R, D,B,I,S)    (_Mrm(_b10,_rN(R),_b100 
),_SIB(_SCL(S),_r4(I),_r4(B)),_jit_I((long)(D)))
+/* _r_D() is RIP addressing mode if X86_TARGET_64BIT, use _r_DSIB() instead */
+#define _r_D(  R, D      )     (_Mrm(_b00,_rN(R),_b101 )                            ,_jit_I((long)(D)))
+#define _r_DSIB(R, D      )    (_Mrm(_b00,_rN(R),_b100 ),_SIB(_SCL(1),_b100 ,_b101 ),_jit_I((long)(D)))
+#define _r_0B( R,   B    )     (_Mrm(_b00,_rN(R),_rA(B))                                          )
+#define _r_0BIS(R,   B,I,S)    (_Mrm(_b00,_rN(R),_b100 ),_SIB(_SCL(S),_rA(I),_rA(B))              )
+#define _r_1B( R, D,B    )     (_Mrm(_b01,_rN(R),_rA(B))                            ,_jit_B((long)(D)))
+#define _r_1BIS(R, D,B,I,S)    (_Mrm(_b01,_rN(R),_b100 ),_SIB(_SCL(S),_rA(I),_rA(B)),_jit_B((long)(D)))
+#define _r_4B( R, D,B    )     (_Mrm(_b10,_rN(R),_rA(B))                            ,_jit_I((long)(D)))
+#define _r_4IS( R, D,I,S)      (_Mrm(_b00,_rN(R),_b100 ),_SIB(_SCL(S),_rA(I),_b101 ),_jit_I((long)(D)))
+#define _r_4BIS(R, D,B,I,S)    (_Mrm(_b10,_rN(R),_b100 ),_SIB(_SCL(S),_rA(I),_rA(B)),_jit_I((long)(D)))
+
+#define _r_DB(  R, D,B    )    ((_s0P(D) && (!_rbp13P(B)) ? _r_0B  (R,  B    ) : (_s8P(D) ? _r_1B(  R,D,B    ) : _r_4B(  R,D,B    ))))
+#define _r_DBIS(R, D,B,I,S)    ((_s0P(D) && (!_rbp13P(B)) ? _r_0BIS(R,  B,I,S) : (_s8P(D) ? _r_1BIS(R,D,B,I,S) : _r_4BIS(R,D,B,I,S))))
 
-#define _r_DB(  R, D,B    )    ((_s0P(D) && (B != _EBP) ? _r_0B  (R,  B    ) : 
(_s8P(D) ? _r_1B(  R,D,B    ) : _r_4B(  R,D,B    ))))
-#define _r_DBIS(R, D,B,I,S)    ((_s0P(D)                ? _r_0BIS(R,  B,I,S) : 
(_s8P(D) ? _r_1BIS(R,D,B,I,S) : _r_4BIS(R,D,B,I,S))))
- 
-#define _r_X(   R, D,B,I,S)    (_r0P(I) ? (_r0P(B)   ? _r_D   (R,D            
)   : \
-                                          (_ESP==(B) ? 
_r_DBIS(R,D,_ESP,_ESP,1)   : \
-                                                       _r_DB  (R,D,   B       
))) : \
-                                (_r0P(B)             ? _r_4IS (R,D,        
I,S)   : \
-                                (((I)!=_ESP)         ? _r_DBIS(R,D,   B,   
I,S)   : \
-                                                       JITFAIL("illegal index 
register: %esp"))))
 
 
-/* instruction formats */
+/* --- Instruction formats ------------------------------------------------- */
 
 /*      _format                                                     Opcd         ModR/M dN(rB,rI,Sc)     imm... */
 
@@ -180,7 +203,11 @@ typedef _uc                jit_insn;
 #define          _Os(       OP,B                       )  (    _s8P(B) ? 
_jit_B(((OP)|_b10)) : _jit_B(OP)                        )
 #define            _sW(                             W  )  (                    
               _s8P(W) ? _jit_B(W):_jit_W(W)      )
 #define            _sL(                             L  )  (                    
               _s8P(L) ? _jit_B(L):_jit_I(L)      )
+#define            _sWO(                            W  )  (                    
               _s8P(W) ?    1 :   2       )
+#define            _sLO(                            L  )  (                    
               _s8P(L) ?    1 :   4       )
+#define          _O_B(      OP                     ,B  )  (        _O      (  
OP  )                          ,_jit_B(B)          )
 #define          _O_W(      OP                     ,W  )  (        _O      (  
OP  )                          ,_jit_W(W)          )
+#define          _O_L(      OP                     ,L  )  (        _O      (  
OP  )                          ,_jit_I(L)          )
 #define          _O_D8(     OP                     ,D  )  (        _O      (  
OP  )                         ,_D8(D)      )
 #define          _O_D32(     OP                    ,D  )  (        _O      (  
OP  )                         ,_D32(D)     )
 #define         _OO_D32(     OP                    ,D  )  (       _OO      (  
OP  )                         ,_D32(D)     )
@@ -190,6 +217,7 @@ typedef _uc         jit_insn;
 #define          _Or_B(     OP,R                   ,B  )  (        _Or     (  
OP,R)                          ,_jit_B(B)          )
 #define          _Or_W(     OP,R                   ,W  )  (        _Or     (  
OP,R)                          ,_jit_W(W)          )
 #define          _Or_L(     OP,R                   ,L  )  (        _Or     (  
OP,R)                          ,_jit_I(L)          )
+#define          _Or_Q(     OP,R                   ,Q  )  (        _Or     (  
OP,R)                          ,_jit_L(Q)          )
 #define          _O_Mrm(    OP  ,MO,R,M                )  (        _O      (  
OP  ),_Mrm(MO,R,M            )             )
 #define         _OO_Mrm(    OP  ,MO,R,M                )  (       _OO      (  
OP  ),_Mrm(MO,R,M            )             )
 #define          _O_Mrm_B(  OP  ,MO,R,M            ,B  )  (        _O      (  
OP  ),_Mrm(MO,R,M            ) ,_jit_B(B)          )
@@ -198,788 +226,1165 @@ typedef _uc            jit_insn;
 #define         _OO_Mrm_B(  OP  ,MO,R,M            ,B  )  (       _OO      (  
OP  ),_Mrm(MO,R,M            ) ,_jit_B(B)          )
 #define          _Os_Mrm_sW(OP  ,MO,R,M            ,W  )  (        _Os     (  
OP,W),_Mrm(MO,R,M            ),_sW(W)      )
 #define          _Os_Mrm_sL(OP  ,MO,R,M            ,L  )  (        _Os     (  
OP,L),_Mrm(MO,R,M            ),_sL(L)      )
-#define          _O_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS)             )
-#define         _OO_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (       _OO      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS)             )
-#define          _O_r_X_B(  OP     ,R  ,MD,MB,MI,MS,B  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS) ,_jit_B(B)          )
-#define          _O_r_X_W(  OP     ,R  ,MD,MB,MI,MS,W  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS) ,_jit_W(W)          )
-#define          _O_r_X_L(  OP     ,R  ,MD,MB,MI,MS,L  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS) ,_jit_I(L)          )
-#define         _OO_r_X_B(  OP     ,R  ,MD,MB,MI,MS,B  )  (       _OO      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS) ,_jit_B(B)          )
-#define          _Os_r_X_sW(OP     ,R  ,MD,MB,MI,MS,W  )  (        _Os     (  
OP,W),_r_X(   R  ,MD,MB,MI,MS),_sW(W)      )
-#define          _Os_r_X_sL(OP     ,R  ,MD,MB,MI,MS,L  )  (        _Os     (  
OP,L),_r_X(   R  ,MD,MB,MI,MS),_sL(L)      )
+#define          _O_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,0)           )
+#define         _OO_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (       _OO      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,0)           )
+#define          _O_r_X_B(  OP     ,R  ,MD,MB,MI,MS,B  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,1) ,_jit_B(B)        )
+#define          _O_r_X_W(  OP     ,R  ,MD,MB,MI,MS,W  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,2) ,_jit_W(W)        )
+#define          _O_r_X_L(  OP     ,R  ,MD,MB,MI,MS,L  )  (        _O      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,4) ,_jit_I(L)        )
+#define         _OO_r_X_B(  OP     ,R  ,MD,MB,MI,MS,B  )  (       _OO      (  
OP  ),_r_X(   R  ,MD,MB,MI,MS,1) ,_jit_B(B)        )
+#define          _Os_r_X_sW(OP     ,R  ,MD,MB,MI,MS,W  )  (        _Os     (  
OP,W),_r_X(   R  ,MD,MB,MI,MS,_sWO(W)),_sW(W))
+#define          _Os_r_X_sL(OP     ,R  ,MD,MB,MI,MS,L  )  (        _Os     (  
OP,L),_r_X(   R  ,MD,MB,MI,MS,_sLO(L)),_sL(L))
 #define          _O_X_B(    OP         ,MD,MB,MI,MS,B  )  (        _O_r_X_B(  
OP           ,0  ,MD,MB,MI,MS     ,B)      )
 #define          _O_X_W(    OP         ,MD,MB,MI,MS,W  )  (        _O_r_X_W(  
OP           ,0  ,MD,MB,MI,MS     ,W)      )
 #define          _O_X_L(    OP         ,MD,MB,MI,MS,L  )  (        _O_r_X_L(  
OP           ,0  ,MD,MB,MI,MS     ,L)      )
-#define         _wO(        OP                         )  (_d16(), _O(        
OP                                  )      )
-#define         _wOr(       OP,R                       )  (_d16(), _Or(       
OP,R                                )      )
-#define         _wOr_W(     OP,R                   ,W  )  (_d16(), _Or_W(     
OP,R                              ,W)      )
-#define         _wOs_sW(    OP                     ,W  )  (_d16(), _Os_sW(    
OP                                ,W)      )
-#define         _wO_Mrm(    OP  ,MO,R,M                )  (_d16(), _O_Mrm(    
OP        ,MO,R,M                   )      )
-#define _wOO_Mrm(    OP         ,MO,R,M                )  (_d16(),_OO_Mrm(    
OP        ,MO,R,M                   )      )
-#define         _wO_Mrm_B(  OP  ,MO,R,M            ,B  )  (_d16(), _O_Mrm_B(  
OP        ,MO,R,M                 ,B)      )
-#define _wOO_Mrm_B(  OP         ,MO,R,M            ,B  )  (_d16(),_OO_Mrm_B(  
OP        ,MO,R,M                 ,B)      )
-#define         _wO_Mrm_W(  OP  ,MO,R,M            ,W  )  (_d16(), _O_Mrm_W(  
OP        ,MO,R,M                 ,W)      )
-#define         _wOs_Mrm_sW(OP  ,MO,R,M            ,W  )  (_d16(), 
_Os_Mrm_sW(OP        ,MO,R,M                 ,W)      )
-#define         _wO_X_W(    OP         ,MD,MB,MI,MS,W  )  (_d16(), _O_X_W(    
OP               ,MD,MB,MI,MS     ,W)      )
-#define         _wO_r_X(    OP     ,R  ,MD,MB,MI,MS    )  (_d16(), _O_r_X(    
OP           ,R  ,MD,MB,MI,MS       )      )
-#define _wOO_r_X(    OP            ,R  ,MD,MB,MI,MS    )  (_d16(),_OO_r_X(    
OP           ,R  ,MD,MB,MI,MS       )      )
-#define         _wO_r_X_B(  OP     ,R  ,MD,MB,MI,MS,B  )  (_d16(), _O_r_X_B(  
OP           ,R  ,MD,MB,MI,MS     ,B)      )
-#define _wOO_r_X_B(  OP            ,R  ,MD,MB,MI,MS,B  )  (_d16(),_OO_r_X_B(  
OP           ,R  ,MD,MB,MI,MS     ,B)      )
-#define         _wO_r_X_W(  OP     ,R  ,MD,MB,MI,MS,W  )  (_d16(), _O_r_X_W(  
OP           ,R  ,MD,MB,MI,MS     ,W)      )
-#define         _wOs_r_X_sW(OP     ,R  ,MD,MB,MI,MS,W  )  (_d16(), 
_Os_r_X_sW(OP           ,R  ,MD,MB,MI,MS     ,W)      )
-
-/* +++ fully-qualified intrinsic instructions */
-
-/*                                     _format          Opcd           ,Mod ,r 
    ,m          ,mem=dsp+sib    ,imm... */
-
-#define ADCBrr(RS, RD)                 _O_Mrm          (0x10           
,_b11,_r1(RS),_r1(RD)                           )
-#define ADCBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x12                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define ADCBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x10                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define ADCBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b010  ,_r1(RD)                   ,_su8(IM))
-#define ADCBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b010             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define ADCWrr(RS, RD)                 _wO_Mrm         (0x11           
,_b11,_r2(RS),_r2(RD)                           )
-#define ADCWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x13                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define ADCWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x11                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define ADCWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b010  ,_r2(RD)                   ,_su16(IM))
-#define ADCWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b010             ,MD,MB,MI,MS    ,_su16(IM))
-
-#define ADCLrr(RS, RD)                 _O_Mrm          (0x11           
,_b11,_r4(RS),_r4(RD)                           )
-#define ADCLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x13                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define ADCLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x11                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define ADCLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b010  ,_r4(RD)                   ,IM     )
-#define ADCLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b010             ,MD,MB,MI,MS    ,IM     )
-
-
-#define ADDBrr(RS, RD)                 _O_Mrm          (0x00           
,_b11,_r1(RS),_r1(RD)                           )
-#define ADDBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x02                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define ADDBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x00                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define ADDBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b000  ,_r1(RD)                   ,_su8(IM))
-#define ADDBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b000             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define ADDWrr(RS, RD)                 _wO_Mrm         (0x01           
,_b11,_r2(RS),_r2(RD)                           )
-#define ADDWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x03                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define ADDWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x01                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define ADDWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b000  ,_r2(RD)                   ,_su16(IM))
-#define ADDWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b000             ,MD,MB,MI,MS    ,_su16(IM))
-
-#define ADDLrr(RS, RD)                 _O_Mrm          (0x01           
,_b11,_r4(RS),_r4(RD)                           )
-#define ADDLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x03                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define ADDLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x01                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define ADDLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b000  ,_r4(RD)                   ,IM     )
-#define ADDLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b000             ,MD,MB,MI,MS    ,IM     )
 
 
-#define ANDBrr(RS, RD)                 _O_Mrm          (0x20           
,_b11,_r1(RS),_r1(RD)                           )
-#define ANDBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x22                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define ANDBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x20                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define ANDBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b100  ,_r1(RD)                   ,_su8(IM))
-#define ANDBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b100             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define ANDWrr(RS, RD)                 _wO_Mrm         (0x21           
,_b11,_r2(RS),_r2(RD)                           )
-#define ANDWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x23                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define ANDWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x21                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define ANDWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b100  ,_r2(RD)                   ,_su16(IM))
-#define ANDWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b100             ,MD,MB,MI,MS    ,_su16(IM))
+/* --- REX prefixes -------------------------------------------------------- */
 
-#define ANDLrr(RS, RD)                 _O_Mrm          (0x21           
,_b11,_r4(RS),_r4(RD)                           )
-#define ANDLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x23                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define ANDLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x21                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define ANDLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b100  ,_r4(RD)                   ,IM     )
-#define ANDLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b100             ,MD,MB,MI,MS    ,IM     )
+#define _VOID()                        ((void)0) /* no-op expression used when nothing is to be emitted */
+#define _BIT(X)                        (!!(X)) /* normalise any non-zero value to 1 */
+#define _d64(W,R,X,B)          (_jit_B(0x40|(W)<<3|(R)<<2|(X)<<1|(B))) /* passes the byte 0x40 | W<<3 | R<<2 | X<<1 | B to _jit_B (presumably the byte emitter -- confirm) */
 
+#define __REXwrxb(L,W,R,X,B)   ((W|R|X|B) || (L) ? _d64(W,R,X,B) : _VOID()) /* build the prefix only if one of W/R/X/B is set or L forces it; otherwise _VOID() */
+#define __REXwrx_(L,W,R,X,MR)  (__REXwrxb(L,W,R,X,_BIT(_rIP(MR)?0:_rXP(MR)))) /* B bit derived from MR: 0 when _rIP(MR) holds, else _rXP(MR) */
+#define __REXw_x_(L,W,R,X,MR)  (__REXwrx_(L,W,_BIT(_rXP(R)),X,MR)) /* as __REXwrx_, with the R bit derived from _rXP(R) */
+#define __REX_reg(RR)          (__REXwrxb(0,0,0,00,_BIT(_rXP(RR)))) /* single register operand: only the B bit, from _rXP(RR) */
+#define __REX_mem(MB,MI)       (__REXwrxb(0,0,0,_BIT(_rXP(MI)),_BIT(_rXP(MB)))) /* memory operand: X bit from index MI, B bit from base MB */
 
-#define BSWAPLr(R)                     _OOr            (0x0fc8,_r4(R)          
                                        )
+// FIXME: can't mix new (SPL,BPL,SIL,DIL) with (AH,BH,CH,DH)
+#define _REXBrr(RR,MR)         
_m64(__REXw_x_(_r1e8lP(RR)||_r1e8lP(MR),0,RR,0,MR))
+#define _REXBmr(MB,MI,RD)      
_m64(__REXw_x_(_r1e8lP(RD)||_r1e8lP(MB),0,RD,_BIT(_rXP(MI)),MB))
+#define _REXBrm(RS,MB,MI)      _REXBmr(MB,MI,RS)
 
+#define _REXBLrr(RR,MR)                _m64(__REXw_x_(_r1e8lP(MR),0,RR,0,MR))
+#define _REXLrr(RR,MR)         _m64(__REXw_x_(0,0,RR,0,MR))
+#define _REXLmr(MB,MI,RD)      _m64(__REXw_x_(0,0,RD,_BIT(_rXP(MI)),MB))
+#define _REXLrm(RS,MB,MI)      _REXLmr(MB,MI,RS)
+#define _REXLr(RR)             _m64(__REX_reg(RR))
+#define _REXLm(MB,MI)          _m64(__REX_mem(MB,MI))
 
-#define BTWir(IM,RD)                   _wOO_Mrm_B      (0x0fba         
,_b11,_b100  ,_r2(RD)                   ,_u8(IM))
-#define BTWim(IM,MD,MB,MI,MS)          _wOO_r_X_B      (0x0fba              
,_b100             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTWrr(RS,RD)                   _wOO_Mrm        (0x0fa3         
,_b11,_r2(RS),_r2(RD)                           )
-#define BTWrm(RS,MD,MB,MI,MS)          _wOO_r_X        (0x0fa3              
,_r2(RS)           ,MD,MB,MI,MS            )
+#define _REXQrr(RR,MR)         _m64only(__REXw_x_(0,1,RR,0,MR))
+#define _REXQmr(MB,MI,RD)      _m64only(__REXw_x_(0,1,RD,_BIT(_rXP(MI)),MB))
+#define _REXQrm(RS,MB,MI)      _REXQmr(MB,MI,RS)
+#define _REXQr(RR)             _m64only(__REX_reg(RR))
+#define _REXQm(MB,MI)          _m64only(__REX_mem(MB,MI))
 
-#define BTLir(IM,RD)                   _OO_Mrm_B       (0x0fba         
,_b11,_b100  ,_r4(RD)                   ,_u8(IM))
-#define BTLim(IM,MD,MB,MI,MS)          _OO_r_X_B       (0x0fba              
,_b100             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTLrr(RS,RD)                   _OO_Mrm         (0x0fa3         
,_b11,_r4(RS),_r4(RD)                           )
-#define BTLrm(RS,MD,MB,MI,MS)          _OO_r_X         (0x0fa3              
,_r4(RS)           ,MD,MB,MI,MS            )
 
+/* ========================================================================= */
+/* --- Fully-qualified intrinsic instructions ------------------------------ */
+/* ========================================================================= */
 
-#define BTCWir(IM,RD)                  _wOO_Mrm_B      (0x0fba         
,_b11,_b111  ,_r2(RD)                   ,_u8(IM))
-#define BTCWim(IM,MD,MB,MI,MS)         _wOO_r_X_B      (0x0fba              
,_b111             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTCWrr(RS,RD)                  _wOO_Mrm        (0x0fbb         
,_b11,_r2(RS),_r2(RD)                           )
-#define BTCWrm(RS,MD,MB,MI,MS)         _wOO_r_X        (0x0fbb              
,_r2(RS)           ,MD,MB,MI,MS            )
-
-#define BTCLir(IM,RD)                  _OO_Mrm_B       (0x0fba         
,_b11,_b111  ,_r4(RD)                   ,_u8(IM))
-#define BTCLim(IM,MD,MB,MI,MS)         _OO_r_X_B       (0x0fba              
,_b111             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTCLrr(RS,RD)                  _OO_Mrm         (0x0fbb         
,_b11,_r4(RS),_r4(RD)                           )
-#define BTCLrm(RS,MD,MB,MI,MS)         _OO_r_X         (0x0fbb              
,_r4(RS)           ,MD,MB,MI,MS            )
-
-
-#define BTRWir(IM,RD)                  _wOO_Mrm_B      (0x0fba         
,_b11,_b110  ,_r2(RD)                   ,_u8(IM))
-#define BTRWim(IM,MD,MB,MI,MS)         _wOO_r_X_B      (0x0fba              
,_b110             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTRWrr(RS,RD)                  _wOO_Mrm        (0x0fb3         
,_b11,_r2(RS),_r2(RD)                           )
-#define BTRWrm(RS,MD,MB,MI,MS)         _wOO_r_X        (0x0fb3              
,_r2(RS)           ,MD,MB,MI,MS            )
-
-#define BTRLir(IM,RD)                  _OO_Mrm_B       (0x0fba         
,_b11,_b110  ,_r4(RD)                   ,_u8(IM))
-#define BTRLim(IM,MD,MB,MI,MS)         _OO_r_X_B       (0x0fba              
,_b110             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTRLrr(RS,RD)                  _OO_Mrm         (0x0fb3         
,_b11,_r4(RS),_r4(RD)                           )
-#define BTRLrm(RS,MD,MB,MI,MS)         _OO_r_X         (0x0fb3              
,_r4(RS)           ,MD,MB,MI,MS            )
+/*     OPCODE  + i     = immediate operand
+ *             + r     = register operand
+ *             + m     = memory operand (disp,base,index,scale)
+ *             + sr/sm = a star preceding a register or memory
+ *             + 0     = top of stack register (for FPU instructions)
+ *
+ *     NOTE in x86-64 mode: a memory operand with only a valid
+ *     displacement value will lead to the expected absolute mode. If
+ *     RIP addressing is necessary, X86_RIP shall be used as the base
+ *     register argument.
+ */
 
+/* --- ALU instructions ---------------------------------------------------- */
 
-#define BTSWir(IM,RD)                  _wOO_Mrm_B      (0x0fba         
,_b11,_b101  ,_r2(RD)                   ,_u8(IM))
-#define BTSWim(IM,MD,MB,MI,MS)         _wOO_r_X_B      (0x0fba              
,_b101             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTSWrr(RS,RD)                  _wOO_Mrm        (0x0fab         
,_b11,_r2(RS),_r2(RD)                           )
-#define BTSWrm(RS,MD,MB,MI,MS)         _wOO_r_X        (0x0fab              
,_r2(RS)           ,MD,MB,MI,MS            )
+enum { /* ALU operation selectors: the _ALU* macros use them as (OP) << 3 when forming opcode bytes and pass them in the r slot of the 0x80/0x81 immediate forms */
+  X86_ADD = 0,
+  X86_OR  = 1,
+  X86_ADC = 2,
+  X86_SBB = 3,
+  X86_AND = 4,
+  X86_SUB = 5,
+  X86_XOR = 6,
+  X86_CMP = 7,
+};
 
-#define BTSLir(IM,RD)                  _OO_Mrm_B       (0x0fba         
,_b11,_b101  ,_r4(RD)                   ,_u8(IM))
-#define BTSLim(IM,MD,MB,MI,MS)         _OO_r_X_B       (0x0fba              
,_b101             ,MD,MB,MI,MS    ,_u8(IM))
-#define BTSLrr(RS,RD)                  _OO_Mrm         (0x0fab         
,_b11,_r4(RS),_r4(RD)                           )
-#define BTSLrm(RS,MD,MB,MI,MS)         _OO_r_X         (0x0fab              
,_r4(RS)           ,MD,MB,MI,MS            )
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+#define _ALUBrr(OP,RS, RD)             (_REXBrr(RS, RD),               _O_Mrm  
        (((OP) << 3)    ,_b11,_r1(RS),_r1(RD)                           ))
+#define _ALUBmr(OP, MD, MB, MI, MS, RD)        (_REXBmr(MB, MI, RD),           
_O_r_X          (((OP) << 3) + 2,_r1(RD)                ,MD,MB,MI,MS            
))
+/* Byte ALU operation OP with register source RS and memory destination (MD,MB,MI,MS); opcode byte is (OP) << 3.  _O_r_X takes exactly six arguments (OP,R,MD,MB,MI,MS). */
+#define _ALUBrm(OP, RS, MD, MB, MI, MS)        (_REXBrm(RS, MB, MI),           
_O_r_X          (((OP) << 3)         ,_r1(RS)           ,MD,MB,MI,MS            
))
+#define _ALUBir(OP, IM, RD)            ((RD) == _AL ? \
+                                       (_REXBrr(0, RD),                _O_B    
        (((OP) << 3) + 4                                        ,_su8(IM))) : \
+                                       (_REXBrr(0, RD),                
_O_Mrm_B        (0x80           ,_b11,OP     ,_r1(RD)                   
,_su8(IM))) )
+#define _ALUBim(OP, IM, MD, MB, MI, MS)        (_REXBrm(0, MB, MI),            
_O_r_X_B        (0x80                ,OP                ,MD,MB,MI,MS    
,_su8(IM)))
+
+#define _ALUWrr(OP, RS, RD)            (_d16(), _REXLrr(RS, RD),       _O_Mrm  
        (((OP) << 3) + 1,_b11,_r2(RS),_r2(RD)                           ))
+#define _ALUWmr(OP, MD, MB, MI, MS, RD)        (_d16(), _REXLmr(MB, MI, RD),   
_O_r_X          (((OP) << 3) + 3     ,_r2(RD)           ,MD,MB,MI,MS            
))
+#define _ALUWrm(OP, RS, MD, MB, MI, MS)        (_d16(), _REXLrm(RS, MB, MI),   
_O_r_X          (((OP) << 3) + 1     ,_r2(RS)           ,MD,MB,MI,MS            
))
+#define _ALUWir(OP, IM, RD)            (!_s8P(IM) && (RD) == _AX ? \
+                                       (_d16(), _REXLrr(0, RD),        _O_W    
        (((OP) << 3) + 5                                        ,_su16(IM))) : \
+                                       (_d16(), _REXLrr(0, RD),        
_Os_Mrm_sW      (0x81           ,_b11,OP     ,_r2(RD)                   
,_su16(IM))) )
+#define _ALUWim(OP, IM, MD, MB, MI, MS)        (_d16(), _REXLrm(0, MB, MI),    
_Os_r_X_sW      (0x81                ,OP                ,MD,MB,MI,MS    
,_su16(IM)))
+
+#define _ALULrr(OP, RS, RD)            (_REXLrr(RS, RD),               _O_Mrm  
        (((OP) << 3) + 1,_b11,_r4(RS),_r4(RD)                           ))
+#define _ALULmr(OP, MD, MB, MI, MS, RD)        (_REXLmr(MB, MI, RD),           
_O_r_X          (((OP) << 3) + 3     ,_r4(RD)           ,MD,MB,MI,MS            
))
+#define _ALULrm(OP, RS, MD, MB, MI, MS)        (_REXLrm(RS, MB, MI),           
_O_r_X          (((OP) << 3) + 1     ,_r4(RS)           ,MD,MB,MI,MS            
))
+#define _ALULir(OP, IM, RD)            (!_s8P(IM) && (RD) == _EAX ? \
+                                       (_REXLrr(0, RD),                _O_L    
        (((OP) << 3) + 5                                        ,IM     )) : \
+                                       (_REXLrr(0, RD),                
_Os_Mrm_sL      (0x81           ,_b11,OP     ,_r4(RD)                   ,IM     
)) )
+#define _ALULim(OP, IM, MD, MB, MI, MS)        (_REXLrm(0, MB, MI),            
_Os_r_X_sL      (0x81                ,OP                ,MD,MB,MI,MS    ,IM     
))
+
+#define _ALUQrr(OP, RS, RD)            (_REXQrr(RS, RD),               _O_Mrm  
        (((OP) << 3) + 1,_b11,_r8(RS),_r8(RD)                           ))
+#define _ALUQmr(OP, MD, MB, MI, MS, RD)        (_REXQmr(MB, MI, RD),           
_O_r_X          (((OP) << 3) + 3     ,_r8(RD)           ,MD,MB,MI,MS            
))
+#define _ALUQrm(OP, RS, MD, MB, MI, MS)        (_REXQrm(RS, MB, MI),           
_O_r_X          (((OP) << 3) + 1     ,_r8(RS)           ,MD,MB,MI,MS            
))
+#define _ALUQir(OP, IM, RD)            (!_s8P(IM) && (RD) == _RAX ? \
+                                       (_REXQrr(0, RD),                _O_L    
        (((OP) << 3) + 5                                        ,IM     )) : \
+                                       (_REXQrr(0, RD),                
_Os_Mrm_sL      (0x81           ,_b11,OP     ,_r8(RD)                   ,IM     
)) )
+#define _ALUQim(OP, IM, MD, MB, MI, MS)        (_REXQrm(0, MB, MI),            
_Os_r_X_sL      (0x81                ,OP                ,MD,MB,MI,MS    ,IM     
))
+
+#define ADCBrr(RS, RD)                 _ALUBrr(X86_ADC, RS, RD)
+#define ADCBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_ADC, MD, MB, MI, MS, RD)
+#define ADCBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_ADC, RS, MD, MB, MI, MS)
+#define ADCBir(IM, RD)                 _ALUBir(X86_ADC, IM, RD)
+#define ADCBim(IM, MD, MB, MI, MS)     _ALUBim(X86_ADC, IM, MD, MB, MI, MS)
+
+#define ADCWrr(RS, RD)                 _ALUWrr(X86_ADC, RS, RD)
+#define ADCWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_ADC, MD, MB, MI, MS, RD)
+#define ADCWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_ADC, RS, MD, MB, MI, MS)
+#define ADCWir(IM, RD)                 _ALUWir(X86_ADC, IM, RD)
+#define ADCWim(IM, MD, MB, MI, MS)     _ALUWim(X86_ADC, IM, MD, MB, MI, MS)
+
+#define ADCLrr(RS, RD)                 _ALULrr(X86_ADC, RS, RD)
+#define ADCLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_ADC, MD, MB, MI, MS, RD)
+#define ADCLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_ADC, RS, MD, MB, MI, MS)
+#define ADCLir(IM, RD)                 _ALULir(X86_ADC, IM, RD)
+#define ADCLim(IM, MD, MB, MI, MS)     _ALULim(X86_ADC, IM, MD, MB, MI, MS)
+
+#define ADCQrr(RS, RD)                 _ALUQrr(X86_ADC, RS, RD)
+#define ADCQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_ADC, MD, MB, MI, MS, RD)
+#define ADCQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_ADC, RS, MD, MB, MI, MS)
+#define ADCQir(IM, RD)                 _ALUQir(X86_ADC, IM, RD)
+#define ADCQim(IM, MD, MB, MI, MS)     _ALUQim(X86_ADC, IM, MD, MB, MI, MS)
+
+#define ADDBrr(RS, RD)                 _ALUBrr(X86_ADD, RS, RD)
+#define ADDBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_ADD, MD, MB, MI, MS, RD)
+#define ADDBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_ADD, RS, MD, MB, MI, MS)
+#define ADDBir(IM, RD)                 _ALUBir(X86_ADD, IM, RD)
+#define ADDBim(IM, MD, MB, MI, MS)     _ALUBim(X86_ADD, IM, MD, MB, MI, MS)
+
+#define ADDWrr(RS, RD)                 _ALUWrr(X86_ADD, RS, RD)
+#define ADDWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_ADD, MD, MB, MI, MS, RD)
+#define ADDWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_ADD, RS, MD, MB, MI, MS)
+#define ADDWir(IM, RD)                 _ALUWir(X86_ADD, IM, RD)
+#define ADDWim(IM, MD, MB, MI, MS)     _ALUWim(X86_ADD, IM, MD, MB, MI, MS)
+
+#define ADDLrr(RS, RD)                 _ALULrr(X86_ADD, RS, RD)
+#define ADDLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_ADD, MD, MB, MI, MS, RD)
+#define ADDLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_ADD, RS, MD, MB, MI, MS)
+#define ADDLir(IM, RD)                 _ALULir(X86_ADD, IM, RD)
+#define ADDLim(IM, MD, MB, MI, MS)     _ALULim(X86_ADD, IM, MD, MB, MI, MS)
+
+#define ADDQrr(RS, RD)                 _ALUQrr(X86_ADD, RS, RD)
+#define ADDQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_ADD, MD, MB, MI, MS, RD)
+#define ADDQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_ADD, RS, MD, MB, MI, MS)
+#define ADDQir(IM, RD)                 _ALUQir(X86_ADD, IM, RD)
+#define ADDQim(IM, MD, MB, MI, MS)     _ALUQim(X86_ADD, IM, MD, MB, MI, MS)
+
+#define ANDBrr(RS, RD)                 _ALUBrr(X86_AND, RS, RD)
+#define ANDBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_AND, MD, MB, MI, MS, RD)
+#define ANDBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_AND, RS, MD, MB, MI, MS)
+#define ANDBir(IM, RD)                 _ALUBir(X86_AND, IM, RD)
+#define ANDBim(IM, MD, MB, MI, MS)     _ALUBim(X86_AND, IM, MD, MB, MI, MS)
+
+#define ANDWrr(RS, RD)                 _ALUWrr(X86_AND, RS, RD)
+#define ANDWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_AND, MD, MB, MI, MS, RD)
+#define ANDWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_AND, RS, MD, MB, MI, MS)
+#define ANDWir(IM, RD)                 _ALUWir(X86_AND, IM, RD)
+#define ANDWim(IM, MD, MB, MI, MS)     _ALUWim(X86_AND, IM, MD, MB, MI, MS)
+
+#define ANDLrr(RS, RD)                 _ALULrr(X86_AND, RS, RD)
+#define ANDLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_AND, MD, MB, MI, MS, RD)
+#define ANDLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_AND, RS, MD, MB, MI, MS)
+#define ANDLir(IM, RD)                 _ALULir(X86_AND, IM, RD)
+#define ANDLim(IM, MD, MB, MI, MS)     _ALULim(X86_AND, IM, MD, MB, MI, MS)
+
+#define ANDQrr(RS, RD)                 _ALUQrr(X86_AND, RS, RD)
+#define ANDQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_AND, MD, MB, MI, MS, RD)
+#define ANDQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_AND, RS, MD, MB, MI, MS)
+#define ANDQir(IM, RD)                 _ALUQir(X86_AND, IM, RD)
+#define ANDQim(IM, MD, MB, MI, MS)     _ALUQim(X86_AND, IM, MD, MB, MI, MS)
+
+#define CMPBrr(RS, RD)                 _ALUBrr(X86_CMP, RS, RD)
+#define CMPBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_CMP, MD, MB, MI, MS, RD)
+#define CMPBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_CMP, RS, MD, MB, MI, MS)
+#define CMPBir(IM, RD)                 _ALUBir(X86_CMP, IM, RD)
+#define CMPBim(IM, MD, MB, MI, MS)     _ALUBim(X86_CMP, IM, MD, MB, MI, MS)
+
+#define CMPWrr(RS, RD)                 _ALUWrr(X86_CMP, RS, RD)
+#define CMPWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_CMP, MD, MB, MI, MS, RD)
+#define CMPWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_CMP, RS, MD, MB, MI, MS)
+#define CMPWir(IM, RD)                 _ALUWir(X86_CMP, IM, RD)
+#define CMPWim(IM, MD, MB, MI, MS)     _ALUWim(X86_CMP, IM, MD, MB, MI, MS)
+
+#define CMPLrr(RS, RD)                 _ALULrr(X86_CMP, RS, RD)
+#define CMPLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_CMP, MD, MB, MI, MS, RD)
+#define CMPLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_CMP, RS, MD, MB, MI, MS)
+#define CMPLir(IM, RD)                 _ALULir(X86_CMP, IM, RD)
+#define CMPLim(IM, MD, MB, MI, MS)     _ALULim(X86_CMP, IM, MD, MB, MI, MS)
+
+#define CMPQrr(RS, RD)                 _ALUQrr(X86_CMP, RS, RD)
+#define CMPQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_CMP, MD, MB, MI, MS, RD)
+#define CMPQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_CMP, RS, MD, MB, MI, MS)
+#define CMPQir(IM, RD)                 _ALUQir(X86_CMP, IM, RD)
+#define CMPQim(IM, MD, MB, MI, MS)     _ALUQim(X86_CMP, IM, MD, MB, MI, MS)
+
+#define ORBrr(RS, RD)                  _ALUBrr(X86_OR, RS, RD)
+#define ORBmr(MD, MB, MI, MS, RD)      _ALUBmr(X86_OR, MD, MB, MI, MS, RD)
+#define ORBrm(RS, MD, MB, MI, MS)      _ALUBrm(X86_OR, RS, MD, MB, MI, MS)
+#define ORBir(IM, RD)                  _ALUBir(X86_OR, IM, RD)
+#define ORBim(IM, MD, MB, MI, MS)      _ALUBim(X86_OR, IM, MD, MB, MI, MS)
+
+#define ORWrr(RS, RD)                  _ALUWrr(X86_OR, RS, RD)
+#define ORWmr(MD, MB, MI, MS, RD)      _ALUWmr(X86_OR, MD, MB, MI, MS, RD)
+#define ORWrm(RS, MD, MB, MI, MS)      _ALUWrm(X86_OR, RS, MD, MB, MI, MS)
+#define ORWir(IM, RD)                  _ALUWir(X86_OR, IM, RD)
+#define ORWim(IM, MD, MB, MI, MS)      _ALUWim(X86_OR, IM, MD, MB, MI, MS)
+
+#define ORLrr(RS, RD)                  _ALULrr(X86_OR, RS, RD)
+#define ORLmr(MD, MB, MI, MS, RD)      _ALULmr(X86_OR, MD, MB, MI, MS, RD)
+#define ORLrm(RS, MD, MB, MI, MS)      _ALULrm(X86_OR, RS, MD, MB, MI, MS)
+#define ORLir(IM, RD)                  _ALULir(X86_OR, IM, RD)
+#define ORLim(IM, MD, MB, MI, MS)      _ALULim(X86_OR, IM, MD, MB, MI, MS)
+
+#define ORQrr(RS, RD)                  _ALUQrr(X86_OR, RS, RD)
+#define ORQmr(MD, MB, MI, MS, RD)      _ALUQmr(X86_OR, MD, MB, MI, MS, RD)
+#define ORQrm(RS, MD, MB, MI, MS)      _ALUQrm(X86_OR, RS, MD, MB, MI, MS)
+#define ORQir(IM, RD)                  _ALUQir(X86_OR, IM, RD)
+#define ORQim(IM, MD, MB, MI, MS)      _ALUQim(X86_OR, IM, MD, MB, MI, MS)
+
+#define SBBBrr(RS, RD)                 _ALUBrr(X86_SBB, RS, RD)
+#define SBBBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_SBB, MD, MB, MI, MS, RD)
+#define SBBBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_SBB, RS, MD, MB, MI, MS)
+#define SBBBir(IM, RD)                 _ALUBir(X86_SBB, IM, RD)
+#define SBBBim(IM, MD, MB, MI, MS)     _ALUBim(X86_SBB, IM, MD, MB, MI, MS)
+
+#define SBBWrr(RS, RD)                 _ALUWrr(X86_SBB, RS, RD)
+#define SBBWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_SBB, MD, MB, MI, MS, RD)
+#define SBBWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_SBB, RS, MD, MB, MI, MS)
+#define SBBWir(IM, RD)                 _ALUWir(X86_SBB, IM, RD)
+#define SBBWim(IM, MD, MB, MI, MS)     _ALUWim(X86_SBB, IM, MD, MB, MI, MS)
+
+#define SBBLrr(RS, RD)                 _ALULrr(X86_SBB, RS, RD)
+#define SBBLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_SBB, MD, MB, MI, MS, RD)
+#define SBBLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_SBB, RS, MD, MB, MI, MS)
+#define SBBLir(IM, RD)                 _ALULir(X86_SBB, IM, RD)
+#define SBBLim(IM, MD, MB, MI, MS)     _ALULim(X86_SBB, IM, MD, MB, MI, MS)
+
+#define SBBQrr(RS, RD)                 _ALUQrr(X86_SBB, RS, RD)
+#define SBBQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_SBB, MD, MB, MI, MS, RD)
+#define SBBQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_SBB, RS, MD, MB, MI, MS)
+#define SBBQir(IM, RD)                 _ALUQir(X86_SBB, IM, RD)
+#define SBBQim(IM, MD, MB, MI, MS)     _ALUQim(X86_SBB, IM, MD, MB, MI, MS)
+
+#define SUBBrr(RS, RD)                 _ALUBrr(X86_SUB, RS, RD)
+#define SUBBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_SUB, MD, MB, MI, MS, RD)
+#define SUBBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_SUB, RS, MD, MB, MI, MS)
+#define SUBBir(IM, RD)                 _ALUBir(X86_SUB, IM, RD)
+#define SUBBim(IM, MD, MB, MI, MS)     _ALUBim(X86_SUB, IM, MD, MB, MI, MS)
+
+#define SUBWrr(RS, RD)                 _ALUWrr(X86_SUB, RS, RD)
+#define SUBWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_SUB, MD, MB, MI, MS, RD)
+#define SUBWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_SUB, RS, MD, MB, MI, MS)
+#define SUBWir(IM, RD)                 _ALUWir(X86_SUB, IM, RD)
+#define SUBWim(IM, MD, MB, MI, MS)     _ALUWim(X86_SUB, IM, MD, MB, MI, MS)
+
+#define SUBLrr(RS, RD)                 _ALULrr(X86_SUB, RS, RD)
+#define SUBLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_SUB, MD, MB, MI, MS, RD)
+#define SUBLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_SUB, RS, MD, MB, MI, MS)
+#define SUBLir(IM, RD)                 _ALULir(X86_SUB, IM, RD)
+#define SUBLim(IM, MD, MB, MI, MS)     _ALULim(X86_SUB, IM, MD, MB, MI, MS)
+
+#define SUBQrr(RS, RD)                 _ALUQrr(X86_SUB, RS, RD)
+#define SUBQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_SUB, MD, MB, MI, MS, RD)
+#define SUBQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_SUB, RS, MD, MB, MI, MS)
+#define SUBQir(IM, RD)                 _ALUQir(X86_SUB, IM, RD)
+#define SUBQim(IM, MD, MB, MI, MS)     _ALUQim(X86_SUB, IM, MD, MB, MI, MS)
+
+/* XOR family: one macro per size suffix (B, W, L, Q) and operand form
+ * (rr, mr, rm, ir, im).  Each forwards its arguments, prefixed with
+ * X86_XOR, to the _ALU* macro of the same size and form. */
+#define XORBrr(RS, RD)                 _ALUBrr(X86_XOR, RS, RD)
+#define XORBmr(MD, MB, MI, MS, RD)     _ALUBmr(X86_XOR, MD, MB, MI, MS, RD)
+#define XORBrm(RS, MD, MB, MI, MS)     _ALUBrm(X86_XOR, RS, MD, MB, MI, MS)
+#define XORBir(IM, RD)                 _ALUBir(X86_XOR, IM, RD)
+#define XORBim(IM, MD, MB, MI, MS)     _ALUBim(X86_XOR, IM, MD, MB, MI, MS)
+
+#define XORWrr(RS, RD)                 _ALUWrr(X86_XOR, RS, RD)
+#define XORWmr(MD, MB, MI, MS, RD)     _ALUWmr(X86_XOR, MD, MB, MI, MS, RD)
+#define XORWrm(RS, MD, MB, MI, MS)     _ALUWrm(X86_XOR, RS, MD, MB, MI, MS)
+#define XORWir(IM, RD)                 _ALUWir(X86_XOR, IM, RD)
+#define XORWim(IM, MD, MB, MI, MS)     _ALUWim(X86_XOR, IM, MD, MB, MI, MS)
+
+#define XORLrr(RS, RD)                 _ALULrr(X86_XOR, RS, RD)
+#define XORLmr(MD, MB, MI, MS, RD)     _ALULmr(X86_XOR, MD, MB, MI, MS, RD)
+#define XORLrm(RS, MD, MB, MI, MS)     _ALULrm(X86_XOR, RS, MD, MB, MI, MS)
+#define XORLir(IM, RD)                 _ALULir(X86_XOR, IM, RD)
+#define XORLim(IM, MD, MB, MI, MS)     _ALULim(X86_XOR, IM, MD, MB, MI, MS)
+
+#define XORQrr(RS, RD)                 _ALUQrr(X86_XOR, RS, RD)
+#define XORQmr(MD, MB, MI, MS, RD)     _ALUQmr(X86_XOR, MD, MB, MI, MS, RD)
+#define XORQrm(RS, MD, MB, MI, MS)     _ALUQrm(X86_XOR, RS, MD, MB, MI, MS)
+#define XORQir(IM, RD)                 _ALUQir(X86_XOR, IM, RD)
+#define XORQim(IM, MD, MB, MI, MS)     _ALUQim(X86_XOR, IM, MD, MB, MI, MS)
+
+
+/* --- Shift/Rotate instructions ------------------------------------------- */
 
+/* Operation selectors for the rotate/shift group.  The _ROTSHI* macros pass
+ * the selected value as the "r" argument of the _O_Mrm/_O_r_X emitters.
+ * Value 6 is not assigned here; the SAL* macros are aliases of SHL*. */
+enum {
+  X86_ROL = 0,
+  X86_ROR = 1,
+  X86_RCL = 2,
+  X86_RCR = 3,
+  X86_SHL = 4,
+  X86_SHR = 5,
+  X86_SAR = 7,
+};
 
-#define CALLsr(R)                      _O_Mrm  (0xff   ,_b11,_b010,_r4(R)      
                )
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+/*
+ * Byte-sized rotate/shift emitters.  OP is one of the X86_ROL..X86_SAR
+ * selectors and is handed to the emitter in its "r" slot.
+ *   ir/im  immediate count: a count equal to 1 uses opcode 0xd0 with no
+ *          immediate byte; any other count uses 0xc0 followed by _u8(IM).
+ *   rr/rm  count in a register (opcode 0xd2): RS must be _CL, otherwise
+ *          JITFAIL is invoked.
+ * IM and RS appear more than once in the expansion, so arguments with side
+ * effects are evaluated more than once.
+ * Each definition is kept on '\'-continued lines only; the archived copy had
+ * been re-wrapped in the middle of the emitter call.
+ */
+#define _ROTSHIBir(OP,IM,RD)           ((IM) == 1 ? \
+        (_REXBrr(0, RD), _O_Mrm(0xd0, _b11, OP, _r1(RD))) : \
+        (_REXBrr(0, RD), _O_Mrm_B(0xc0, _b11, OP, _r1(RD), _u8(IM))))
+#define _ROTSHIBim(OP,IM,MD,MB,MI,MS)  ((IM) == 1 ? \
+        (_REXBrm(0, MB, MI), _O_r_X(0xd0, OP, MD, MB, MI, MS)) : \
+        (_REXBrm(0, MB, MI), _O_r_X_B(0xc0, OP, MD, MB, MI, MS, _u8(IM))))
+#define _ROTSHIBrr(OP,RS,RD)           (((RS) == _CL) ? \
+        (_REXBrr(RS, RD), _O_Mrm(0xd2, _b11, OP, _r1(RD))) : \
+        JITFAIL("source register must be CL"))
+#define _ROTSHIBrm(OP,RS,MD,MB,MI,MS)  (((RS) == _CL) ? \
+        (_REXBrm(RS, MB, MI), _O_r_X(0xd2, OP, MD, MB, MI, MS)) : \
+        JITFAIL("source register must be CL"))
+
+/*
+ * W-sized rotate/shift emitters: same shape as the byte forms but each
+ * expansion starts with _d16() and uses opcodes 0xd1 (count == 1),
+ * 0xc1 (other immediate counts, followed by _u8(IM)) and 0xd3 (count in
+ * _CL).  A register count other than _CL invokes JITFAIL.
+ * Each definition is kept on '\'-continued lines only; the archived copy had
+ * been re-wrapped in the middle of the emitter call.
+ */
+#define _ROTSHIWir(OP,IM,RD)           ((IM) == 1 ? \
+        (_d16(), _REXLrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r2(RD))) : \
+        (_d16(), _REXLrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r2(RD), _u8(IM))))
+#define _ROTSHIWim(OP,IM,MD,MB,MI,MS)  ((IM) == 1 ? \
+        (_d16(), _REXLrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
+        (_d16(), _REXLrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
+#define _ROTSHIWrr(OP,RS,RD)           (((RS) == _CL) ? \
+        (_d16(), _REXLrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r2(RD))) : \
+        JITFAIL("source register must be CL"))
+#define _ROTSHIWrm(OP,RS,MD,MB,MI,MS)  (((RS) == _CL) ? \
+        (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
+        JITFAIL("source register must be CL"))
+
+/*
+ * L-sized rotate/shift emitters.  Opcodes: 0xd1 when the immediate count is
+ * 1, 0xc1 plus _u8(IM) for any other immediate count, 0xd3 when the count
+ * is in _CL.  A register count other than _CL invokes JITFAIL.
+ * Each definition is kept on '\'-continued lines only; the archived copy had
+ * been re-wrapped in the middle of the emitter call.
+ */
+#define _ROTSHILir(OP,IM,RD)           ((IM) == 1 ? \
+        (_REXLrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r4(RD))) : \
+        (_REXLrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r4(RD), _u8(IM))))
+#define _ROTSHILim(OP,IM,MD,MB,MI,MS)  ((IM) == 1 ? \
+        (_REXLrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
+        (_REXLrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
+#define _ROTSHILrr(OP,RS,RD)           (((RS) == _CL) ? \
+        (_REXLrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r4(RD))) : \
+        JITFAIL("source register must be CL"))
+#define _ROTSHILrm(OP,RS,MD,MB,MI,MS)  (((RS) == _CL) ? \
+        (_REXLrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
+        JITFAIL("source register must be CL"))
+
+/*
+ * Q-sized rotate/shift emitters: identical opcodes to the L forms (0xd1,
+ * 0xc1, 0xd3) but prefixed through the _REXQ* macros and using _r8() for
+ * the register operand.  A register count other than _CL invokes JITFAIL.
+ * Each definition is kept on '\'-continued lines only; the archived copy had
+ * been re-wrapped in the middle of the emitter call.
+ */
+#define _ROTSHIQir(OP,IM,RD)           ((IM) == 1 ? \
+        (_REXQrr(0, RD), _O_Mrm(0xd1, _b11, OP, _r8(RD))) : \
+        (_REXQrr(0, RD), _O_Mrm_B(0xc1, _b11, OP, _r8(RD), _u8(IM))))
+#define _ROTSHIQim(OP,IM,MD,MB,MI,MS)  ((IM) == 1 ? \
+        (_REXQrm(0, MB, MI), _O_r_X(0xd1, OP, MD, MB, MI, MS)) : \
+        (_REXQrm(0, MB, MI), _O_r_X_B(0xc1, OP, MD, MB, MI, MS, _u8(IM))))
+#define _ROTSHIQrr(OP,RS,RD)           (((RS) == _CL) ? \
+        (_REXQrr(RS, RD), _O_Mrm(0xd3, _b11, OP, _r8(RD))) : \
+        JITFAIL("source register must be CL"))
+#define _ROTSHIQrm(OP,RS,MD,MB,MI,MS)  (((RS) == _CL) ? \
+        (_REXQrm(RS, MB, MI), _O_r_X(0xd3, OP, MD, MB, MI, MS)) : \
+        JITFAIL("source register must be CL"))
+
+/* ROL: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_ROL.  ir/im take an immediate count, rr/rm a count register. */
+#define ROLBir(IM, RD)                 _ROTSHIBir(X86_ROL, IM, RD)
+#define ROLBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_ROL, IM, MD, MB, MI, MS)
+#define ROLBrr(RS, RD)                 _ROTSHIBrr(X86_ROL, RS, RD)
+#define ROLBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_ROL, RS, MD, MB, MI, MS)
+
+#define ROLWir(IM, RD)                 _ROTSHIWir(X86_ROL, IM, RD)
+#define ROLWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_ROL, IM, MD, MB, MI, MS)
+#define ROLWrr(RS, RD)                 _ROTSHIWrr(X86_ROL, RS, RD)
+#define ROLWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_ROL, RS, MD, MB, MI, MS)
+
+#define ROLLir(IM, RD)                 _ROTSHILir(X86_ROL, IM, RD)
+#define ROLLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_ROL, IM, MD, MB, MI, MS)
+#define ROLLrr(RS, RD)                 _ROTSHILrr(X86_ROL, RS, RD)
+#define ROLLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_ROL, RS, MD, MB, MI, MS)
+
+#define ROLQir(IM, RD)                 _ROTSHIQir(X86_ROL, IM, RD)
+#define ROLQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_ROL, IM, MD, MB, MI, MS)
+#define ROLQrr(RS, RD)                 _ROTSHIQrr(X86_ROL, RS, RD)
+#define ROLQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_ROL, RS, MD, MB, MI, MS)
+
+/* ROR: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_ROR. */
+#define RORBir(IM, RD)                 _ROTSHIBir(X86_ROR, IM, RD)
+#define RORBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_ROR, IM, MD, MB, MI, MS)
+#define RORBrr(RS, RD)                 _ROTSHIBrr(X86_ROR, RS, RD)
+#define RORBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_ROR, RS, MD, MB, MI, MS)
+
+#define RORWir(IM, RD)                 _ROTSHIWir(X86_ROR, IM, RD)
+#define RORWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_ROR, IM, MD, MB, MI, MS)
+#define RORWrr(RS, RD)                 _ROTSHIWrr(X86_ROR, RS, RD)
+#define RORWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_ROR, RS, MD, MB, MI, MS)
+
+#define RORLir(IM, RD)                 _ROTSHILir(X86_ROR, IM, RD)
+#define RORLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_ROR, IM, MD, MB, MI, MS)
+#define RORLrr(RS, RD)                 _ROTSHILrr(X86_ROR, RS, RD)
+#define RORLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_ROR, RS, MD, MB, MI, MS)
+
+#define RORQir(IM, RD)                 _ROTSHIQir(X86_ROR, IM, RD)
+#define RORQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_ROR, IM, MD, MB, MI, MS)
+#define RORQrr(RS, RD)                 _ROTSHIQrr(X86_ROR, RS, RD)
+#define RORQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_ROR, RS, MD, MB, MI, MS)
+
+/* RCL: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_RCL. */
+#define RCLBir(IM, RD)                 _ROTSHIBir(X86_RCL, IM, RD)
+#define RCLBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_RCL, IM, MD, MB, MI, MS)
+#define RCLBrr(RS, RD)                 _ROTSHIBrr(X86_RCL, RS, RD)
+#define RCLBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_RCL, RS, MD, MB, MI, MS)
+
+#define RCLWir(IM, RD)                 _ROTSHIWir(X86_RCL, IM, RD)
+#define RCLWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_RCL, IM, MD, MB, MI, MS)
+#define RCLWrr(RS, RD)                 _ROTSHIWrr(X86_RCL, RS, RD)
+#define RCLWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_RCL, RS, MD, MB, MI, MS)
+
+#define RCLLir(IM, RD)                 _ROTSHILir(X86_RCL, IM, RD)
+#define RCLLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_RCL, IM, MD, MB, MI, MS)
+#define RCLLrr(RS, RD)                 _ROTSHILrr(X86_RCL, RS, RD)
+#define RCLLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_RCL, RS, MD, MB, MI, MS)
+
+#define RCLQir(IM, RD)                 _ROTSHIQir(X86_RCL, IM, RD)
+#define RCLQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_RCL, IM, MD, MB, MI, MS)
+#define RCLQrr(RS, RD)                 _ROTSHIQrr(X86_RCL, RS, RD)
+#define RCLQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_RCL, RS, MD, MB, MI, MS)
+
+/* RCR: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_RCR. */
+#define RCRBir(IM, RD)                 _ROTSHIBir(X86_RCR, IM, RD)
+#define RCRBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_RCR, IM, MD, MB, MI, MS)
+#define RCRBrr(RS, RD)                 _ROTSHIBrr(X86_RCR, RS, RD)
+#define RCRBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_RCR, RS, MD, MB, MI, MS)
+
+#define RCRWir(IM, RD)                 _ROTSHIWir(X86_RCR, IM, RD)
+#define RCRWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_RCR, IM, MD, MB, MI, MS)
+#define RCRWrr(RS, RD)                 _ROTSHIWrr(X86_RCR, RS, RD)
+#define RCRWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_RCR, RS, MD, MB, MI, MS)
+
+#define RCRLir(IM, RD)                 _ROTSHILir(X86_RCR, IM, RD)
+#define RCRLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_RCR, IM, MD, MB, MI, MS)
+#define RCRLrr(RS, RD)                 _ROTSHILrr(X86_RCR, RS, RD)
+#define RCRLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_RCR, RS, MD, MB, MI, MS)
+
+#define RCRQir(IM, RD)                 _ROTSHIQir(X86_RCR, IM, RD)
+#define RCRQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_RCR, IM, MD, MB, MI, MS)
+#define RCRQrr(RS, RD)                 _ROTSHIQrr(X86_RCR, RS, RD)
+#define RCRQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_RCR, RS, MD, MB, MI, MS)
+
+/* SHL: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_SHL.  The SAL* names are defined as aliases of these. */
+#define SHLBir(IM, RD)                 _ROTSHIBir(X86_SHL, IM, RD)
+#define SHLBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_SHL, IM, MD, MB, MI, MS)
+#define SHLBrr(RS, RD)                 _ROTSHIBrr(X86_SHL, RS, RD)
+#define SHLBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_SHL, RS, MD, MB, MI, MS)
+
+#define SHLWir(IM, RD)                 _ROTSHIWir(X86_SHL, IM, RD)
+#define SHLWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_SHL, IM, MD, MB, MI, MS)
+#define SHLWrr(RS, RD)                 _ROTSHIWrr(X86_SHL, RS, RD)
+#define SHLWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_SHL, RS, MD, MB, MI, MS)
+
+#define SHLLir(IM, RD)                 _ROTSHILir(X86_SHL, IM, RD)
+#define SHLLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_SHL, IM, MD, MB, MI, MS)
+#define SHLLrr(RS, RD)                 _ROTSHILrr(X86_SHL, RS, RD)
+#define SHLLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_SHL, RS, MD, MB, MI, MS)
+
+#define SHLQir(IM, RD)                 _ROTSHIQir(X86_SHL, IM, RD)
+#define SHLQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_SHL, IM, MD, MB, MI, MS)
+#define SHLQrr(RS, RD)                 _ROTSHIQrr(X86_SHL, RS, RD)
+#define SHLQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_SHL, RS, MD, MB, MI, MS)
+
+/* SHR: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_SHR. */
+#define SHRBir(IM, RD)                 _ROTSHIBir(X86_SHR, IM, RD)
+#define SHRBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_SHR, IM, MD, MB, MI, MS)
+#define SHRBrr(RS, RD)                 _ROTSHIBrr(X86_SHR, RS, RD)
+#define SHRBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_SHR, RS, MD, MB, MI, MS)
+
+#define SHRWir(IM, RD)                 _ROTSHIWir(X86_SHR, IM, RD)
+#define SHRWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_SHR, IM, MD, MB, MI, MS)
+#define SHRWrr(RS, RD)                 _ROTSHIWrr(X86_SHR, RS, RD)
+#define SHRWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_SHR, RS, MD, MB, MI, MS)
+
+#define SHRLir(IM, RD)                 _ROTSHILir(X86_SHR, IM, RD)
+#define SHRLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_SHR, IM, MD, MB, MI, MS)
+#define SHRLrr(RS, RD)                 _ROTSHILrr(X86_SHR, RS, RD)
+#define SHRLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_SHR, RS, MD, MB, MI, MS)
+
+#define SHRQir(IM, RD)                 _ROTSHIQir(X86_SHR, IM, RD)
+#define SHRQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_SHR, IM, MD, MB, MI, MS)
+#define SHRQrr(RS, RD)                 _ROTSHIQrr(X86_SHR, RS, RD)
+#define SHRQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_SHR, RS, MD, MB, MI, MS)
+
+/* SAL: object-like aliases, so every SAL* name expands to the SHL* macro of
+ * the same size and operand form and takes exactly the same arguments. */
+#define SALBir                         SHLBir
+#define SALBim                         SHLBim
+#define SALBrr                         SHLBrr
+#define SALBrm                         SHLBrm
+
+#define SALWir                         SHLWir
+#define SALWim                         SHLWim
+#define SALWrr                         SHLWrr
+#define SALWrm                         SHLWrm
+
+#define SALLir                         SHLLir
+#define SALLim                         SHLLim
+#define SALLrr                         SHLLrr
+#define SALLrm                         SHLLrm
+
+#define SALQir                         SHLQir
+#define SALQim                         SHLQim
+#define SALQrr                         SHLQrr
+#define SALQrm                         SHLQrm
+
+/* SAR: forwards to the _ROTSHI{B,W,L,Q}{ir,im,rr,rm} macros with
+ * OP = X86_SAR. */
+#define SARBir(IM, RD)                 _ROTSHIBir(X86_SAR, IM, RD)
+#define SARBim(IM, MD, MB, MI, MS)     _ROTSHIBim(X86_SAR, IM, MD, MB, MI, MS)
+#define SARBrr(RS, RD)                 _ROTSHIBrr(X86_SAR, RS, RD)
+#define SARBrm(RS, MD, MB, MI, MS)     _ROTSHIBrm(X86_SAR, RS, MD, MB, MI, MS)
+
+#define SARWir(IM, RD)                 _ROTSHIWir(X86_SAR, IM, RD)
+#define SARWim(IM, MD, MB, MI, MS)     _ROTSHIWim(X86_SAR, IM, MD, MB, MI, MS)
+#define SARWrr(RS, RD)                 _ROTSHIWrr(X86_SAR, RS, RD)
+#define SARWrm(RS, MD, MB, MI, MS)     _ROTSHIWrm(X86_SAR, RS, MD, MB, MI, MS)
+
+#define SARLir(IM, RD)                 _ROTSHILir(X86_SAR, IM, RD)
+#define SARLim(IM, MD, MB, MI, MS)     _ROTSHILim(X86_SAR, IM, MD, MB, MI, MS)
+#define SARLrr(RS, RD)                 _ROTSHILrr(X86_SAR, RS, RD)
+#define SARLrm(RS, MD, MB, MI, MS)     _ROTSHILrm(X86_SAR, RS, MD, MB, MI, MS)
+
+#define SARQir(IM, RD)                 _ROTSHIQir(X86_SAR, IM, RD)
+#define SARQim(IM, MD, MB, MI, MS)     _ROTSHIQim(X86_SAR, IM, MD, MB, MI, MS)
+#define SARQrr(RS, RD)                 _ROTSHIQrr(X86_SAR, RS, RD)
+#define SARQrm(RS, MD, MB, MI, MS)     _ROTSHIQrm(X86_SAR, RS, MD, MB, MI, MS)
+
+
+/* --- Bit test instructions ----------------------------------------------- */
 
-#define CALLsm(D,B,I,S)                        _O_r_X  (0xff        ,_b010     
,(int)(D),B,I,S         )
+/* Operation selectors for the bit-test group.  The _BT?ir/_BT?im macros pass
+ * the value as the "r" argument of the emitter; the _BT?rr/_BT?rm macros
+ * fold it into the opcode as 0x0f83 | (OP << 3). */
+enum {
+  X86_BT  = 4,
+  X86_BTS = 5,
+  X86_BTR = 6,
+  X86_BTC = 7,
+};
 
-#define CBW_()                         _O              (0x98                   
                                        )
-#define CLC_()                         _O              (0xf8                   
                                        )
-#define CLTD_()                                _O              (0x99           
                                                )
-#define CMC_()                         _O              (0xf5                   
                                        )
+/*                                                                     _format         Opcd             ,Mod ,r      ,m        ,mem=dsp+sib    ,imm... */
+
+/*
+ * W-sized bit-test emitters; every expansion starts with _d16().
+ *   ir/im  bit index is an immediate: opcode 0x0fba, OP in the "r" slot,
+ *          index appended as _u8(IM).
+ *   rr/rm  bit index is in register RS: opcode 0x0f83 | (OP << 3), which
+ *          gives 0x0fa3/0x0fab/0x0fb3/0x0fbb for OP = 4/5/6/7.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define _BTWir(OP, IM, RD)             (_d16(), _REXLrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r2(RD), _u8(IM)))
+#define _BTWim(OP, IM, MD, MB, MI, MS) (_d16(), _REXLrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
+#define _BTWrr(OP, RS, RD)             (_d16(), _REXLrr(RS, RD), _OO_Mrm(0x0f83|((OP)<<3), _b11, _r2(RS), _r2(RD)))
+#define _BTWrm(OP, RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI), _OO_r_X(0x0f83|((OP)<<3), _r2(RS), MD, MB, MI, MS))
+
+/*
+ * L-sized bit-test emitters.
+ *   ir/im  opcode 0x0fba with OP in the "r" slot and _u8(IM) appended.
+ *   rr/rm  opcode 0x0f83 | (OP << 3) with RS in the "r" slot.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define _BTLir(OP, IM, RD)             (_REXLrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r4(RD), _u8(IM)))
+#define _BTLim(OP, IM, MD, MB, MI, MS) (_REXLrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
+#define _BTLrr(OP, RS, RD)             (_REXLrr(RS, RD), _OO_Mrm(0x0f83|((OP)<<3), _b11, _r4(RS), _r4(RD)))
+#define _BTLrm(OP, RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI), _OO_r_X(0x0f83|((OP)<<3), _r4(RS), MD, MB, MI, MS))
+
+/*
+ * Q-sized bit-test emitters: same opcodes as the L forms, prefixed through
+ * the _REXQ* macros and using _r8() for register operands.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define _BTQir(OP, IM, RD)             (_REXQrr(0, RD), _OO_Mrm_B(0x0fba, _b11, OP, _r8(RD), _u8(IM)))
+#define _BTQim(OP, IM, MD, MB, MI, MS) (_REXQrm(0, MB, MI), _OO_r_X_B(0x0fba, OP, MD, MB, MI, MS, _u8(IM)))
+#define _BTQrr(OP, RS, RD)             (_REXQrr(RS, RD), _OO_Mrm(0x0f83|((OP)<<3), _b11, _r8(RS), _r8(RD)))
+#define _BTQrm(OP, RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI), _OO_r_X(0x0f83|((OP)<<3), _r8(RS), MD, MB, MI, MS))
+
+/* BT: forwards to the _BT{W,L,Q}{ir,im,rr,rm} macros with OP = X86_BT.
+ * BTWim now passes MB through; it previously omitted it and so supplied
+ * only five arguments to the six-parameter _BTWim. */
+#define BTWir(IM, RD)                  _BTWir(X86_BT, IM, RD)
+#define BTWim(IM, MD, MB, MI, MS)      _BTWim(X86_BT, IM, MD, MB, MI, MS)
+#define BTWrr(RS, RD)                  _BTWrr(X86_BT, RS, RD)
+#define BTWrm(RS, MD, MB, MI, MS)      _BTWrm(X86_BT, RS, MD, MB, MI, MS)
+
+#define BTLir(IM, RD)                  _BTLir(X86_BT, IM, RD)
+#define BTLim(IM, MD, MB, MI, MS)      _BTLim(X86_BT, IM, MD, MB, MI, MS)
+#define BTLrr(RS, RD)                  _BTLrr(X86_BT, RS, RD)
+#define BTLrm(RS, MD, MB, MI, MS)      _BTLrm(X86_BT, RS, MD, MB, MI, MS)
+
+#define BTQir(IM, RD)                  _BTQir(X86_BT, IM, RD)
+#define BTQim(IM, MD, MB, MI, MS)      _BTQim(X86_BT, IM, MD, MB, MI, MS)
+#define BTQrr(RS, RD)                  _BTQrr(X86_BT, RS, RD)
+#define BTQrm(RS, MD, MB, MI, MS)      _BTQrm(X86_BT, RS, MD, MB, MI, MS)
+
+/* BTC: forwards to the _BT{W,L,Q}{ir,im,rr,rm} macros with OP = X86_BTC.
+ * BTCWim now passes MB through; it previously omitted it and so supplied
+ * only five arguments to the six-parameter _BTWim. */
+#define BTCWir(IM, RD)                 _BTWir(X86_BTC, IM, RD)
+#define BTCWim(IM, MD, MB, MI, MS)     _BTWim(X86_BTC, IM, MD, MB, MI, MS)
+#define BTCWrr(RS, RD)                 _BTWrr(X86_BTC, RS, RD)
+#define BTCWrm(RS, MD, MB, MI, MS)     _BTWrm(X86_BTC, RS, MD, MB, MI, MS)
+
+#define BTCLir(IM, RD)                 _BTLir(X86_BTC, IM, RD)
+#define BTCLim(IM, MD, MB, MI, MS)     _BTLim(X86_BTC, IM, MD, MB, MI, MS)
+#define BTCLrr(RS, RD)                 _BTLrr(X86_BTC, RS, RD)
+#define BTCLrm(RS, MD, MB, MI, MS)     _BTLrm(X86_BTC, RS, MD, MB, MI, MS)
+
+#define BTCQir(IM, RD)                 _BTQir(X86_BTC, IM, RD)
+#define BTCQim(IM, MD, MB, MI, MS)     _BTQim(X86_BTC, IM, MD, MB, MI, MS)
+#define BTCQrr(RS, RD)                 _BTQrr(X86_BTC, RS, RD)
+#define BTCQrm(RS, MD, MB, MI, MS)     _BTQrm(X86_BTC, RS, MD, MB, MI, MS)
+
+/* BTR: forwards to the _BT{W,L,Q}{ir,im,rr,rm} macros with OP = X86_BTR.
+ * BTRWim now passes MB through; it previously omitted it and so supplied
+ * only five arguments to the six-parameter _BTWim. */
+#define BTRWir(IM, RD)                 _BTWir(X86_BTR, IM, RD)
+#define BTRWim(IM, MD, MB, MI, MS)     _BTWim(X86_BTR, IM, MD, MB, MI, MS)
+#define BTRWrr(RS, RD)                 _BTWrr(X86_BTR, RS, RD)
+#define BTRWrm(RS, MD, MB, MI, MS)     _BTWrm(X86_BTR, RS, MD, MB, MI, MS)
+
+#define BTRLir(IM, RD)                 _BTLir(X86_BTR, IM, RD)
+#define BTRLim(IM, MD, MB, MI, MS)     _BTLim(X86_BTR, IM, MD, MB, MI, MS)
+#define BTRLrr(RS, RD)                 _BTLrr(X86_BTR, RS, RD)
+#define BTRLrm(RS, MD, MB, MI, MS)     _BTLrm(X86_BTR, RS, MD, MB, MI, MS)
+
+#define BTRQir(IM, RD)                 _BTQir(X86_BTR, IM, RD)
+#define BTRQim(IM, MD, MB, MI, MS)     _BTQim(X86_BTR, IM, MD, MB, MI, MS)
+#define BTRQrr(RS, RD)                 _BTQrr(X86_BTR, RS, RD)
+#define BTRQrm(RS, MD, MB, MI, MS)     _BTQrm(X86_BTR, RS, MD, MB, MI, MS)
+
+/* BTS: forwards to the _BT{W,L,Q}{ir,im,rr,rm} macros with OP = X86_BTS.
+ * BTSWim now passes MB through; it previously omitted it and so supplied
+ * only five arguments to the six-parameter _BTWim. */
+#define BTSWir(IM, RD)                 _BTWir(X86_BTS, IM, RD)
+#define BTSWim(IM, MD, MB, MI, MS)     _BTWim(X86_BTS, IM, MD, MB, MI, MS)
+#define BTSWrr(RS, RD)                 _BTWrr(X86_BTS, RS, RD)
+#define BTSWrm(RS, MD, MB, MI, MS)     _BTWrm(X86_BTS, RS, MD, MB, MI, MS)
+
+#define BTSLir(IM, RD)                 _BTLir(X86_BTS, IM, RD)
+#define BTSLim(IM, MD, MB, MI, MS)     _BTLim(X86_BTS, IM, MD, MB, MI, MS)
+#define BTSLrr(RS, RD)                 _BTLrr(X86_BTS, RS, RD)
+#define BTSLrm(RS, MD, MB, MI, MS)     _BTLrm(X86_BTS, RS, MD, MB, MI, MS)
+
+#define BTSQir(IM, RD)                 _BTQir(X86_BTS, IM, RD)
+#define BTSQim(IM, MD, MB, MI, MS)     _BTQim(X86_BTS, IM, MD, MB, MI, MS)
+#define BTSQrr(RS, RD)                 _BTQrr(X86_BTS, RS, RD)
+#define BTSQrm(RS, MD, MB, MI, MS)     _BTQrm(X86_BTS, RS, MD, MB, MI, MS)
+
+
+/* --- Move instructions --------------------------------------------------- */
+
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+/*
+ * Byte moves.  rr/rm use opcode 0x88 with the source register in the "r"
+ * slot; mr uses 0x8a with the destination register in the "r" slot; ir uses
+ * the 0xb0+register form; im uses 0xc6.  Immediates go through _su8().
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define MOVBrr(RS, RD)                 (_REXBrr(RS, RD), _O_Mrm(0x88, _b11, _r1(RS), _r1(RD)))
+#define MOVBmr(MD, MB, MI, MS, RD)     (_REXBmr(MB, MI, RD), _O_r_X(0x8a, _r1(RD), MD, MB, MI, MS))
+#define MOVBrm(RS, MD, MB, MI, MS)     (_REXBrm(RS, MB, MI), _O_r_X(0x88, _r1(RS), MD, MB, MI, MS))
+#define MOVBir(IM,  R)                 (_REXBrr(0, R), _Or_B(0xb0, _r1(R), _su8(IM)))
+#define MOVBim(IM, MD, MB, MI, MS)     (_REXBrm(0, MB, MI), _O_X_B(0xc6, MD, MB, MI, MS, _su8(IM)))
+
+/*
+ * W-sized moves; every expansion starts with _d16().  Opcodes: 0x89 (rr, rm),
+ * 0x8b (mr), 0xb8+register (ir), 0xc7 (im).  Immediates go through _su16().
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define MOVWrr(RS, RD)                 (_d16(), _REXLrr(RS, RD), _O_Mrm(0x89, _b11, _r2(RS), _r2(RD)))
+#define MOVWmr(MD, MB, MI, MS, RD)     (_d16(), _REXLmr(MB, MI, RD), _O_r_X(0x8b, _r2(RD), MD, MB, MI, MS))
+#define MOVWrm(RS, MD, MB, MI, MS)     (_d16(), _REXLrm(RS, MB, MI), _O_r_X(0x89, _r2(RS), MD, MB, MI, MS))
+#define MOVWir(IM,  R)                 (_d16(), _REXLrr(0, R), _Or_W(0xb8, _r2(R), _su16(IM)))
+#define MOVWim(IM, MD, MB, MI, MS)     (_d16(), _REXLrm(0, MB, MI), _O_X_W(0xc7, MD, MB, MI, MS, _su16(IM)))
+
+/*
+ * L-sized moves.  Opcodes: 0x89 (rr, rm), 0x8b (mr), 0xb8+register (ir),
+ * 0xc7 (im).  IM is passed to the emitter as given, without a range check
+ * in this macro.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define MOVLrr(RS, RD)                 (_REXLrr(RS, RD), _O_Mrm(0x89, _b11, _r4(RS), _r4(RD)))
+#define MOVLmr(MD, MB, MI, MS, RD)     (_REXLmr(MB, MI, RD), _O_r_X(0x8b, _r4(RD), MD, MB, MI, MS))
+#define MOVLrm(RS, MD, MB, MI, MS)     (_REXLrm(RS, MB, MI), _O_r_X(0x89, _r4(RS), MD, MB, MI, MS))
+#define MOVLir(IM,  R)                 (_REXLrr(0, R), _Or_L(0xb8, _r4(R), IM))
+#define MOVLim(IM, MD, MB, MI, MS)     (_REXLrm(0, MB, MI), _O_X_L(0xc7, MD, MB, MI, MS, IM))
+
+/*
+ * Q-sized moves: same opcodes as the L forms, prefixed through the _REXQ*
+ * macros.  MOVQir emits its immediate with _Or_Q, whereas MOVQim uses the
+ * same _O_X_L emitter as MOVLim -- presumably a 32-bit immediate that the
+ * CPU sign-extends; confirm against the _O_X_L definition.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define MOVQrr(RS, RD)                 (_REXQrr(RS, RD), _O_Mrm(0x89, _b11, _r8(RS), _r8(RD)))
+#define MOVQmr(MD, MB, MI, MS, RD)     (_REXQmr(MB, MI, RD), _O_r_X(0x8b, _r8(RD), MD, MB, MI, MS))
+#define MOVQrm(RS, MD, MB, MI, MS)     (_REXQrm(RS, MB, MI), _O_r_X(0x89, _r8(RS), MD, MB, MI, MS))
+#define MOVQir(IM,  R)                 (_REXQrr(0, R), _Or_Q(0xb8, _r8(R), IM))
+#define MOVQim(IM, MD, MB, MI, MS)     (_REXQrm(0, MB, MI), _O_X_L(0xc7, MD, MB, MI, MS, IM))
+
+
+/* --- Unary and Multiply/Divide instructions ------------------------------ */
 
+/* Operation selectors for the one-operand group.  The _UNARY* macros pass
+ * the value as the "r" argument of the _O_Mrm/_O_r_X emitters alongside
+ * opcode 0xf6 or 0xf7.  Values 0 and 1 are not assigned here. */
+enum {
+  X86_NOT  = 2,
+  X86_NEG  = 3,
+  X86_MUL  = 4,
+  X86_IMUL = 5,
+  X86_DIV  = 6,
+  X86_IDIV = 7,
+};
 
-#define CMPBrr(RS, RD)                 _O_Mrm          (0x38           
,_b11,_r1(RS),_r1(RD)                           )
-#define CMPBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x3a                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define CMPBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x38                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define CMPBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b111  ,_r1(RD)                   ,_su8(IM))
-#define CMPBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b111             ,MD,MB,MI,MS    ,_su8(IM))
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+/*
+ * One-operand emitters shared by NOT/NEG/MUL/IMUL/DIV/IDIV.  OP is one of
+ * the X86_NOT..X86_IDIV selectors and goes in the "r" slot; the operand is
+ * either register RS ("r" forms) or the memory operand MD/MB/MI/MS ("m"
+ * forms).  Byte forms use opcode 0xf6, all other sizes 0xf7; W forms start
+ * with _d16().
+ * NOTE(review): the W/L/Q memory forms build their prefix with
+ * _REX?mr(MB, MI, 0) while the byte form uses _REXBrm(0, MB, MI); both are
+ * kept as found -- confirm the two spellings are equivalent.
+ * Each definition is kept on a single line; the archived copy had been
+ * re-wrapped in the middle of the emitter call.
+ */
+#define _UNARYBr(OP, RS)               (_REXBrr(0, RS), _O_Mrm(0xf6, _b11, OP, _r1(RS)))
+#define _UNARYBm(OP, MD, MB, MI, MS)   (_REXBrm(0, MB, MI), _O_r_X(0xf6, OP, MD, MB, MI, MS))
+#define _UNARYWr(OP, RS)               (_d16(), _REXLrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r2(RS)))
+#define _UNARYWm(OP, MD, MB, MI, MS)   (_d16(), _REXLmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))
+#define _UNARYLr(OP, RS)               (_REXLrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r4(RS)))
+#define _UNARYLm(OP, MD, MB, MI, MS)   (_REXLmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))
+#define _UNARYQr(OP, RS)               (_REXQrr(0, RS), _O_Mrm(0xf7, _b11, OP, _r8(RS)))
+#define _UNARYQm(OP, MD, MB, MI, MS)   (_REXQmr(MB, MI, 0), _O_r_X(0xf7, OP, MD, MB, MI, MS))
+
+/* NOT: register (r) and memory (m) forms for each size suffix, forwarding
+ * to the matching _UNARY* macro with OP = X86_NOT. */
+#define NOTBr(RS)                      _UNARYBr(X86_NOT, RS)
+#define NOTBm(MD, MB, MI, MS)          _UNARYBm(X86_NOT, MD, MB, MI, MS)
+#define NOTWr(RS)                      _UNARYWr(X86_NOT, RS)
+#define NOTWm(MD, MB, MI, MS)          _UNARYWm(X86_NOT, MD, MB, MI, MS)
+#define NOTLr(RS)                      _UNARYLr(X86_NOT, RS)
+#define NOTLm(MD, MB, MI, MS)          _UNARYLm(X86_NOT, MD, MB, MI, MS)
+#define NOTQr(RS)                      _UNARYQr(X86_NOT, RS)
+#define NOTQm(MD, MB, MI, MS)          _UNARYQm(X86_NOT, MD, MB, MI, MS)
+
+/* NEG: register (r) and memory (m) forms for each size suffix, forwarding
+ * to the matching _UNARY* macro with OP = X86_NEG. */
+#define NEGBr(RS)                      _UNARYBr(X86_NEG, RS)
+#define NEGBm(MD, MB, MI, MS)          _UNARYBm(X86_NEG, MD, MB, MI, MS)
+#define NEGWr(RS)                      _UNARYWr(X86_NEG, RS)
+#define NEGWm(MD, MB, MI, MS)          _UNARYWm(X86_NEG, MD, MB, MI, MS)
+#define NEGLr(RS)                      _UNARYLr(X86_NEG, RS)
+#define NEGLm(MD, MB, MI, MS)          _UNARYLm(X86_NEG, MD, MB, MI, MS)
+#define NEGQr(RS)                      _UNARYQr(X86_NEG, RS)
+#define NEGQm(MD, MB, MI, MS)          _UNARYQm(X86_NEG, MD, MB, MI, MS)
+
+/* MUL: register (r) and memory (m) forms for each size suffix, forwarding
+ * to the matching _UNARY* macro with OP = X86_MUL. */
+#define MULBr(RS)                      _UNARYBr(X86_MUL, RS)
+#define MULBm(MD, MB, MI, MS)          _UNARYBm(X86_MUL, MD, MB, MI, MS)
+#define MULWr(RS)                      _UNARYWr(X86_MUL, RS)
+#define MULWm(MD, MB, MI, MS)          _UNARYWm(X86_MUL, MD, MB, MI, MS)
+#define MULLr(RS)                      _UNARYLr(X86_MUL, RS)
+#define MULLm(MD, MB, MI, MS)          _UNARYLm(X86_MUL, MD, MB, MI, MS)
+#define MULQr(RS)                      _UNARYQr(X86_MUL, RS)
+#define MULQm(MD, MB, MI, MS)          _UNARYQm(X86_MUL, MD, MB, MI, MS)
+
+/* One-operand IMUL: register (r) and memory (m) forms for each size suffix,
+ * forwarding to the matching _UNARY* macro with OP = X86_IMUL. */
+#define IMULBr(RS)                     _UNARYBr(X86_IMUL, RS)
+#define IMULBm(MD, MB, MI, MS)         _UNARYBm(X86_IMUL, MD, MB, MI, MS)
+#define IMULWr(RS)                     _UNARYWr(X86_IMUL, RS)
+#define IMULWm(MD, MB, MI, MS)         _UNARYWm(X86_IMUL, MD, MB, MI, MS)
+#define IMULLr(RS)                     _UNARYLr(X86_IMUL, RS)
+#define IMULLm(MD, MB, MI, MS)         _UNARYLm(X86_IMUL, MD, MB, MI, MS)
+#define IMULQr(RS)                     _UNARYQr(X86_IMUL, RS)
+#define IMULQm(MD, MB, MI, MS)         _UNARYQm(X86_IMUL, MD, MB, MI, MS)
+
+/* DIV: register (r) and memory (m) forms for each size suffix, forwarding
+ * to the matching _UNARY* macro with OP = X86_DIV. */
+#define DIVBr(RS)                      _UNARYBr(X86_DIV, RS)
+#define DIVBm(MD, MB, MI, MS)          _UNARYBm(X86_DIV, MD, MB, MI, MS)
+#define DIVWr(RS)                      _UNARYWr(X86_DIV, RS)
+#define DIVWm(MD, MB, MI, MS)          _UNARYWm(X86_DIV, MD, MB, MI, MS)
+#define DIVLr(RS)                      _UNARYLr(X86_DIV, RS)
+#define DIVLm(MD, MB, MI, MS)          _UNARYLm(X86_DIV, MD, MB, MI, MS)
+#define DIVQr(RS)                      _UNARYQr(X86_DIV, RS)
+#define DIVQm(MD, MB, MI, MS)          _UNARYQm(X86_DIV, MD, MB, MI, MS)
+
+/* IDIV: register (r) and memory (m) forms for each size suffix, forwarding
+ * to the matching _UNARY* macro with OP = X86_IDIV. */
+#define IDIVBr(RS)                     _UNARYBr(X86_IDIV, RS)
+#define IDIVBm(MD, MB, MI, MS)         _UNARYBm(X86_IDIV, MD, MB, MI, MS)
+#define IDIVWr(RS)                     _UNARYWr(X86_IDIV, RS)
+#define IDIVWm(MD, MB, MI, MS)         _UNARYWm(X86_IDIV, MD, MB, MI, MS)
+#define IDIVLr(RS)                     _UNARYLr(X86_IDIV, RS)
+#define IDIVLm(MD, MB, MI, MS)         _UNARYLm(X86_IDIV, MD, MB, MI, MS)
+#define IDIVQr(RS)                     _UNARYQr(X86_IDIV, RS)
+#define IDIVQm(MD, MB, MI, MS)         _UNARYQm(X86_IDIV, MD, MB, MI, MS)
+
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+#define IMULWrr(RS, RD)                        (_d16(), _REXLrr(RD, RS),       
_OO_Mrm         (0x0faf         ,_b11,_r2(RD),_r2(RS)                           
))
+#define IMULWmr(MD, MB, MI, MS, RD)    (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X 
        (0x0faf              ,_r2(RD)           ,MD,MB,MI,MS            ))
+
+#define IMULWirr(IM,RS,RD)             (_d16(), _REXLrr(RS, RD),       
_Os_Mrm_sW      (0x69           ,_b11,_r2(RS),_r2(RD)                   
,_su16(IM)      ))
+#define IMULWimr(IM,MD,MB,MI,MS,RD)    (_d16(), _REXLmr(MB, MI, RD),   
_Os_r_X_sW      (0x69                ,_r2(RD)           ,MD,MB,MI,MS    
,_su16(IM)      ))
+
+#define IMULLir(IM, RD)                        (_REXLrr(0, RD),                
_Os_Mrm_sL      (0x69           ,_b11,_r4(RD),_r4(RD)                   ,IM     
))
+#define IMULLrr(RS, RD)                        (_REXLrr(RD, RS),               
_OO_Mrm         (0x0faf         ,_b11,_r4(RD),_r4(RS)                           
))
+#define IMULLmr(MD, MB, MI, MS, RD)    (_REXLmr(MB, MI, RD),           _OO_r_X 
        (0x0faf              ,_r4(RD)           ,MD,MB,MI,MS            ))
+
+#define IMULQir(IM, RD)                        (_REXQrr(0, RD),                
_Os_Mrm_sL      (0x69           ,_b11,_r8(RD),_r8(RD)                   ,IM     
))
+#define IMULQrr(RS, RD)                        (_REXQrr(RD, RS),               
_OO_Mrm         (0x0faf         ,_b11,_r8(RD),_r8(RS)                           
))
+#define IMULQmr(MD, MB, MI, MS, RD)    (_REXQmr(MB, MI, RD),           _OO_r_X 
        (0x0faf              ,_r8(RD)           ,MD,MB,MI,MS            ))
+
+#define IMULLirr(IM,RS,RD)             (_REXLrr(RS, RD),               
_Os_Mrm_sL      (0x69           ,_b11,_r4(RS),_r4(RD)                   ,IM     
))
+#define IMULLimr(IM,MD,MB,MI,MS,RD)    (_REXLmr(MB, MI, RD),           
_Os_r_X_sL      (0x69                ,_r4(RD)           ,MD,MB,MI,MS    ,IM     
))
+
+#define IMULQirr(IM,RS,RD)             (_REXQrr(RS, RD),               
_Os_Mrm_sL      (0x69           ,_b11,_r8(RS),_r8(RD)                   ,IM     
))
+#define IMULQimr(IM,MD,MB,MI,MS,RD)    (_REXQmr(MB, MI, RD),           
_Os_r_X_sL      (0x69                ,_r8(RD)           ,MD,MB,MI,MS    ,IM     
))
+
+
+/* --- Control Flow related instructions ----------------------------------- */
 
-#define CMPWrr(RS, RD)                 _wO_Mrm         (0x39           
,_b11,_r2(RS),_r2(RD)                           )
-#define CMPWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x3b                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define CMPWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x39                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define CMPWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b111  ,_r2(RD)                   ,_su16(IM))
-#define CMPWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b111             ,MD,MB,MI,MS    ,_su16(IM))
+enum {  /* 4-bit x86 condition codes; synonyms share one value. NOTE(review): presumably the CC argument of the JCCxx, SETCCxx and CMOVxx macros in this file -- confirm */
+  X86_CC_O   = 0x0,  /* overflow */
+  X86_CC_NO  = 0x1,  /* not overflow */
+  X86_CC_NAE = 0x2,  /* not above or equal (same value as B, C) */
+  X86_CC_B   = 0x2,  /* below */
+  X86_CC_C   = 0x2,  /* carry */
+  X86_CC_AE  = 0x3,  /* above or equal (same value as NB, NC) */
+  X86_CC_NB  = 0x3,  /* not below */
+  X86_CC_NC  = 0x3,  /* not carry */
+  X86_CC_E   = 0x4,  /* equal (same value as Z) */
+  X86_CC_Z   = 0x4,  /* zero */
+  X86_CC_NE  = 0x5,  /* not equal (same value as NZ) */
+  X86_CC_NZ  = 0x5,  /* not zero */
+  X86_CC_BE  = 0x6,  /* below or equal (same value as NA) */
+  X86_CC_NA  = 0x6,  /* not above */
+  X86_CC_A   = 0x7,  /* above (same value as NBE) */
+  X86_CC_NBE = 0x7,  /* not below or equal */
+  X86_CC_S   = 0x8,  /* sign */
+  X86_CC_NS  = 0x9,  /* not sign */
+  X86_CC_P   = 0xa,  /* parity (same value as PE) */
+  X86_CC_PE  = 0xa,  /* parity even */
+  X86_CC_NP  = 0xb,  /* not parity (same value as PO) */
+  X86_CC_PO  = 0xb,  /* parity odd */
+  X86_CC_L   = 0xc,  /* less (same value as NGE) */
+  X86_CC_NGE = 0xc,  /* not greater or equal */
+  X86_CC_GE  = 0xd,  /* greater or equal (same value as NL) */
+  X86_CC_NL  = 0xd,  /* not less */
+  X86_CC_LE  = 0xe,  /* less or equal (same value as NG) */
+  X86_CC_NG  = 0xe,  /* not greater */
+  X86_CC_G   = 0xf,  /* greater (same value as NLE) */
+  X86_CC_NLE = 0xf,  /* not less or equal */
+};
 
-#define CMPLrr(RS, RD)                 _O_Mrm          (0x39           
,_b11,_r4(RS),_r4(RD)                           )
-#define CMPLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x3b                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define CMPLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x39                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define CMPLir(IM, RD)                 _O_Mrm_L        (0x81           
,_b11,_b111  ,_r4(RD)                   ,IM     )
-#define CMPLim(IM, MD, MB, MI, MS)     _O_r_X_L        (0x81                
,_b111             ,MD,MB,MI,MS    ,IM     )
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+
+// FIXME: no prefix is available to encode a 32-bit operand size in 64-bit mode
+#define CALLm(M)                                                       _O_D32  
        (0xe8                                   ,(int)(M)               )
+#define CALLLsr(R)                     (_REXLrr(0, R),                 _O_Mrm  
        (0xff           ,_b11,_b010,_r4(R)                              ))
+#define CALLQsr(R)                     (_REXQrr(0, R),                 _O_Mrm  
        (0xff           ,_b11,_b010,_r8(R)                              ))
+#define CALLsm(D,B,I,S)                        (_REXLrm(0, B, I),              
_O_r_X          (0xff                ,_b010             ,(int)(D),B,I,S         
))
+
+// FIXME: no prefix is available to encode a 32-bit operand size in 64-bit mode
+#define JMPSm(M)                                                       _O_D8   
        (0xeb                                   ,(int)(M)               )
+#define JMPm(M)                                                                
_O_D32          (0xe9                                   ,(int)(M)               
)
+#define JMPLsr(R)                      (_REXLrr(0, R),                 _O_Mrm  
        (0xff           ,_b11,_b100,_r4(R)                              ))
+#define JMPQsr(R)                      (_REXQrr(0, R),                 _O_Mrm  
        (0xff           ,_b11,_b100,_r8(R)                              ))
+#define JMPsm(D,B,I,S)                 (_REXLrm(0, B, I),              _O_r_X  
        (0xff                ,_b100             ,(int)(D),B,I,S         ))
+
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+#define JCCSii(CC, D)                                                  _O_B    
        (0x70|(CC)                              ,(_sc)(int)(D)          )
+#define JCCSim(CC, D)                                                  _O_D8   
        (0x70|(CC)                              ,(int)(D)               )
+#define JOSm(D)                                JCCSim(0x0, D)
+#define JNOSm(D)                       JCCSim(0x1, D)
+#define JBSm(D)                                JCCSim(0x2, D)
+#define JNAESm(D)                      JCCSim(0x2, D)
+#define JNBSm(D)                       JCCSim(0x3, D)
+#define JAESm(D)                       JCCSim(0x3, D)
+#define JESm(D)                                JCCSim(0x4, D)
+#define JZSm(D)                                JCCSim(0x4, D)
+#define JNESm(D)                       JCCSim(0x5, D)
+#define JNZSm(D)                       JCCSim(0x5, D)
+#define JBESm(D)                       JCCSim(0x6, D)
+#define JNASm(D)                       JCCSim(0x6, D)
+#define JNBESm(D)                      JCCSim(0x7, D)
+#define JASm(D)                                JCCSim(0x7, D)
+#define JSSm(D)                                JCCSim(0x8, D)
+#define JNSSm(D)                       JCCSim(0x9, D)
+#define JPSm(D)                                JCCSim(0xa, D)
+#define JPESm(D)                       JCCSim(0xa, D)
+#define JNPSm(D)                       JCCSim(0xb, D)
+#define JPOSm(D)                       JCCSim(0xb, D)
+#define JLSm(D)                                JCCSim(0xc, D)
+#define JNGESm(D)                      JCCSim(0xc, D)
+#define JNLSm(D)                       JCCSim(0xd, D)
+#define JGESm(D)                       JCCSim(0xd, D)
+#define JLESm(D)                       JCCSim(0xe, D)
+#define JNGSm(D)                       JCCSim(0xe, D)
+#define JNLESm(D)                      JCCSim(0xf, D)
+#define JGSm(D)                                JCCSim(0xf, D)
+
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+#define JCCii(CC, D)                                                   _OO_L   
        (0x0f80|(CC)                            ,(int)(D)               )
+#define JCCim(CC, D)                                                   _OO_D32 
        (0x0f80|(CC)                            ,(int)(D)               )
+#define JOm(D)                         JCCim(0x0, D)
+#define JNOm(D)                                JCCim(0x1, D)
+#define JBm(D)                         JCCim(0x2, D)
+#define JNAEm(D)                       JCCim(0x2, D)
+#define JNBm(D)                                JCCim(0x3, D)
+#define JAEm(D)                                JCCim(0x3, D)
+#define JEm(D)                         JCCim(0x4, D)
+#define JZm(D)                         JCCim(0x4, D)
+#define JNEm(D)                                JCCim(0x5, D)
+#define JNZm(D)                                JCCim(0x5, D)
+#define JBEm(D)                                JCCim(0x6, D)
+#define JNAm(D)                                JCCim(0x6, D)
+#define JNBEm(D)                       JCCim(0x7, D)
+#define JAm(D)                         JCCim(0x7, D)
+#define JSm(D)                         JCCim(0x8, D)
+#define JNSm(D)                                JCCim(0x9, D)
+#define JPm(D)                         JCCim(0xa, D)
+#define JPEm(D)                                JCCim(0xa, D)
+#define JNPm(D)                                JCCim(0xb, D)
+#define JPOm(D)                                JCCim(0xb, D)
+#define JLm(D)                         JCCim(0xc, D)
+#define JNGEm(D)                       JCCim(0xc, D)
+#define JNLm(D)                                JCCim(0xd, D)
+#define JGEm(D)                                JCCim(0xd, D)
+#define JLEm(D)                                JCCim(0xe, D)
+#define JNGm(D)                                JCCim(0xe, D)
+#define JNLEm(D)                       JCCim(0xf, D)
+#define JGm(D)                         JCCim(0xf, D)
+
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+#define SETCCir(CC, RD)                        (_REXBrr(0, RD),                
_OO_Mrm         (0x0f90|(CC)    ,_b11,_b000,_r1(RD)                             
))
+#define SETOr(RD)                      SETCCir(0x0,RD)
+#define SETNOr(RD)                     SETCCir(0x1,RD)
+#define SETBr(RD)                      SETCCir(0x2,RD)
+#define SETNAEr(RD)                    SETCCir(0x2,RD)
+#define SETNBr(RD)                     SETCCir(0x3,RD)
+#define SETAEr(RD)                     SETCCir(0x3,RD)
+#define SETEr(RD)                      SETCCir(0x4,RD)
+#define SETZr(RD)                      SETCCir(0x4,RD)
+#define SETNEr(RD)                     SETCCir(0x5,RD)
+#define SETNZr(RD)                     SETCCir(0x5,RD)
+#define SETBEr(RD)                     SETCCir(0x6,RD)
+#define SETNAr(RD)                     SETCCir(0x6,RD)
+#define SETNBEr(RD)                    SETCCir(0x7,RD)
+#define SETAr(RD)                      SETCCir(0x7,RD)
+#define SETSr(RD)                      SETCCir(0x8,RD)
+#define SETNSr(RD)                     SETCCir(0x9,RD)
+#define SETPr(RD)                      SETCCir(0xa,RD)
+#define SETPEr(RD)                     SETCCir(0xa,RD)
+#define SETNPr(RD)                     SETCCir(0xb,RD)
+#define SETPOr(RD)                     SETCCir(0xb,RD)
+#define SETLr(RD)                      SETCCir(0xc,RD)
+#define SETNGEr(RD)                    SETCCir(0xc,RD)
+#define SETNLr(RD)                     SETCCir(0xd,RD)
+#define SETGEr(RD)                     SETCCir(0xd,RD)
+#define SETLEr(RD)                     SETCCir(0xe,RD)
+#define SETNGr(RD)                     SETCCir(0xe,RD)
+#define SETNLEr(RD)                    SETCCir(0xf,RD)
+#define SETGr(RD)                      SETCCir(0xf,RD)
 
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
+#define SETCCim(CC,MD,MB,MI,MS)                (_REXBrm(0, MB, MI),            
_OO_r_X         (0x0f90|(CC)         ,_b000             ,MD,MB,MI,MS            
))
+#define SETOm(D, B, I, S)              SETCCim(0x0, D, B, I, S)
+#define SETNOm(D, B, I, S)             SETCCim(0x1, D, B, I, S)
+#define SETBm(D, B, I, S)              SETCCim(0x2, D, B, I, S)
+#define SETNAEm(D, B, I, S)            SETCCim(0x2, D, B, I, S)
+#define SETNBm(D, B, I, S)             SETCCim(0x3, D, B, I, S)
+#define SETAEm(D, B, I, S)             SETCCim(0x3, D, B, I, S)
+#define SETEm(D, B, I, S)              SETCCim(0x4, D, B, I, S)
+#define SETZm(D, B, I, S)              SETCCim(0x4, D, B, I, S)
+#define SETNEm(D, B, I, S)             SETCCim(0x5, D, B, I, S)
+#define SETNZm(D, B, I, S)             SETCCim(0x5, D, B, I, S)
+#define SETBEm(D, B, I, S)             SETCCim(0x6, D, B, I, S)
+#define SETNAm(D, B, I, S)             SETCCim(0x6, D, B, I, S)
+#define SETNBEm(D, B, I, S)            SETCCim(0x7, D, B, I, S)
+#define SETAm(D, B, I, S)              SETCCim(0x7, D, B, I, S)
+#define SETSm(D, B, I, S)              SETCCim(0x8, D, B, I, S)
+#define SETNSm(D, B, I, S)             SETCCim(0x9, D, B, I, S)
+#define SETPm(D, B, I, S)              SETCCim(0xa, D, B, I, S)
+#define SETPEm(D, B, I, S)             SETCCim(0xa, D, B, I, S)
+#define SETNPm(D, B, I, S)             SETCCim(0xb, D, B, I, S)
+#define SETPOm(D, B, I, S)             SETCCim(0xb, D, B, I, S)
+#define SETLm(D, B, I, S)              SETCCim(0xc, D, B, I, S)
+#define SETNGEm(D, B, I, S)            SETCCim(0xc, D, B, I, S)
+#define SETNLm(D, B, I, S)             SETCCim(0xd, D, B, I, S)
+#define SETGEm(D, B, I, S)             SETCCim(0xd, D, B, I, S)
+#define SETLEm(D, B, I, S)             SETCCim(0xe, D, B, I, S)
+#define SETNGm(D, B, I, S)             SETCCim(0xe, D, B, I, S)
+#define SETNLEm(D, B, I, S)            SETCCim(0xf, D, B, I, S)
+#define SETGm(D, B, I, S)              SETCCim(0xf, D, B, I, S)
 
-#define CWD_()                         _O              (0x99                   
                                        )
+/*                                                                     _format 
        Opcd            ,Mod ,r      ,m         ,mem=dsp+sib    ,imm... */
+#define CMOVWrr(CC,RS,RD)              (_d16(), _REXLrr(RD, RS),       _OO_Mrm 
        (0x0f40|(CC)    ,_b11,_r2(RD),_r2(RS)                           ))
+#define CMOVWmr(CC,MD,MB,MI,MS,RD)     (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X 
        (0x0f40|(CC)         ,_r2(RD)           ,MD,MB,MI,MS            ))
+#define CMOVLrr(CC,RS,RD)              (_REXLrr(RD, RS),               _OO_Mrm 
        (0x0f40|(CC)    ,_b11,_r4(RD),_r4(RS)                           ))
+#define CMOVLmr(CC,MD,MB,MI,MS,RD)     (_REXLmr(MB, MI, RD),           _OO_r_X 
        (0x0f40|(CC)         ,_r4(RD)           ,MD,MB,MI,MS            ))
+#define CMOVQrr(CC,RS,RD)              (_REXQrr(RD, RS),               _OO_Mrm 
        (0x0f40|(CC)    ,_b11,_r8(RD),_r8(RS)                           ))
+#define CMOVQmr(CC,MD,MB,MI,MS,RD)     (_REXQmr(MB, MI, RD),           _OO_r_X 
        (0x0f40|(CC)         ,_r8(RD)           ,MD,MB,MI,MS            ))
 
 
-#define CMPXCHGBrr(RS,RD)              _OO_Mrm         (0x0fb0         
,_b11,_r1(RS),_r1(RD)                           )
-#define CMPXCHGBrm(RS,MD,MB,MI,MS)     _OO_r_X         (0x0fb0              
,_r1(RS)           ,MD,MB,MI,MS            )
+/* --- Push/Pop instructions ----------------------------------------------- */
 
-#define CMPXCHGWrr(RS,RD)              _wOO_Mrm        (0x0fb1         
,_b11,_r2(RS),_r2(RD)                           )
-#define CMPXCHGWrm(RS,MD,MB,MI,MS)     _wOO_r_X        (0x0fb1              
,_r2(RS)           ,MD,MB,MI,MS            )
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define CMPXCHGLrr(RS,RD)              _OO_Mrm         (0x0fb1         
,_b11,_r4(RS),_r4(RD)                           )
-#define CMPXCHGLrm(RS,MD,MB,MI,MS)     _OO_r_X         (0x0fb1              
,_r4(RS)           ,MD,MB,MI,MS            )
+#define POPWr(RD)                      _m32only((_d16(),               _Or     
        (0x58,_r2(RD)                                                   )))
+#define POPWm(MD, MB, MI, MS)          _m32only((_d16(),               _O_r_X  
        (0x8f                ,_b000             ,MD,MB,MI,MS            )))
 
+#define POPLr(RD)                      _m32only(                       _Or     
        (0x58,_r4(RD)                                                   ))
+#define POPLm(MD, MB, MI, MS)          _m32only(                       _O_r_X  
        (0x8f                ,_b000             ,MD,MB,MI,MS            ))
 
-#define DECBr(RD)                      _O_Mrm          (0xfe           
,_b11,_b001  ,_r1(RD)                           )
-#define DECBm(MD,MB,MI,MS)             _O_r_X          (0xfe                
,_b001             ,MD,MB,MI,MS            )
+#define POPQr(RD)                      _m64only((_REXQr(RD),           _Or     
        (0x58,_r8(RD)                                                   )))
+#define POPQm(MD, MB, MI, MS)          _m64only((_REXQm(MB, MI),       _O_r_X  
        (0x8f                ,_b000             ,MD,MB,MI,MS            )))
 
-#define DECWr(RD)                      _wOr            (0x48,_r2(RD)           
                                        )
-#define DECWm(MD,MB,MI,MS)             _wO_r_X         (0xff                
,_b001             ,MD,MB,MI,MS            )
+#define PUSHWr(RS)                     _m32only((_d16(),               _Or     
        (0x50,_r2(RS)                                                   )))
+#define PUSHWm(MD, MB, MI, MS)         _m32only((_d16(),               _O_r_X  
        (0xff                ,_b110             ,MD,MB,MI,MS            ))) /* 16-bit push of a memory operand: opcode 0xff with _b110 in the reg field, same argument shape as PUSHLm */
+#define PUSHWi(IM)                     _m32only((_d16(),               _Os_sW  
        (0x68                                                   ,IM     )))
 
-#define DECLr(RD)                      _Or             (0x48,_r4(RD)           
                                        )
-#define DECLm(MD,MB,MI,MS)             _O_r_X          (0xff                
,_b001             ,MD,MB,MI,MS            )
+#define PUSHLr(RS)                     _m32only(                       _Or     
        (0x50,_r4(RS)                                                   ))
+#define PUSHLm(MD, MB, MI, MS)         _m32only(                       _O_r_X  
        (0xff                ,_b110             ,MD,MB,MI,MS            ))
+#define PUSHLi(IM)                     _m32only(                       _Os_sL  
        (0x68                                                   ,IM     ))
 
+#define PUSHQr(RS)                     _m64only((_REXQr(RS),           _Or     
        (0x50,_r8(RS)                                                   )))
+#define PUSHQm(MD, MB, MI, MS)         _m64only((_REXQm(MB, MI),       _O_r_X  
        (0xff                ,_b110             ,MD,MB,MI,MS            )))
+#define PUSHQi(IM)                     _m64only(                       _Os_sL  
        (0x68                                                   ,IM     ))
 
-#define DIVBr(RS)                      _O_Mrm          (0xf6           
,_b11,_b110  ,_r1(RS)                           )
-#define DIVBm(MD,MB,MI,MS)             _O_r_X          (0xf6                
,_b110             ,MD,MB,MI,MS            )
+#define POPA()                         (_d16(),                        _O      
        (0x61                                                           ))
+#define POPAD()                                                                
_O              (0x61                                                           
)
 
-#define DIVWr(RS)                      _wO_Mrm         (0xf7           
,_b11,_b110  ,_r2(RS)                           )
-#define DIVWm(MD,MB,MI,MS)             _wO_r_X         (0xf7                
,_b110             ,MD,MB,MI,MS            )
+#define PUSHA()                                (_d16(),                        
_O              (0x60                                                           
))
+#define PUSHAD()                                                       _O      
        (0x60                                                           )
 
-#define DIVLr(RS)                      _O_Mrm          (0xf7           
,_b11,_b110  ,_r4(RS)                           )
-#define DIVLm(MD,MB,MI,MS)             _O_r_X          (0xf7                
,_b110             ,MD,MB,MI,MS            )
+#define POPF()                                                         _O      
        (0x9d                                                           )
+#define PUSHF()                                                                
_O              (0x9c                                                           
)
 
 
-#define ENTERii(W, B)                  _O_W_B          (0xc8                   
                          ,_su16(W),_su8(B))
-#define HLT_()                         _O              (0xf4                   
                                        )
+/* --- Test instructions --------------------------------------------------- */
 
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define IDIVBr(RS)                     _O_Mrm          (0xf6           
,_b11,_b111  ,_r1(RS)                           )
-#define IDIVBm(MD,MB,MI,MS)            _O_r_X          (0xf6                
,_b111             ,MD,MB,MI,MS            )
+#define TESTBrr(RS, RD)                        (_REXBrr(RS, RD),               
_O_Mrm          (0x84           ,_b11,_r1(RS),_r1(RD)                           
))
+#define TESTBrm(RS, MD, MB, MI, MS)    (_REXBrm(RS, MB, MI),           _O_r_X  
        (0x84                ,_r1(RS)           ,MD,MB,MI,MS            ))
+#define TESTBir(IM, RD)                        ((RD) == _AL ? \
+                                       (_REXBrr(0, RD),                _O_B    
        (0xa8                                                   ,_u8(IM))) : \
+                                       (_REXBrr(0, RD),                
_O_Mrm_B        (0xf6           ,_b11,_b000  ,_r1(RD)                   
,_u8(IM))) )
+#define TESTBim(IM, MD, MB, MI, MS)    (_REXBrm(0, MB, MI),            
_O_r_X_B        (0xf6                ,_b000             ,MD,MB,MI,MS    
,_u8(IM)))
 
-#define IDIVWr(RS)                     _wO_Mrm         (0xf7           
,_b11,_b111  ,_r2(RS)                           )
-#define IDIVWm(MD,MB,MI,MS)            _wO_r_X         (0xf7                
,_b111             ,MD,MB,MI,MS            )
+#define TESTWrr(RS, RD)                        (_d16(), _REXLrr(RS, RD),       
_O_Mrm          (0x85           ,_b11,_r2(RS),_r2(RD)                           
))
+#define TESTWrm(RS, MD, MB, MI, MS)    (_d16(), _REXLrm(RS, MB, MI),   _O_r_X  
        (0x85                ,_r2(RS)           ,MD,MB,MI,MS            ))
+#define TESTWir(IM, RD)                        (!_s8P(IM) && (RD) == _AX ? \
+                                       (_d16(), _REXLrr(0, RD),        _O_W    
        (0xa9                                                   ,_u16(IM))) : \
+                                       (_d16(), _REXLrr(0, RD),        
_O_Mrm_W        (0xf7           ,_b11,_b000  ,_r2(RD)                   
,_u16(IM))) )
+#define TESTWim(IM, MD, MB, MI, MS)    (_d16(), _REXLrm(0, MB, MI),    
_O_r_X_W        (0xf7                ,_b000             ,MD,MB,MI,MS    
,_u16(IM)))
 
-#define IDIVLr(RS)                     _O_Mrm          (0xf7           
,_b11,_b111  ,_r4(RS)                           )
-#define IDIVLm(MD,MB,MI,MS)            _O_r_X          (0xf7                
,_b111             ,MD,MB,MI,MS            )
+#define TESTLrr(RS, RD)                        (_REXLrr(RS, RD),               
_O_Mrm          (0x85           ,_b11,_r4(RS),_r4(RD)                           
))
+#define TESTLrm(RS, MD, MB, MI, MS)    (_REXLrm(RS, MB, MI),           _O_r_X  
        (0x85                ,_r4(RS)           ,MD,MB,MI,MS            ))
+#define TESTLir(IM, RD)                        (!_s8P(IM) && (RD) == _EAX ? \
+                                       (_REXLrr(0, RD),                _O_L    
        (0xa9                                                   ,IM     )) : \
+                                       (_REXLrr(0, RD),                
_O_Mrm_L        (0xf7           ,_b11,_b000  ,_r4(RD)                   ,IM     
)) )
+#define TESTLim(IM, MD, MB, MI, MS)    (_REXLrm(0, MB, MI),            
_O_r_X_L        (0xf7                ,_b000             ,MD,MB,MI,MS    ,IM     
))
 
-#define IMULBr(RS)                     _O_Mrm          (0xf6           
,_b11,_b101  ,_r1(RS)                           )
-#define IMULBm(MD,MB,MI,MS)            _O_r_X          (0xf6                
,_b101             ,MD,MB,MI,MS            )
+#define TESTQrr(RS, RD)                        (_REXQrr(RS, RD),               
_O_Mrm          (0x85           ,_b11,_r8(RS),_r8(RD)                           
))
+#define TESTQrm(RS, MD, MB, MI, MS)    (_REXQrm(RS, MB, MI),           _O_r_X  
        (0x85                ,_r8(RS)           ,MD,MB,MI,MS            ))
+#define TESTQir(IM, RD)                        (!_s8P(IM) && (RD) == _RAX ? \
+                                       (_REXQrr(0, RD),                _O_L    
        (0xa9                                                   ,IM     )) : \
+                                       (_REXQrr(0, RD),                
_O_Mrm_L        (0xf7           ,_b11,_b000  ,_r8(RD)                   ,IM     
)) )
+#define TESTQim(IM, MD, MB, MI, MS)    (_REXQrm(0, MB, MI),            
_O_r_X_L        (0xf7                ,_b000             ,MD,MB,MI,MS    ,IM     
))
 
-#define IMULWr(RS)                     _wO_Mrm         (0xf7           
,_b11,_b101  ,_r2(RS)                           )
-#define IMULWm(MD,MB,MI,MS)            _wO_r_X         (0xf7                
,_b101             ,MD,MB,MI,MS            )
 
-#define IMULLr(RS)                     _O_Mrm          (0xf7           
,_b11,_b101  ,_r4(RS)                           )
-#define IMULLm(MD,MB,MI,MS)            _O_r_X          (0xf7                
,_b101             ,MD,MB,MI,MS            )
+/* --- Exchange instructions ----------------------------------------------- */
 
+/*                                                                     _format 
        Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define IMULWrr(RS,RD)                 _wOO_Mrm        (0x0faf         
,_b11,_r2(RS),_r2(RD)                           )
-#define IMULWmr(MD,MB,MI,MS,RD)                _wOO_r_X        (0x0faf         
     ,_r2(RD)           ,MD,MB,MI,MS            )
-#define IMULWirr(IM,RS,RD)             _wOs_Mrm_sW     (0x69           
,_b11,_r2(RS),_r2(RD)                   ,_su16(IM)      )
-#define IMULWimr(IM,MD,MB,MI,MS,RD)    _wOs_r_X_sW     (0x69                
,_r2(RD)           ,MD,MB,MI,MS    ,_su16(IM)      )
+#define CMPXCHGBrr(RS, RD)             (_REXBrr(RS, RD),               _OO_Mrm 
        (0x0fb0         ,_b11,_r1(RS),_r1(RD)                           ))
+#define CMPXCHGBrm(RS, MD, MB, MI, MS) (_REXBrm(RS, MB, MI),           _OO_r_X 
        (0x0fb0              ,_r1(RS)           ,MD,MB,MI,MS            ))
 
-#define IMULLir(IM,RD)                 _Os_Mrm_sL      (0x69           
,_b11,_r4(RD),_r4(RD)                   ,IM     )
-#define IMULLrr(RS,RD)                 _OO_Mrm         (0x0faf         
,_b11,_r4(RD),_r4(RS)                           )
-#define IMULLmr(MD,MB,MI,MS,RD)                _OO_r_X         (0x0faf         
     ,_r4(RD)           ,MD,MB,MI,MS            )
-#define IMULLirr(IM,RS,RD)             _Os_Mrm_sL      (0x69           
,_b11,_r4(RS),_r4(RD)                   ,IM     )
-#define IMULLimr(IM,MD,MB,MI,MS,RD)    _Os_r_X_sL      (0x69                
,_r4(RD)           ,MD,MB,MI,MS    ,IM     )
+#define CMPXCHGWrr(RS, RD)             (_d16(), _REXLrr(RS, RD),       _OO_Mrm 
        (0x0fb1         ,_b11,_r2(RS),_r2(RD)                           ))
+#define CMPXCHGWrm(RS, MD, MB, MI, MS) (_d16(), _REXLrm(RS, MB, MI),   _OO_r_X 
        (0x0fb1              ,_r2(RS)           ,MD,MB,MI,MS            ))
 
+#define CMPXCHGLrr(RS, RD)             (_REXLrr(RS, RD),               _OO_Mrm 
        (0x0fb1         ,_b11,_r4(RS),_r4(RD)                           ))
+#define CMPXCHGLrm(RS, MD, MB, MI, MS) (_REXLrm(RS, MB, MI),           _OO_r_X 
        (0x0fb1              ,_r4(RS)           ,MD,MB,MI,MS            ))
 
-#define INCBr(RD)                      _O_Mrm          (0xfe           
,_b11,_b000  ,_r1(RD)                           )
-#define INCBm(MD,MB,MI,MS)             _O_r_X          (0xfe                
,_b000             ,MD,MB,MI,MS            )
+#define CMPXCHGQrr(RS, RD)             (_REXQrr(RS, RD),               _OO_Mrm 
        (0x0fb1         ,_b11,_r8(RS),_r8(RD)                           ))
+#define CMPXCHGQrm(RS, MD, MB, MI, MS) (_REXQrm(RS, MB, MI),           _OO_r_X 
        (0x0fb1              ,_r8(RS)           ,MD,MB,MI,MS            ))
 
-#define INCWr(RD)                      _wOr            (0x40,_r2(RD)           
                                        )
-#define INCWm(MD,MB,MI,MS)             _wO_r_X         (0xff                
,_b000             ,MD,MB,MI,MS            )
+#define XADDBrr(RS, RD)                        (_REXBrr(RS, RD),               
_OO_Mrm         (0x0fc0         ,_b11,_r1(RS),_r1(RD)                           
))
+#define XADDBrm(RS, MD, MB, MI, MS)    (_REXBrm(RS, MB, MI),           _OO_r_X 
        (0x0fc0              ,_r1(RS)           ,MD,MB,MI,MS            ))
 
-#define INCLr(RD)                      _Or             (0x40,_r4(RD)           
                                        )
-#define INCLm(MD,MB,MI,MS)             _O_r_X          (0xff                
,_b000             ,MD,MB,MI,MS            )
+#define XADDWrr(RS, RD)                        (_d16(), _REXLrr(RS, RD),       
_OO_Mrm         (0x0fc1         ,_b11,_r2(RS),_r2(RD)                           
))
+#define XADDWrm(RS, MD, MB, MI, MS)    (_d16(), _REXLrm(RS, MB, MI),   _OO_r_X 
        (0x0fc1              ,_r2(RS)           ,MD,MB,MI,MS            ))
 
+#define XADDLrr(RS, RD)                        (_REXLrr(RS, RD),               
_OO_Mrm         (0x0fc1         ,_b11,_r4(RS),_r4(RD)                           
))
+#define XADDLrm(RS, MD, MB, MI, MS)    (_REXLrm(RS, MB, MI),           _OO_r_X 
        (0x0fc1              ,_r4(RS)           ,MD,MB,MI,MS            ))
 
-#define INVD_()                                _OO             (0x0f08         
                                                )
-#define INVLPGm(MD, MB, MI, MS)                _OO_r_X         (0x0f01         
     ,_b111             ,MD,MB,MI,MS            )
+#define XADDQrr(RS, RD)                        (_REXQrr(RS, RD),               
_OO_Mrm         (0x0fc1         ,_b11,_r8(RS),_r8(RD)                           
))
+#define XADDQrm(RS, MD, MB, MI, MS)    (_REXQrm(RS, MB, MI),           _OO_r_X 
        (0x0fc1              ,_r8(RS)           ,MD,MB,MI,MS            ))
 
+#define XCHGBrr(RS, RD)                        (_REXBrr(RS, RD),               
_O_Mrm          (0x86           ,_b11,_r1(RS),_r1(RD)                           
))
+#define XCHGBrm(RS, MD, MB, MI, MS)    (_REXBrm(RS, MB, MI),           _O_r_X  
        (0x86                ,_r1(RS)           ,MD,MB,MI,MS            ))
 
-#define JCCSim(CC,D,B,I,S)             ((_r0P(B) && _r0P(I)) ? _O_D8   
(0x70|(CC)              ,(int)(D)               ) : \
-                                                               
JITFAIL("illegal mode in conditional jump"))
+#define XCHGWrr(RS, RD)                (_d16(), _REXLrr(RS, RD),       _O_Mrm  (0x87           ,_b11,_r2(RS),_r2(RD)           ))  /* XCHG, word operands: _d16() first, then opcode 0x87 */
+#define XCHGWrm(RS, MD, MB, MI, MS)    (_d16(), _REXLrm(RS, MB, MI),   _O_r_X  (0x87                ,_r2(RS)           ,MD,MB,MI,MS    ))  /* register RS, memory operand MD/MB/MI/MS */
 
-#define JOSm(D,B,I,S)                  JCCSim(0x0,D,B,I,S)
-#define JNOSm(D,B,I,S)                 JCCSim(0x1,D,B,I,S)
-#define JCSm(D,B,I,S)                  JCCSim(0x2,D,B,I,S)
-#define JBSm(D,B,I,S)                  JCCSim(0x2,D,B,I,S)
-#define JNAESm(D,B,I,S)                        JCCSim(0x2,D,B,I,S)
-#define JNCSm(D,B,I,S)                 JCCSim(0x3,D,B,I,S)
-#define JNBSm(D,B,I,S)                 JCCSim(0x3,D,B,I,S)
-#define JAESm(D,B,I,S)                 JCCSim(0x3,D,B,I,S)
-#define JESm(D,B,I,S)                  JCCSim(0x4,D,B,I,S)
-#define JZSm(D,B,I,S)                  JCCSim(0x4,D,B,I,S)
-#define JNESm(D,B,I,S)                 JCCSim(0x5,D,B,I,S)
-#define JNZSm(D,B,I,S)                 JCCSim(0x5,D,B,I,S)
-#define JBESm(D,B,I,S)                 JCCSim(0x6,D,B,I,S)
-#define JNASm(D,B,I,S)                 JCCSim(0x6,D,B,I,S)
-#define JNBESm(D,B,I,S)                        JCCSim(0x7,D,B,I,S)
-#define JASm(D,B,I,S)                  JCCSim(0x7,D,B,I,S)
-#define JSSm(D,B,I,S)                  JCCSim(0x8,D,B,I,S)
-#define JNSSm(D,B,I,S)                 JCCSim(0x9,D,B,I,S)
-#define JPSm(D,B,I,S)                  JCCSim(0xa,D,B,I,S)
-#define JPESm(D,B,I,S)                 JCCSim(0xa,D,B,I,S)
-#define JNPSm(D,B,I,S)                 JCCSim(0xb,D,B,I,S)
-#define JPOSm(D,B,I,S)                 JCCSim(0xb,D,B,I,S)
-#define JLSm(D,B,I,S)                  JCCSim(0xc,D,B,I,S)
-#define JNGESm(D,B,I,S)                        JCCSim(0xc,D,B,I,S)
-#define JNLSm(D,B,I,S)                 JCCSim(0xd,D,B,I,S)
-#define JGESm(D,B,I,S)                 JCCSim(0xd,D,B,I,S)
-#define JLESm(D,B,I,S)                 JCCSim(0xe,D,B,I,S)
-#define JNGSm(D,B,I,S)                 JCCSim(0xe,D,B,I,S)
-#define JNLESm(D,B,I,S)                        JCCSim(0xf,D,B,I,S)
-#define JGSm(D,B,I,S)                  JCCSim(0xf,D,B,I,S)
+#define XCHGLrr(RS, RD)                (_REXLrr(RS, RD),               _O_Mrm  (0x87           ,_b11,_r4(RS),_r4(RD)           ))  /* XCHG, long operands: opcode 0x87, registers via _r4 */
+#define XCHGLrm(RS, MD, MB, MI, MS)    (_REXLrm(RS, MB, MI),           _O_r_X  (0x87                ,_r4(RS)           ,MD,MB,MI,MS    ))  /* register RS, memory operand MD/MB/MI/MS */
 
-#define JOm(D,B,I,S)                   JCCim(0x0,D,B,I,S)
-#define JNOm(D,B,I,S)                  JCCim(0x1,D,B,I,S)
-#define JCm(D,B,I,S)                   JCCim(0x2,D,B,I,S)
-#define JBm(D,B,I,S)                   JCCim(0x2,D,B,I,S)
-#define JNAEm(D,B,I,S)                 JCCim(0x2,D,B,I,S)
-#define JNCm(D,B,I,S)                  JCCim(0x3,D,B,I,S)
-#define JNBm(D,B,I,S)                  JCCim(0x3,D,B,I,S)
-#define JAEm(D,B,I,S)                  JCCim(0x3,D,B,I,S)
-#define JEm(D,B,I,S)                   JCCim(0x4,D,B,I,S)
-#define JZm(D,B,I,S)                   JCCim(0x4,D,B,I,S)
-#define JNEm(D,B,I,S)                  JCCim(0x5,D,B,I,S)
-#define JNZm(D,B,I,S)                  JCCim(0x5,D,B,I,S)
-#define JBEm(D,B,I,S)                  JCCim(0x6,D,B,I,S)
-#define JNAm(D,B,I,S)                  JCCim(0x6,D,B,I,S)
-#define JNBEm(D,B,I,S)                 JCCim(0x7,D,B,I,S)
-#define JAm(D,B,I,S)                   JCCim(0x7,D,B,I,S)
-#define JSm(D,B,I,S)                   JCCim(0x8,D,B,I,S)
-#define JNSm(D,B,I,S)                  JCCim(0x9,D,B,I,S)
-#define JPm(D,B,I,S)                   JCCim(0xa,D,B,I,S)
-#define JPEm(D,B,I,S)                  JCCim(0xa,D,B,I,S)
-#define JNPm(D,B,I,S)                  JCCim(0xb,D,B,I,S)
-#define JPOm(D,B,I,S)                  JCCim(0xb,D,B,I,S)
-#define JLm(D,B,I,S)                   JCCim(0xc,D,B,I,S)
-#define JNGEm(D,B,I,S)                 JCCim(0xc,D,B,I,S)
-#define JNLm(D,B,I,S)                  JCCim(0xd,D,B,I,S)
-#define JGEm(D,B,I,S)                  JCCim(0xd,D,B,I,S)
-#define JLEm(D,B,I,S)                  JCCim(0xe,D,B,I,S)
-#define JNGm(D,B,I,S)                  JCCim(0xe,D,B,I,S)
-#define JNLEm(D,B,I,S)                 JCCim(0xf,D,B,I,S)
-#define JGm(D,B,I,S)                   JCCim(0xf,D,B,I,S)
+#define XCHGQrr(RS, RD)                (_REXQrr(RS, RD),               _O_Mrm  (0x87           ,_b11,_r8(RS),_r8(RD)           ))  /* XCHG, quad operands: opcode 0x87 behind the _REXQ* helpers, registers via _r8 */
+#define XCHGQrm(RS, MD, MB, MI, MS)    (_REXQrm(RS, MB, MI),           _O_r_X  (0x87                ,_r8(RS)           ,MD,MB,MI,MS    ))  /* register RS, memory operand MD/MB/MI/MS */
 
 
-#define JMPSm(D,B,I,S)                 ((_r0P(B) && _r0P(I)) ? _O_D8   (0xeb   
                ,(int)(D)               ) : \
-                                                               
JITFAIL("illegal mode in short jump"))
-
-#define JMPsr(R)                       _O_Mrm  (0xff   ,_b11,_b100,_r4(R)      
                )
-
-#define JMPsm(D,B,I,S)                 _O_r_X  (0xff        ,_b100     
,(int)(D),B,I,S         )
-
-
-#define LAHF_()                                _O              (0x9f           
                                                )
-#define LEALmr(MD, MB, MI, MS, RD)     _O_r_X          (0x8d                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define LEAVE_()                       _O              (0xc9                   
                                        )
-
-
-#define LMSWr(RS)                      _OO_Mrm         (0x0f01         
,_b11,_b110,_r4(RS)                             )
-#define LMSWm(MD,MB,MI,MS)             _OO_r_X         (0x0f01              
,_b110             ,MD,MB,MI,MS            )
-
-#define LOOPm(MD,MB,MI,MS)             ((_r0P(MB) && _r0P(MI)) ? _O_D8 (0xe2   
                ,MD                     ) : \
-                                                                 
JITFAIL("illegal mode in loop"))
-
-#define LOOPEm(MD,MB,MI,MS)            ((_r0P(MB) && _r0P(MI)) ? _O_D8 (0xe1   
                ,MD                     ) : \
-                                                                 
JITFAIL("illegal mode in loope"))
-
-#define LOOPZm(MD,MB,MI,MS)            ((_r0P(MB) && _r0P(MI)) ? _O_D8 (0xe1   
                ,MD                     ) : \
-                                                                 
JITFAIL("illegal mode in loopz"))
-
-#define LOOPNEm(MD,MB,MI,MS)           ((_r0P(MB) && _r0P(MI)) ? _O_D8 (0xe0   
                ,MD                     ) : \
-                                                                 
JITFAIL("illegal mode in loopne"))
-
-#define LOOPNZm(MD,MB,MI,MS)           ((_r0P(MB) && _r0P(MI)) ? _O_D8 (0xe0   
                ,MD                     ) : \
-                                                                 
JITFAIL("illegal mode in loopnz"))
-
-
-#define MOVBrr(RS, RD)                 _O_Mrm          (0x80           
,_b11,_r1(RS),_r1(RD)                           )
-#define MOVBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x8a                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define MOVBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x88                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define MOVBir(IM,  R)                 _Or_B           (0xb0,_r1(R)            
                                ,_su8(IM))
-#define MOVBim(IM, MD, MB, MI, MS)     _O_X_B          (0xc6                   
                ,MD,MB,MI,MS    ,_su8(IM))
-
-#define MOVWrr(RS, RD)                 _wO_Mrm         (0x89           
,_b11,_r2(RS),_r2(RD)                           )
-#define MOVWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x8b                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define MOVWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x89                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define MOVWir(IM,  R)                 _wOr_W          (0xb8,_r2(R)            
                                ,_su16(IM))
-#define MOVWim(IM, MD, MB, MI, MS)     _wO_X_W         (0xc7                   
                ,MD,MB,MI,MS    ,_su16(IM))
+/* --- Increment/Decrement instructions ------------------------------------ */
 
-#define MOVLrr(RS, RD)                 _O_Mrm          (0x89           
,_b11,_r4(RS),_r4(RD)                           )
-#define MOVLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x8b                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define MOVLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x89                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define MOVLir(IM,  R)                 _Or_L           (0xb8,_r4(R)            
                                ,IM     )
-#define MOVLim(IM, MD, MB, MI, MS)     _O_X_L          (0xc7                   
                ,MD,MB,MI,MS    ,IM     )
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define MOVZBLrr(RS, RD)               _OO_Mrm         (0x0fb6         
,_b11,_r4(RD),_r1(RS)                           )
-#define MOVZBLmr(MD, MB, MI, MS, RD)   _OO_r_X         (0x0fb6              
,_r4(RD)           ,MD,MB,MI,MS            )
-#define MOVZBWrr(RS, RD)               _wOO_Mrm        (0x0fb6         
,_b11,_r2(RD),_r1(RS)                           )
-#define MOVZBWmr(MD, MB, MI, MS, RD)   _wOO_r_X        (0x0fb6              
,_r2(RD)           ,MD,MB,MI,MS            )
-#define MOVZWLrr(RS, RD)               _OO_Mrm         (0x0fb7         
,_b11,_r4(RD),_r2(RS)                           )
-#define MOVZWLmr(MD, MB, MI, MS, RD)   _OO_r_X         (0x0fb7              
,_r4(RD)           ,MD,MB,MI,MS            )
+#define DECBm(MD, MB, MI, MS)          (_REXBrm(0, MB, MI),            _O_r_X  (0xfe                ,_b001             ,MD,MB,MI,MS    ))  /* DEC, byte operand in memory: opcode 0xfe with _b001 in the r slot */
+#define DECBr(RD)                      (_REXBrr(0, RD),                _O_Mrm  (0xfe           ,_b11,_b001  ,_r1(RD)                   ))  /* DEC, byte register RD */
 
-#define MOVSBLrr(RS, RD)               _OO_Mrm         (0x0fbe         
,_b11,_r4(RD),_r1(RS)                           )
-#define MOVSBLmr(MD, MB, MI, MS, RD)   _OO_r_X         (0x0fbe              
,_r4(RD)           ,MD,MB,MI,MS            )
-#define MOVSBWrr(RS, RD)               _wOO_Mrm        (0x0fbe         
,_b11,_r2(RD),_r1(RS)                           )
-#define MOVSBWmr(MD, MB, MI, MS, RD)   _wOO_r_X        (0x0fbe              
,_r2(RD)           ,MD,MB,MI,MS            )
-#define MOVSWLrr(RS, RD)               _OO_Mrm         (0x0fbf         
,_b11,_r4(RD),_r2(RS)                           )
-#define MOVSWLmr(MD, MB, MI, MS, RD)   _OO_r_X         (0x0fbf              
,_r4(RD)           ,MD,MB,MI,MS            )
+#define DECWm(MD, MB, MI, MS)          (_d16(), _REXLrm(0, MB, MI),    _O_r_X  (0xff                ,_b001             ,MD,MB,MI,MS    ))  /* DEC, word operand in memory: _d16() first, opcode 0xff with _b001 in the r slot */
 
+#define DECLm(MD, MB, MI, MS)          (_REXLrm(0, MB, MI),            _O_r_X  (0xff                ,_b001             ,MD,MB,MI,MS    ))  /* DEC, long operand in memory: opcode 0xff with _b001 in the r slot */
 
-#define MULBr(RS)                      _O_Mrm          (0xf6           
,_b11,_b100  ,_r1(RS)                           )
-#define MULBm(MD,MB,MI,MS)             _O_r_X          (0xf6                
,_b100             ,MD,MB,MI,MS            )
+#define DECQm(MD, MB, MI, MS)          (_REXQrm(0, MB, MI),            _O_r_X  (0xff                ,_b001             ,MD,MB,MI,MS    ))  /* DEC, quad operand in memory: _REXQrm helper, opcode 0xff with _b001 in the r slot */
+#define DECQr(RD)                      (_REXQrr(0, RD),                _O_Mrm  (0xff           ,_b11,_b001  ,_r8(RD)                   ))  /* DEC, quad register RD */
 
-#define MULWr(RS)                      _wO_Mrm         (0xf7           
,_b11,_b100  ,_r2(RS)                           )
-#define MULWm(MD,MB,MI,MS)             _wO_r_X         (0xf7                
,_b100             ,MD,MB,MI,MS            )
+#define INCBm(MD, MB, MI, MS)          (_REXBrm(0, MB, MI),            _O_r_X  (0xfe                ,_b000             ,MD,MB,MI,MS    ))  /* INC, byte operand in memory: opcode 0xfe with _b000 in the r slot */
+#define INCBr(RD)                      (_REXBrr(0, RD),                _O_Mrm  (0xfe           ,_b11,_b000  ,_r1(RD)                   ))  /* INC, byte register RD */
 
-#define MULLr(RS)                      _O_Mrm          (0xf7           
,_b11,_b100  ,_r4(RS)                           )
-#define MULLm(MD,MB,MI,MS)             _O_r_X          (0xf7                
,_b100             ,MD,MB,MI,MS            )
+#define INCWm(MD, MB, MI, MS)          (_d16(), _REXLrm(0, MB, MI),    _O_r_X  (0xff                ,_b000             ,MD,MB,MI,MS    ))  /* INC, word operand in memory: _d16() first, opcode 0xff with _b000 in the r slot */
 
+#define INCLm(MD, MB, MI, MS)          (_REXLrm(0, MB, MI),            _O_r_X  (0xff                ,_b000             ,MD,MB,MI,MS    ))  /* INC, long operand in memory: opcode 0xff with _b000 in the r slot */
 
-#define NEGBr(RD)                      _O_Mrm          (0xf6           
,_b11,_b011  ,_r1(RD)                           )
-#define NEGBm(MD,MB,MI,MS)             _O_r_X          (0xf6                
,_b011             ,MD,MB,MI,MS            )
+#define INCQm(MD, MB, MI, MS)          (_REXQrm(0, MB, MI),            _O_r_X  (0xff                ,_b000             ,MD,MB,MI,MS    ))  /* INC, quad operand in memory: _REXQrm helper, opcode 0xff with _b000 in the r slot */
+#define INCQr(RD)                      (_REXQrr(0, RD),                _O_Mrm  (0xff           ,_b11,_b000  ,_r8(RD)                   ))  /* INC, quad register RD */
 
-#define NEGWr(RD)                      _wO_Mrm         (0xf7           
,_b11,_b011  ,_r2(RD)                           )
-#define NEGWm(MD,MB,MI,MS)             _wO_r_X         (0xf7                
,_b011             ,MD,MB,MI,MS            )
 
-#define NEGLr(RD)                      _O_Mrm          (0xf7           
,_b11,_b011  ,_r4(RD)                           )
-#define NEGLm(MD,MB,MI,MS)             _O_r_X          (0xf7                
,_b011             ,MD,MB,MI,MS            )
+/* --- Misc instructions --------------------------------------------------- */
 
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define NOP_()                         _O              (0x90                   
                                        )
+#define BSFWrr(RS, RD)                 (_d16(), _REXLrr(RD, RS),       _OO_Mrm (0x0fbc         ,_b11,_r2(RD),_r2(RS)           ))  /* BSF/BSR word forms: RD fills the r slot and RS the m slot, so the helpers take (RD, RS) */
+#define BSFWmr(MD, MB, MI, MS, RD)     (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X (0x0fbc              ,_r2(RD)           ,MD,MB,MI,MS    ))  /* BSF, memory source, opcode 0x0fbc */
+#define BSRWrr(RS, RD)                 (_d16(), _REXLrr(RD, RS),       _OO_Mrm (0x0fbd         ,_b11,_r2(RD),_r2(RS)           ))  /* BSR, register source, opcode 0x0fbd */
+#define BSRWmr(MD, MB, MI, MS, RD)     (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X (0x0fbd              ,_r2(RD)           ,MD,MB,MI,MS    ))  /* BSR, memory source */
 
+#define BSFLrr(RS, RD)                 (_REXLrr(RD, RS),               _OO_Mrm (0x0fbc         ,_b11,_r4(RD),_r4(RS)           ))  /* BSF/BSR long forms: RD fills the r slot and RS the m slot */
+#define BSFLmr(MD, MB, MI, MS, RD)     (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fbc              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* BSF, memory source, opcode 0x0fbc */
+#define BSRLrr(RS, RD)                 (_REXLrr(RD, RS),               _OO_Mrm (0x0fbd         ,_b11,_r4(RD),_r4(RS)           ))  /* BSR, register source, opcode 0x0fbd */
+#define BSRLmr(MD, MB, MI, MS, RD)     (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fbd              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* BSR, memory source */
 
-#define NOTBr(RD)                      _O_Mrm          (0xf6           
,_b11,_b010  ,_r1(RD)                           )
-#define NOTBm(MD,MB,MI,MS)             _O_r_X          (0xf6                
,_b010             ,MD,MB,MI,MS            )
+#define BSFQrr(RS, RD)                 (_REXQrr(RD, RS),               _OO_Mrm (0x0fbc         ,_b11,_r8(RD),_r8(RS)           ))  /* BSF/BSR quad forms: _REXQ* helpers; RD fills the r slot and RS the m slot */
+#define BSFQmr(MD, MB, MI, MS, RD)     (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fbc              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* BSF, memory source, opcode 0x0fbc */
+#define BSRQrr(RS, RD)                 (_REXQrr(RD, RS),               _OO_Mrm (0x0fbd         ,_b11,_r8(RD),_r8(RS)           ))  /* BSR, register source, opcode 0x0fbd */
+#define BSRQmr(MD, MB, MI, MS, RD)     (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fbd              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* BSR, memory source */
 
-#define NOTWr(RD)                      _wO_Mrm         (0xf7           
,_b11,_b010  ,_r2(RD)                           )
-#define NOTWm(MD,MB,MI,MS)             _wO_r_X         (0xf7                
,_b010             ,MD,MB,MI,MS            )
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define NOTLr(RD)                      _O_Mrm          (0xf7           
,_b11,_b010  ,_r4(RD)                           )
-#define NOTLm(MD,MB,MI,MS)             _O_r_X          (0xf7                
,_b010             ,MD,MB,MI,MS            )
+#define MOVSBWrr(RS, RD)               (_d16(), _REXBLrr(RD, RS),      _OO_Mrm (0x0fbe         ,_b11,_r2(RD),_r1(RS)           ))  /* byte -> word moves: 0x0fbe (MOVS*) / 0x0fb6 (MOVZ*); destination checked with _r2, source with _r1 */
+#define MOVSBWmr(MD, MB, MI, MS, RD)   (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X (0x0fbe              ,_r2(RD)           ,MD,MB,MI,MS    ))  /* MOVSBW, memory source */
+#define MOVZBWrr(RS, RD)               (_d16(), _REXBLrr(RD, RS),      _OO_Mrm (0x0fb6         ,_b11,_r2(RD),_r1(RS)           ))  /* MOVZBW, register source */
+#define MOVZBWmr(MD, MB, MI, MS, RD)   (_d16(), _REXLmr(MB, MI, RD),   _OO_r_X (0x0fb6              ,_r2(RD)           ,MD,MB,MI,MS    ))  /* MOVZBW, memory source */
 
+#define MOVSBLrr(RS, RD)               (_REXBLrr(RD, RS),              _OO_Mrm (0x0fbe         ,_b11,_r4(RD),_r1(RS)           ))  /* byte -> long moves: 0x0fbe (MOVS*) / 0x0fb6 (MOVZ*); destination checked with _r4, source with _r1 */
+#define MOVSBLmr(MD, MB, MI, MS, RD)   (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fbe              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* MOVSBL, memory source */
+#define MOVZBLrr(RS, RD)               (_REXBLrr(RD, RS),              _OO_Mrm (0x0fb6         ,_b11,_r4(RD),_r1(RS)           ))  /* MOVZBL, register source */
+#define MOVZBLmr(MD, MB, MI, MS, RD)   (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fb6              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* MOVZBL, memory source */
 
-#define ORBrr(RS, RD)                  _O_Mrm          (0x08           
,_b11,_r1(RS),_r1(RD)                           )
-#define ORBmr(MD, MB, MI, MS, RD)      _O_r_X          (0x0a                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define ORBrm(RS, MD, MB, MI, MS)      _O_r_X          (0x08                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define ORBir(IM, RD)                  _O_Mrm_B        (0x80           
,_b11,_b001  ,_r1(RD)                   ,_su8(IM))
-#define ORBim(IM, MD, MB, MI, MS)      _O_r_X_B        (0x80                
,_b001             ,MD,MB,MI,MS    ,_su8(IM))
+#define MOVSBQrr(RS, RD)               (_REXQrr(RD, RS),               _OO_Mrm (0x0fbe         ,_b11,_r8(RD),_r1(RS)           ))  /* byte -> quad moves: _REXQ* helpers; 0x0fbe (MOVS*) / 0x0fb6 (MOVZ*); destination via _r8, source via _r1 */
+#define MOVSBQmr(MD, MB, MI, MS, RD)   (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fbe              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* MOVSBQ, memory source */
+#define MOVZBQrr(RS, RD)               (_REXQrr(RD, RS),               _OO_Mrm (0x0fb6         ,_b11,_r8(RD),_r1(RS)           ))  /* MOVZBQ, register source */
+#define MOVZBQmr(MD, MB, MI, MS, RD)   (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fb6              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* MOVZBQ, memory source */
 
-#define ORWrr(RS, RD)                  _wO_Mrm         (0x09           
,_b11,_r2(RS),_r2(RD)                           )
-#define ORWmr(MD, MB, MI, MS, RD)      _wO_r_X         (0x0b                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define ORWrm(RS, MD, MB, MI, MS)      _wO_r_X         (0x09                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define ORWir(IM, RD)                  _wOs_Mrm_sW     (0x81           
,_b11,_b001  ,_r2(RD)                   ,_su16(IM))
-#define ORWim(IM, MD, MB, MI, MS)      _wOs_r_X_sW     (0x81                
,_b001             ,MD,MB,MI,MS    ,_su16(IM))
+#define MOVSWLrr(RS, RD)               (_REXLrr(RD, RS),               _OO_Mrm (0x0fbf         ,_b11,_r4(RD),_r2(RS)           ))  /* word -> long moves: 0x0fbf (MOVS*) / 0x0fb7 (MOVZ*); destination via _r4, source via _r2 */
+#define MOVSWLmr(MD, MB, MI, MS, RD)   (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fbf              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* MOVSWL, memory source */
+#define MOVZWLrr(RS, RD)               (_REXLrr(RD, RS),               _OO_Mrm (0x0fb7         ,_b11,_r4(RD),_r2(RS)           ))  /* MOVZWL, register source */
+#define MOVZWLmr(MD, MB, MI, MS, RD)   (_REXLmr(MB, MI, RD),           _OO_r_X (0x0fb7              ,_r4(RD)           ,MD,MB,MI,MS    ))  /* MOVZWL, memory source */
 
-#define ORLrr(RS, RD)                  _O_Mrm          (0x09           
,_b11,_r4(RS),_r4(RD)                           )
-#define ORLmr(MD, MB, MI, MS, RD)      _O_r_X          (0x0b                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define ORLrm(RS, MD, MB, MI, MS)      _O_r_X          (0x09                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define ORLir(IM, RD)                  _Os_Mrm_sL      (0x81           
,_b11,_b001  ,_r4(RD)                   ,IM     )
-#define ORLim(IM, MD, MB, MI, MS)      _Os_r_X_sL      (0x81                
,_b001             ,MD,MB,MI,MS    ,IM     )
+#define MOVSWQrr(RS, RD)               (_REXQrr(RD, RS),               _OO_Mrm (0x0fbf         ,_b11,_r8(RD),_r2(RS)           ))  /* word -> quad moves: _REXQ* helpers; 0x0fbf (MOVS*) / 0x0fb7 (MOVZ*); destination via _r8, source via _r2 */
+#define MOVSWQmr(MD, MB, MI, MS, RD)   (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fbf              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* MOVSWQ, memory source */
+#define MOVZWQrr(RS, RD)               (_REXQrr(RD, RS),               _OO_Mrm (0x0fb7         ,_b11,_r8(RD),_r2(RS)           ))  /* MOVZWQ, register source */
+#define MOVZWQmr(MD, MB, MI, MS, RD)   (_REXQmr(MB, MI, RD),           _OO_r_X (0x0fb7              ,_r8(RD)           ,MD,MB,MI,MS    ))  /* MOVZWQ, memory source */
 
+#define MOVSLQrr(RS, RD)               _m64only((_REXQrr(RD, RS),      _O_Mrm  (0x63           ,_b11,_r8(RD),_r4(RS)           )))  /* long -> quad move, opcode 0x63; the comma expression is parenthesized so _m64only gets a single argument */
+#define MOVSLQmr(MD, MB, MI, MS, RD)   _m64only((_REXQmr(MB, MI, RD),  _O_r_X  (0x63                ,_r8(RD)           ,MD,MB,MI,MS    )))  /* MOVSLQ, memory source */
 
-#define POPWr(RD)                      _wOr            (0x58,_r2(RD)           
                                        )
-#define POPWm(MD,MB,MI,MS)             _wO_r_X         (0x8f                
,_b000             ,MD,MB,MI,MS            )
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define POPLr(RD)                      _Or             (0x58,_r4(RD)           
                                        )
-#define POPLm(MD,MB,MI,MS)             _O_r_X          (0x8f                
,_b000             ,MD,MB,MI,MS            )
+#define LEALmr(MD, MB, MI, MS, RD)     (_REXLmr(MB, MI, RD),           _O_r_X  (0x8d                ,_r4(RD)           ,MD,MB,MI,MS    ))  /* LEA into long register RD from memory operand MD/MB/MI/MS; opcode 0x8d */
 
+#define BSWAPLr(R)                     (_REXLrr(0, R),                 _OOr    (0x0fc8,_r4(R)                                  ))  /* BSWAP: opcode 0x0fc8 combined with the register by _OOr; long register */
+#define BSWAPQr(R)                     (_REXQrr(0, R),                 _OOr    (0x0fc8,_r8(R)                                  ))  /* BSWAP, quad register: _REXQrr helper and _r8 */
 
-#define POPA_()                                _wO             (0x61           
                                                )
-#define POPAD_()                       _O              (0x61                   
                                        )
+#define CLC()                                                          _O      (0xf8                                           )  /* single-opcode instruction 0xf8, no operands */
+#define STC()                                                          _O      (0xf9                                           )  /* single-opcode instruction 0xf9, no operands */
 
-#define POPF_()                                _wO             (0x9d           
                                                )
-#define POPFD_()                       _O              (0x9d                   
                                        )
+#define CMC()                                                          _O      (0xf5                                           )  /* single-opcode instruction 0xf5, no operands */
+#define CLD()                                                          _O      (0xfc                                           )  /* single-opcode instruction 0xfc, no operands */
+#define STD()                                                          _O      (0xfd                                           )  /* single-opcode instruction 0xfd, no operands */
 
+#define CBTW()                         (_d16(),                        _O      (0x98                                           ))  /* opcode 0x98 preceded by _d16() (word form) */
+#define CWTL()                                                         _O      (0x98                                           )  /* opcode 0x98 with no prefix helper (long form) */
+#define CLTQ()                         _m64only((_REXQrr(0, 0),        _O      (0x98                                           )))  /* opcode 0x98 behind _REXQrr (quad form); the comma expression is parenthesized so _m64only receives ONE argument, as in MOVSLQrr */
 
-#define PUSHWr(R)                      _wOr            (0x50,_r2(R)            
                                        )
-#define PUSHWm(MD,MB,MI,MS)            _wO_r_X         (0xff,               
,_b110             ,MD,MB,MI,MS            )
-#define PUSHWi(IM)                     _wOs_sW         (0x68                   
                                ,IM     )
+#define CBW                            CBTW  /* alternate spelling: CBW() expands to CBTW() */
+#define CWDE                           CWTL  /* alternate spelling: CWDE() expands to CWTL() */
+#define CDQE                           CLTQ  /* alternate spelling: CDQE() expands to CLTQ() */
 
-#define PUSHLr(R)                      _Or             (0x50,_r4(R)            
                                        )
-#define PUSHLm(MD,MB,MI,MS)            _O_r_X          (0xff                
,_b110             ,MD,MB,MI,MS            )
-#define PUSHLi(IM)                     _Os_sL          (0x68                   
                                ,IM     )
-
-
-#define PUSHA_()                       _wO             (0x60                   
                                        )
-#define PUSHAD_()                      _O              (0x60                   
                                        )
-
-#define PUSHF_()                       _O              (0x9c                   
                                        )
-#define PUSHFD_()                      _wO             (0x9c                   
                                        )
-
-#define RET_()                         _O              (0xc3                   
                                        )
-#define RETi(IM)                       _O_W            (0xc2                   
                                ,_su16(IM))
+#define CWTD()                         (_d16(),                        _O      (0x99                                           ))  /* opcode 0x99 preceded by _d16() (word form) */
+#define CLTD()                                                         _O      (0x99                                           )  /* opcode 0x99 with no prefix helper (long form) */
+#define CQTO()                         _m64only((_REXQrr(0, 0),        _O      (0x99                                           )))  /* opcode 0x99 behind _REXQrr (quad form); the comma expression is parenthesized so _m64only receives ONE argument, as in MOVSLQrr */
 
+#define CWD                            CWTD  /* alternate spelling: CWD() expands to CWTD() */
+#define CDQ                            CLTD  /* alternate spelling: CDQ() expands to CLTD() */
+#define CQO                            CQTO  /* alternate spelling: CQO() expands to CQTO() */
 
-#define ROLBir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd0   
,_b11,_b000,_r1(RD)                             ) : \
-                                               _O_Mrm_B        (0xc0   
,_b11,_b000,_r1(RD)                     ,_u8(IM) ) )
-#define ROLBim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd0        
,_b000             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc0        
,_b000             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define ROLBrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd2   
,_b11,_b000,_r1(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define ROLBrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd2        
,_b000             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define ROLWir(IM,RD)          (((IM)==1) ?    _wO_Mrm (0xd1   
,_b11,_b000,_r2(RD)                             ) : \
-                                               _wO_Mrm_B       (0xc1   
,_b11,_b000,_r2(RD)                     ,_u8(IM) ) )
-#define ROLWim(IM,MD,MB,MS,MI) (((IM)==1) ?    _wO_r_X (0xd1        ,_b000     
        ,MD,MB,MI,MS            ) : \
-                                               _wO_r_X_B       (0xc1        
,_b000             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define ROLWrr(RS,RD)          (((RS)==_CL) ?  _wO_Mrm (0xd3   
,_b11,_b000,_r2(RD)                             ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-#define ROLWrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _wO_r_X (0xd3        ,_b000     
        ,MD,MB,MI,MS            ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-
-#define ROLLir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd1   
,_b11,_b000,_r4(RD)                             ) : \
-                                               _O_Mrm_B        (0xc1   
,_b11,_b000,_r4(RD)                     ,_u8(IM) ) )
-#define ROLLim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd1        
,_b000             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc1        
,_b000             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define ROLLrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd3   
,_b11,_b000,_r4(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define ROLLrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd3        
,_b000             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
+#define LAHF_()                                _m32only(                       _O      (0x9f                                   ))  /* opcode 0x9f, wrapped in _m32only */
+#define SAHF_()                                _m32only(                       _O      (0x9e                                   ))  /* opcode 0x9e, wrapped in _m32only */
 
+/*                                                                     _format         Opcd            ,Mod ,r     ,m          ,mem=dsp+sib    ,imm... */
 
-#define RORBir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd0   
,_b11,_b001,_r1(RD)                             ) : \
-                                               _O_Mrm_B        (0xc0   
,_b11,_b001,_r1(RD)                     ,_u8(IM) ) )
-#define RORBim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd0        
,_b001             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc0        
,_b001             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define RORBrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd2   
,_b11,_b001,_r1(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define RORBrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd2        
,_b001             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define RORWir(IM,RD)          (((IM)==1) ?    _wO_Mrm (0xd1   
,_b11,_b001,_r2(RD)                             ) : \
-                                               _wO_Mrm_B       (0xc1   
,_b11,_b001,_r2(RD)                     ,_u8(IM) ) )
-#define RORWim(IM,MD,MB,MS,MI) (((IM)==1) ?    _wO_r_X (0xd1        ,_b001     
        ,MD,MB,MI,MS            ) : \
-                                               _wO_r_X_B       (0xc1        
,_b001             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define RORWrr(RS,RD)          (((RS)==_CL) ?  _wO_Mrm (0xd3   
,_b11,_b001,_r2(RD)                             ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-#define RORWrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _wO_r_X (0xd3        ,_b001     
        ,MD,MB,MI,MS            ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-
-#define RORLir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd1   
,_b11,_b001,_r4(RD)                             ) : \
-                                               _O_Mrm_B        (0xc1   
,_b11,_b001,_r4(RD)                     ,_u8(IM) ) )
-#define RORLim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd1        
,_b001             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc1        
,_b001             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define RORLrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd3   
,_b11,_b001,_r4(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define RORLrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd3        
,_b001             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define SAHF_()                                        _O      (0x9e           
                                                )
-
-
-#define SALBir SHLBir
-#define SALBim SHLBim
-#define SALBrr SHLBrr
-#define SALBrm SHLBrm
-#define SALWir SHLWir
-#define SALWim SHLWim
-#define SALWrr SHLWrr
-#define SALWrm SHLWrm
-#define SALLir SHLLir
-#define SALLim SHLLim
-#define SALLrr SHLLrr
-#define SALLrm SHLLrm
-
-
-#define SARBir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd0   
,_b11,_b111,_r1(RD)                             ) : \
-                                               _O_Mrm_B        (0xc0   
,_b11,_b111,_r1(RD)                     ,_u8(IM) ) )
-#define SARBim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd0        
,_b111             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc0        
,_b111             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SARBrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd2   
,_b11,_b111,_r1(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SARBrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd2        
,_b111             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define SARWir(IM,RD)          (((IM)==1) ?    _wO_Mrm (0xd1   
,_b11,_b111,_r2(RD)                             ) : \
-                                               _wO_Mrm_B       (0xc1   
,_b11,_b111,_r2(RD)                     ,_u8(IM) ) )
-#define SARWim(IM,MD,MB,MS,MI) (((IM)==1) ?    _wO_r_X (0xd1        ,_b111     
        ,MD,MB,MI,MS            ) : \
-                                               _wO_r_X_B       (0xc1        
,_b111             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SARWrr(RS,RD)          (((RS)==_CL) ?  _wO_Mrm (0xd3   
,_b11,_b111,_r2(RD)                             ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-#define SARWrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _wO_r_X (0xd3        ,_b111     
        ,MD,MB,MI,MS            ) : \
-                                               JITFAIL ("source register must 
be CL"                                   ) )
-
-#define SARLir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd1   
,_b11,_b111,_r4(RD)                             ) : \
-                                               _O_Mrm_B        (0xc1   
,_b11,_b111,_r4(RD)                     ,_u8(IM) ) )
-#define SARLim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd1        
,_b111             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc1        
,_b111             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SARLrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd3   
,_b11,_b111,_r4(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SARLrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd3        
,_b111             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define SBBBrr(RS, RD)                 _O_Mrm          (0x18           
,_b11,_r1(RS),_r1(RD)                           )
-#define SBBBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x1a                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define SBBBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x18                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define SBBBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b011  ,_r1(RD)                   ,_su8(IM))
-#define SBBBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b011             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define SBBWrr(RS, RD)                 _wO_Mrm         (0x19           
,_b11,_r2(RS),_r2(RD)                           )
-#define SBBWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x1b                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define SBBWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x19                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define SBBWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b011  ,_r2(RD)                   ,_su16(IM))
-#define SBBWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b011             ,MD,MB,MI,MS    ,_su16(IM))
-
-#define SBBLrr(RS, RD)                 _O_Mrm          (0x19           
,_b11,_r4(RS),_r4(RD)                           )
-#define SBBLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x1b                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define SBBLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x19                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define SBBLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b011  ,_r4(RD)                   ,IM     )
-#define SBBLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b011             ,MD,MB,MI,MS    ,IM     )
-
-
-#define SETCCir(CC,RD)                 _OO_Mrm         (0x0f90|(CC)    
,_b11,_b000,_r1(RD)                             )
+#define CPUID_()                                                       _OO     
        (0x0fa2                                                         )       /* CPUID: opcode 0x0fa2 (bytes 0F A2), no operands */
+#define RDTSC_()                                                       _OO     
        (0x0f31                                                         )       /* RDTSC: opcode 0x0f31 (bytes 0F 31), no operands; the first byte is the 0x0f escape, as in CPUID_ */
 
-#define SETOr(RD)                      SETCCir(0x0,RD)
-#define SETNOr(RD)                     SETCCir(0x1,RD)
-#define SETBr(RD)                      SETCCir(0x2,RD)
-#define SETNAEr(RD)                    SETCCir(0x2,RD)
-#define SETNBr(RD)                     SETCCir(0x3,RD)
-#define SETAEr(RD)                     SETCCir(0x3,RD)
-#define SETEr(RD)                      SETCCir(0x4,RD)
-#define SETZr(RD)                      SETCCir(0x4,RD)
-#define SETNEr(RD)                     SETCCir(0x5,RD)
-#define SETNZr(RD)                     SETCCir(0x5,RD)
-#define SETBEr(RD)                     SETCCir(0x6,RD)
-#define SETNAr(RD)                     SETCCir(0x6,RD)
-#define SETNBEr(RD)                    SETCCir(0x7,RD)
-#define SETAr(RD)                      SETCCir(0x7,RD)
-#define SETSr(RD)                      SETCCir(0x8,RD)
-#define SETNSr(RD)                     SETCCir(0x9,RD)
-#define SETPr(RD)                      SETCCir(0xa,RD)
-#define SETPEr(RD)                     SETCCir(0xa,RD)
-#define SETNPr(RD)                     SETCCir(0xb,RD)
-#define SETPOr(RD)                     SETCCir(0xb,RD)
-#define SETLr(RD)                      SETCCir(0xc,RD)
-#define SETNGEr(RD)                    SETCCir(0xc,RD)
-#define SETNLr(RD)                     SETCCir(0xd,RD)
-#define SETGEr(RD)                     SETCCir(0xd,RD)
-#define SETLEr(RD)                     SETCCir(0xe,RD)
-#define SETNGr(RD)                     SETCCir(0xe,RD)
-#define SETNLEr(RD)                    SETCCir(0xf,RD)
-#define SETGr(RD)                      SETCCir(0xf,RD)
+#define ENTERii(W, B)                                                  _O_W_B  
        (0xc8                                             ,_su16(W),_su8(B))    /* ENTER: opcode 0xc8, then W passed through _su16 and B through _su8 as immediates */
 
-#define SETCCim(CC,MD,MB,MI,MS)                _OO_r_X         (0x0f90|(CC)    
     ,_b000             ,MD,MB,MI,MS            )
-
-#define SETOm(D,B,I,S)                 SETCCim(0x0,D,B,I,S)
-#define SETNOm(D,B,I,S)                        SETCCim(0x1,D,B,I,S)
-#define SETBm(D,B,I,S)                 SETCCim(0x2,D,B,I,S)
-#define SETNAEm(D,B,I,S)               SETCCim(0x2,D,B,I,S)
-#define SETNBm(D,B,I,S)                        SETCCim(0x3,D,B,I,S)
-#define SETAEm(D,B,I,S)                        SETCCim(0x3,D,B,I,S)
-#define SETEm(D,B,I,S)                 SETCCim(0x4,D,B,I,S)
-#define SETZm(D,B,I,S)                 SETCCim(0x4,D,B,I,S)
-#define SETNEm(D,B,I,S)                        SETCCim(0x5,D,B,I,S)
-#define SETNZm(D,B,I,S)                        SETCCim(0x5,D,B,I,S)
-#define SETBEm(D,B,I,S)                        SETCCim(0x6,D,B,I,S)
-#define SETNAm(D,B,I,S)                        SETCCim(0x6,D,B,I,S)
-#define SETNBEm(D,B,I,S)               SETCCim(0x7,D,B,I,S)
-#define SETAm(D,B,I,S)                 SETCCim(0x7,D,B,I,S)
-#define SETSm(D,B,I,S)                 SETCCim(0x8,D,B,I,S)
-#define SETNSm(D,B,I,S)                        SETCCim(0x9,D,B,I,S)
-#define SETPm(D,B,I,S)                 SETCCim(0xa,D,B,I,S)
-#define SETPEm(D,B,I,S)                        SETCCim(0xa,D,B,I,S)
-#define SETNPm(D,B,I,S)                        SETCCim(0xb,D,B,I,S)
-#define SETPOm(D,B,I,S)                        SETCCim(0xb,D,B,I,S)
-#define SETLm(D,B,I,S)                 SETCCim(0xc,D,B,I,S)
-#define SETNGEm(D,B,I,S)               SETCCim(0xc,D,B,I,S)
-#define SETNLm(D,B,I,S)                        SETCCim(0xd,D,B,I,S)
-#define SETGEm(D,B,I,S)                        SETCCim(0xd,D,B,I,S)
-#define SETLEm(D,B,I,S)                        SETCCim(0xe,D,B,I,S)
-#define SETNGm(D,B,I,S)                        SETCCim(0xe,D,B,I,S)
-#define SETNLEm(D,B,I,S)               SETCCim(0xf,D,B,I,S)
-#define SETGm(D,B,I,S)                 SETCCim(0xf,D,B,I,S)
-
-
-#define SHLBir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd0   
,_b11,_b100,_r1(RD)                             ) : \
-                                               _O_Mrm_B        (0xc0   
,_b11,_b100,_r1(RD)                     ,_u8(IM) ) )
-#define SHLBim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd0        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc0        
,_b100             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHLBrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd2   
,_b11,_b100,_r1(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHLBrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd2        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define SHLWir(IM,RD)          (((IM)==1) ?    _wO_Mrm         (0xd1   
,_b11,_b100,_r2(RD)                             ) : \
-                                               _wO_Mrm_B       (0xc1   
,_b11,_b100,_r2(RD)                     ,_u8(IM) ) )
-#define SHLWim(IM,MD,MB,MS,MI) (((IM)==1) ?    _wO_r_X         (0xd1        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               _wO_r_X_B       (0xc1        
,_b100             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHLWrr(RS,RD)          (((RS)==_CL) ?  _wO_Mrm         (0xd3   
,_b11,_b100,_r2(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHLWrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _wO_r_X         (0xd3        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                                   ) )
-
-#define SHLLir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd1   
,_b11,_b100,_r4(RD)                             ) : \
-                                               _O_Mrm_B        (0xc1   
,_b11,_b100,_r4(RD)                     ,_u8(IM) ) )
-#define SHLLim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd1        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc1        
,_b100             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHLLrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd3   
,_b11,_b100,_r4(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHLLrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd3        
,_b100             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define SHRBir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd0   
,_b11,_b101,_r1(RD)                             ) : \
-                                               _O_Mrm_B        (0xc0   
,_b11,_b101,_r1(RD)                     ,_u8(IM) ) )
-#define SHRBim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd0        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc0        
,_b101             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHRBrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd2   
,_b11,_b101,_r1(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHRBrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd2        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define SHRWir(IM,RD)          (((IM)==1) ?    _wO_Mrm         (0xd1   
,_b11,_b101,_r2(RD)                             ) : \
-                                               _wO_Mrm_B       (0xc1   
,_b11,_b101,_r2(RD)                     ,_u8(IM) ) )
-#define SHRWim(IM,MD,MB,MS,MI) (((IM)==1) ?    _wO_r_X         (0xd1        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               _wO_r_X_B       (0xc1        
,_b101             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHRWrr(RS,RD)          (((RS)==_CL) ?  _wO_Mrm         (0xd3   
,_b11,_b101,_r2(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHRWrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _wO_r_X         (0xd3        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-#define SHRLir(IM,RD)          (((IM)==1) ?    _O_Mrm          (0xd1   
,_b11,_b101,_r4(RD)                             ) : \
-                                               _O_Mrm_B        (0xc1   
,_b11,_b101,_r4(RD)                     ,_u8(IM) ) )
-#define SHRLim(IM,MD,MB,MS,MI) (((IM)==1) ?    _O_r_X          (0xd1        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               _O_r_X_B        (0xc1        
,_b101             ,MD,MB,MI,MS    ,_u8(IM) ) )
-#define SHRLrr(RS,RD)          (((RS)==_CL) ?  _O_Mrm          (0xd3   
,_b11,_b101,_r4(RD)                             ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-#define SHRLrm(RS,MD,MB,MS,MI) (((RS)==_CL) ?  _O_r_X          (0xd3        
,_b101             ,MD,MB,MI,MS            ) : \
-                                               JITFAIL         ("source 
register must be CL"                           ) )
-
-
-#define STC_()                         _O              (0xf9                   
                                        )
-
-
-#define SUBBrr(RS, RD)                 _O_Mrm          (0x28           
,_b11,_r1(RS),_r1(RD)                           )
-#define SUBBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x2a                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define SUBBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x28                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define SUBBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b101  ,_r1(RD)                   ,_su8(IM))
-#define SUBBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b101             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define SUBWrr(RS, RD)                 _wO_Mrm         (0x29           
,_b11,_r2(RS),_r2(RD)                           )
-#define SUBWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x2b                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define SUBWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x29                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define SUBWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b101  ,_r2(RD)                   ,_su16(IM))
-#define SUBWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b101             ,MD,MB,MI,MS    ,_su16(IM))
-
-#define SUBLrr(RS, RD)                 _O_Mrm          (0x29           
,_b11,_r4(RS),_r4(RD)                           )
-#define SUBLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x2b                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define SUBLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x29                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define SUBLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b101  ,_r4(RD)                   ,IM     )
-#define SUBLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b101             ,MD,MB,MI,MS    ,IM     )
-
-
-#define TESTBrr(RS, RD)                        _O_Mrm          (0x84           
,_b11,_r1(RS),_r1(RD)                           )
-#define TESTBrm(RS, MD, MB, MI, MS)    _O_r_X          (0x84                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define TESTBir(IM, RD)                        _O_Mrm_B        (0xf6           
,_b11,_b000  ,_r1(RD)                   ,_u8(IM))
-#define TESTBim(IM, MD, MB, MI, MS)    _O_r_X_B        (0xf6                
,_b000             ,MD,MB,MI,MS    ,_u8(IM))
-
-#define TESTWrr(RS, RD)                        _wO_Mrm         (0x85           
,_b11,_r2(RS),_r2(RD)                           )
-#define TESTWrm(RS, MD, MB, MI, MS)    _wO_r_X         (0x85                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define TESTWir(IM, RD)                        _wO_Mrm_W       (0xf7           
,_b11,_b000  ,_r2(RD)                   ,_u16(IM))
-#define TESTWim(IM, MD, MB, MI, MS)    _wO_r_X_W       (0xf7                
,_b000             ,MD,MB,MI,MS    ,_u16(IM))
-
-#define TESTLrr(RS, RD)                        _O_Mrm          (0x85           
,_b11,_r4(RS),_r4(RD)                           )
-#define TESTLrm(RS, MD, MB, MI, MS)    _O_r_X          (0x85                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define TESTLir(IM, RD)                        _O_Mrm_L        (0xf7           
,_b11,_b000  ,_r4(RD)                   ,IM     )
-#define TESTLim(IM, MD, MB, MI, MS)    _O_r_X_L        (0xf7                
,_b000             ,MD,MB,MI,MS    ,IM     )
-
-
-#define XADDBrr(RS,RD)                 _OO_Mrm         (0x0fc0         
,_b11,_r1(RS),_r1(RD)                           )
-#define XADDBrm(RS,MD,MB,MI,MS)                _OO_r_X         (0x0fc0         
     ,_r1(RS)           ,MD,MB,MI,MS            )
-
-#define XADDWrr(RS,RD)                 _wOO_Mrm        (0x0fc1         
,_b11,_r2(RS),_r2(RD)                           )
-#define XADDWrm(RS,MD,MB,MI,MS)                _wOO_r_X        (0x0fc1         
     ,_r2(RS)           ,MD,MB,MI,MS            )
-
-#define XADDLrr(RS,RD)                 _OO_Mrm         (0x0fc1         
,_b11,_r4(RS),_r4(RD)                           )
-#define XADDLrm(RS,MD,MB,MI,MS)                _OO_r_X         (0x0fc1         
     ,_r4(RS)           ,MD,MB,MI,MS            )
-
-
-#define XCHGBrr(RS,RD)                 _O_Mrm          (0x86           
,_b11,_r1(RS),_r1(RD)                           )
-#define XCHGBrm(RS,MD,MB,MI,MS)                _O_r_X          (0x86           
     ,_r1(RS)           ,MD,MB,MI,MS            )
-
-#define XCHGWrr(RS,RD)                 _wO_Mrm         (0x87           
,_b11,_r2(RS),_r2(RD)                           )
-#define XCHGWrm(RS,MD,MB,MI,MS)                _wO_r_X         (0x87           
     ,_r2(RS)           ,MD,MB,MI,MS            )
-
-#define XCHGLrr(RS,RD)                 _O_Mrm          (0x87           
,_b11,_r4(RS),_r4(RD)                           )
-#define XCHGLrm(RS,MD,MB,MI,MS)                _O_r_X          (0x87           
     ,_r4(RS)           ,MD,MB,MI,MS            )
-
-
-#define XORBrr(RS, RD)                 _O_Mrm          (0x30           
,_b11,_r1(RS),_r1(RD)                           )
-#define XORBmr(MD, MB, MI, MS, RD)     _O_r_X          (0x32                
,_r1(RD)           ,MD,MB,MI,MS            )
-#define XORBrm(RS, MD, MB, MI, MS)     _O_r_X          (0x30                
,_r1(RS)           ,MD,MB,MI,MS            )
-#define XORBir(IM, RD)                 _O_Mrm_B        (0x80           
,_b11,_b110  ,_r1(RD)                   ,_su8(IM))
-#define XORBim(IM, MD, MB, MI, MS)     _O_r_X_B        (0x80                
,_b110             ,MD,MB,MI,MS    ,_su8(IM))
-
-#define XORWrr(RS, RD)                 _wO_Mrm         (0x31           
,_b11,_r2(RS),_r2(RD)                           )
-#define XORWmr(MD, MB, MI, MS, RD)     _wO_r_X         (0x33                
,_r2(RD)           ,MD,MB,MI,MS            )
-#define XORWrm(RS, MD, MB, MI, MS)     _wO_r_X         (0x31                
,_r2(RS)           ,MD,MB,MI,MS            )
-#define XORWir(IM, RD)                 _wOs_Mrm_sW     (0x81           
,_b11,_b110  ,_r2(RD)                   ,_su16(IM))
-#define XORWim(IM, MD, MB, MI, MS)     _wOs_r_X_sW     (0x81                
,_b110             ,MD,MB,MI,MS    ,_su16(IM))
+#define LEAVE_()                                                       _O      
        (0xc9                                                           )       /* LEAVE: opcode 0xc9, no operands */
+#define RET_()                                                         _O      
        (0xc3                                                           )       /* RET: opcode 0xc3, no operands */
+#define RETi(IM)                                                       _O_W    
        (0xc2                                                   ,_su16(IM))     /* RET imm: opcode 0xc2 followed by IM passed through _su16 */
 
-#define XORLrr(RS, RD)                 _O_Mrm          (0x31           
,_b11,_r4(RS),_r4(RD)                           )
-#define XORLmr(MD, MB, MI, MS, RD)     _O_r_X          (0x33                
,_r4(RD)           ,MD,MB,MI,MS            )
-#define XORLrm(RS, MD, MB, MI, MS)     _O_r_X          (0x31                
,_r4(RS)           ,MD,MB,MI,MS            )
-#define XORLir(IM, RD)                 _Os_Mrm_sL      (0x81           
,_b11,_b110  ,_r4(RD)                   ,IM     )
-#define XORLim(IM, MD, MB, MI, MS)     _Os_r_X_sL      (0x81                
,_b110             ,MD,MB,MI,MS    ,IM     )
+#define NOP_()                                                         _O      
        (0x90                                                           )       /* NOP: opcode 0x90, no operands */
 
 /* x87 instructions -- yay, we found a use for octal constants :-) */
 
-#define ESCmi(D,B,I,S,OP)      _O_r_X(0xd8|(OP >> 3), (OP & 7), D,B,I,S)
+#define ESCmi(D,B,I,S,OP)      (_REXLrm(0,B,I), _O_r_X(0xd8|((OP) >> 3), ((OP) & 
7), D,B,I,S))   /* x87 escape with memory operand: (OP)>>3 is OR-ed into opcode 0xd8, (OP)&7 goes in the r slot; OP is parenthesized so expression arguments (e.g. a|b) split correctly */
 #define ESCri(RD,OP)           _O_Mrm(0xd8|(OP >> 3), _b11, (OP & 7), RD)
 
 #define ESCrri(RS,RD,OP)       ((RS) == _ST0 ? ESCri(RD,(OP|040))              
        \
@@ -1044,263 +1449,6 @@ typedef _uc             jit_insn;
                          ( ((N)&7) == 0) ? 0 : \
                          JITFAIL(".align argument too large")))
 
-/* --- Media 128-bit instructions ------------------------------------------ */
-
-enum {
-  X86_SSE_CVTIS  = 0x2a,
-  X86_SSE_CVTSI  = 0x2d,
-  X86_SSE_UCOMI  = 0x2e,
-  X86_SSE_COMI   = 0x2f,
-  X86_SSE_SQRT   = 0x51,
-  X86_SSE_RSQRT  = 0x52,
-  X86_SSE_RCP    = 0x53,
-  X86_SSE_AND    = 0x54,
-  X86_SSE_ANDN   = 0x55,
-  X86_SSE_OR     = 0x56,
-  X86_SSE_XOR    = 0x57,
-  X86_SSE_ADD    = 0x58,
-  X86_SSE_MUL    = 0x59,
-  X86_SSE_CVTSD  = 0x5a,
-  X86_SSE_CVTDT  = 0x5b,
-  X86_SSE_SUB    = 0x5c,
-  X86_SSE_MIN    = 0x5d,
-  X86_SSE_DIV    = 0x5e,
-  X86_SSE_MAX    = 0x5f,
-};
-
-/*                                                                     _format 
        Opcd            ,Mod ,r      ,m         ,mem=dsp+sib    ,imm... */
-
-#define __SSELrr(OP,RS,RSA,RD,RDA)     (_REXLrr(RD, RS),               _OO_Mrm 
        (0x0f00|(OP)    ,_b11,RDA(RD),RSA(RS)                           ))
-#define __SSELmr(OP,MD,MB,MI,MS,RD,RDA)        (_REXLmr(MB, MI, RD),           
_OO_r_X         (0x0f00|(OP)         ,RDA(RD)           ,MD,MB,MI,MS            
))
-#define __SSELrm(OP,RS,RSA,MD,MB,MI,MS)        (_REXLrm(RS, MB, MI),           
_OO_r_X         (0x0f00|(OP)         ,RSA(RS)           ,MD,MB,MI,MS            
))
-
-#define __SSEQrr(OP,RS,RSA,RD,RDA)     (_REXQrr(RD, RS),               _OO_Mrm 
        (0x0f00|(OP)    ,_b11,RDA(RD),RSA(RS)                           ))
-#define __SSEQmr(OP,MD,MB,MI,MS,RD,RDA)        (_REXQmr(MB, MI, RD),           
_OO_r_X         (0x0f00|(OP)         ,RDA(RD)           ,MD,MB,MI,MS            
))
-#define __SSEQrm(OP,RS,RSA,MD,MB,MI,MS)        (_REXQrm(RS, MB, MI),           
_OO_r_X         (0x0f00|(OP)         ,RSA(RS)           ,MD,MB,MI,MS            
))
-
-#define _SSELrr(PX,OP,RS,RSA,RD,RDA)                                   
(_B(PX), __SSELrr(OP, RS, RSA, RD, RDA))
-#define _SSELmr(PX,OP,MD,MB,MI,MS,RD,RDA)                              
(_B(PX), __SSELmr(OP, MD, MB, MI, MS, RD, RDA))
-#define _SSELrm(PX,OP,RS,RSA,MD,MB,MI,MS)                              
(_B(PX), __SSELrm(OP, RS, RSA, MD, MB, MI, MS))
-
-#define _SSEQrr(PX,OP,RS,RSA,RD,RDA)                                   
(_B(PX), __SSEQrr(OP, RS, RSA, RD, RDA))
-#define _SSEQmr(PX,OP,MD,MB,MI,MS,RD,RDA)                              
(_B(PX), __SSEQmr(OP, MD, MB, MI, MS, RD, RDA))
-#define _SSEQrm(PX,OP,RS,RSA,MD,MB,MI,MS)                              
(_B(PX), __SSEQrm(OP, RS, RSA, MD, MB, MI, MS))
-
-#define _SSEPSrr(OP,RS,RD)             __SSELrr(      OP, RS,_rX, RD,_rX)
-#define _SSEPSmr(OP,MD,MB,MI,MS,RD)    __SSELmr(      OP, MD, MB, MI, MS, 
RD,_rX)
-#define _SSEPSrm(OP,RS,MD,MB,MI,MS)    __SSELrm(      OP, RS,_rX, MD, MB, MI, 
MS)
-
-#define _SSEPDrr(OP,RS,RD)              _SSELrr(0x66, OP, RS,_rX, RD,_rX)
-#define _SSEPDmr(OP,MD,MB,MI,MS,RD)     _SSELmr(0x66, OP, MD, MB, MI, MS, 
RD,_rX)
-#define _SSEPDrm(OP,RS,MD,MB,MI,MS)     _SSELrm(0x66, OP, RS,_rX, MD, MB, MI, 
MS)
-
-#define _SSESSrr(OP,RS,RD)              _SSELrr(0xf3, OP, RS,_rX, RD,_rX)
-#define _SSESSmr(OP,MD,MB,MI,MS,RD)     _SSELmr(0xf3, OP, MD, MB, MI, MS, 
RD,_rX)
-#define _SSESSrm(OP,RS,MD,MB,MI,MS)     _SSELrm(0xf3, OP, RS,_rX, MD, MB, MI, 
MS)
-
-#define _SSESDrr(OP,RS,RD)              _SSELrr(0xf2, OP, RS,_rX, RD,_rX)
-#define _SSESDmr(OP,MD,MB,MI,MS,RD)     _SSELmr(0xf2, OP, MD, MB, MI, MS, 
RD,_rX)
-#define _SSESDrm(OP,RS,MD,MB,MI,MS)     _SSELrm(0xf2, OP, RS,_rX, MD, MB, MI, 
MS)
-
-#define ADDPSrr(RS, RD)                        _SSEPSrr(X86_SSE_ADD, RS, RD)
-#define ADDPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
-#define ADDPDrr(RS, RD)                        _SSEPDrr(X86_SSE_ADD, RS, RD)
-#define ADDPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
-
-#define ADDSSrr(RS, RD)                        _SSESSrr(X86_SSE_ADD, RS, RD)
-#define ADDSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
-#define ADDSDrr(RS, RD)                        _SSESDrr(X86_SSE_ADD, RS, RD)
-#define ADDSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_ADD, MD, MB, MI, MS, 
RD)
-
-#define ANDNPSrr(RS, RD)               _SSEPSrr(X86_SSE_ANDN, RS, RD)
-#define ANDNPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_ANDN, MD, MB, MI, MS, 
RD)
-#define ANDNPDrr(RS, RD)               _SSEPDrr(X86_SSE_ANDN, RS, RD)
-#define ANDNPDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_ANDN, MD, MB, MI, MS, 
RD)
-
-#define ANDPSrr(RS, RD)                        _SSEPSrr(X86_SSE_AND, RS, RD)
-#define ANDPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_AND, MD, MB, MI, MS, 
RD)
-#define ANDPDrr(RS, RD)                        _SSEPDrr(X86_SSE_AND, RS, RD)
-#define ANDPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_AND, MD, MB, MI, MS, 
RD)
-
-#define DIVPSrr(RS, RD)                        _SSEPSrr(X86_SSE_DIV, RS, RD)
-#define DIVPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
-#define DIVPDrr(RS, RD)                        _SSEPDrr(X86_SSE_DIV, RS, RD)
-#define DIVPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
-
-#define DIVSSrr(RS, RD)                        _SSESSrr(X86_SSE_DIV, RS, RD)
-#define DIVSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
-#define DIVSDrr(RS, RD)                        _SSESDrr(X86_SSE_DIV, RS, RD)
-#define DIVSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_DIV, MD, MB, MI, MS, 
RD)
-
-#define MAXPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MAX, RS, RD)
-#define MAXPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
-#define MAXPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MAX, RS, RD)
-#define MAXPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
-
-#define MAXSSrr(RS, RD)                        _SSESSrr(X86_SSE_MAX, RS, RD)
-#define MAXSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
-#define MAXSDrr(RS, RD)                        _SSESDrr(X86_SSE_MAX, RS, RD)
-#define MAXSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MAX, MD, MB, MI, MS, 
RD)
-
-#define MINPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MIN, RS, RD)
-#define MINPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
-#define MINPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MIN, RS, RD)
-#define MINPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
-
-#define MINSSrr(RS, RD)                        _SSESSrr(X86_SSE_MIN, RS, RD)
-#define MINSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
-#define MINSDrr(RS, RD)                        _SSESDrr(X86_SSE_MIN, RS, RD)
-#define MINSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MIN, MD, MB, MI, MS, 
RD)
-
-#define MULPSrr(RS, RD)                        _SSEPSrr(X86_SSE_MUL, RS, RD)
-#define MULPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
-#define MULPDrr(RS, RD)                        _SSEPDrr(X86_SSE_MUL, RS, RD)
-#define MULPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
-
-#define MULSSrr(RS, RD)                        _SSESSrr(X86_SSE_MUL, RS, RD)
-#define MULSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
-#define MULSDrr(RS, RD)                        _SSESDrr(X86_SSE_MUL, RS, RD)
-#define MULSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_MUL, MD, MB, MI, MS, 
RD)
-
-#define ORPSrr(RS, RD)                 _SSEPSrr(X86_SSE_OR, RS, RD)
-#define ORPSmr(MD, MB, MI, MS, RD)     _SSEPSmr(X86_SSE_OR, MD, MB, MI, MS, RD)
-#define ORPDrr(RS, RD)                 _SSEPDrr(X86_SSE_OR, RS, RD)
-#define ORPDmr(MD, MB, MI, MS, RD)     _SSEPDmr(X86_SSE_OR, MD, MB, MI, MS, RD)
-
-#define RCPPSrr(RS, RD)                        _SSEPSrr(X86_SSE_RCP, RS, RD)
-#define RCPPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_RCP, MD, MB, MI, MS, 
RD)
-#define RCPSSrr(RS, RD)                        _SSESSrr(X86_SSE_RCP, RS, RD)
-#define RCPSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_RCP, MD, MB, MI, MS, 
RD)
-
-#define RSQRTPSrr(RS, RD)              _SSEPSrr(X86_SSE_RSQRT, RS, RD)
-#define RSQRTPSmr(MD, MB, MI, MS, RD)  _SSEPSmr(X86_SSE_RSQRT, MD, MB, MI, MS, 
RD)
-#define RSQRTSSrr(RS, RD)              _SSESSrr(X86_SSE_RSQRT, RS, RD)
-#define RSQRTSSmr(MD, MB, MI, MS, RD)  _SSESSmr(X86_SSE_RSQRT, MD, MB, MI, MS, 
RD)
-
-#define SQRTPSrr(RS, RD)               _SSEPSrr(X86_SSE_SQRT, RS, RD)
-#define SQRTPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
-#define SQRTPDrr(RS, RD)               _SSEPDrr(X86_SSE_SQRT, RS, RD)
-#define SQRTPDmr(MD, MB, MI, MS, RD)   _SSEPDmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
-
-#define SQRTSSrr(RS, RD)               _SSESSrr(X86_SSE_SQRT, RS, RD)
-#define SQRTSSmr(MD, MB, MI, MS, RD)   _SSESSmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
-#define SQRTSDrr(RS, RD)               _SSESDrr(X86_SSE_SQRT, RS, RD)
-#define SQRTSDmr(MD, MB, MI, MS, RD)   _SSESDmr(X86_SSE_SQRT, MD, MB, MI, MS, 
RD)
-
-#define SUBPSrr(RS, RD)                        _SSEPSrr(X86_SSE_SUB, RS, RD)
-#define SUBPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
-#define SUBPDrr(RS, RD)                        _SSEPDrr(X86_SSE_SUB, RS, RD)
-#define SUBPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
-
-#define SUBSSrr(RS, RD)                        _SSESSrr(X86_SSE_SUB, RS, RD)
-#define SUBSSmr(MD, MB, MI, MS, RD)    _SSESSmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
-#define SUBSDrr(RS, RD)                        _SSESDrr(X86_SSE_SUB, RS, RD)
-#define SUBSDmr(MD, MB, MI, MS, RD)    _SSESDmr(X86_SSE_SUB, MD, MB, MI, MS, 
RD)
-
-#define XORPSrr(RS, RD)                        _SSEPSrr(X86_SSE_XOR, RS, RD)
-#define XORPSmr(MD, MB, MI, MS, RD)    _SSEPSmr(X86_SSE_XOR, MD, MB, MI, MS, 
RD)
-#define XORPDrr(RS, RD)                        _SSEPDrr(X86_SSE_XOR, RS, RD)
-#define XORPDmr(MD, MB, MI, MS, RD)    _SSEPDmr(X86_SSE_XOR, MD, MB, MI, MS, 
RD)
-
-#define COMISSrr(RS, RD)               _SSESSrr(X86_SSE_COMI, RS, RD)
-#define COMISSmr(MD, MB, MI, MS, RD)   _SSESSmr(X86_SSE_COMI, MD, MB, MI, MS, 
RD)
-#define COMISDrr(RS, RD)               _SSESDrr(X86_SSE_COMI, RS, RD)
-#define COMISDmr(MD, MB, MI, MS, RD)   _SSESDmr(X86_SSE_COMI, MD, MB, MI, MS, 
RD)
-
-#define UCOMISSrr(RS, RD)              _SSESSrr(X86_SSE_UCOMI, RS, RD)
-#define UCOMISSmr(MD, MB, MI, MS, RD)  _SSESSmr(X86_SSE_UCOMI, MD, MB, MI, MS, 
RD)
-#define UCOMISDrr(RS, RD)              _SSESDrr(X86_SSE_UCOMI, RS, RD)
-#define UCOMISDmr(MD, MB, MI, MS, RD)  _SSESDmr(X86_SSE_UCOMI, MD, MB, MI, MS, 
RD)
-
-#define MOVAPSrr(RS, RD)               _SSEPSrr(0x28, RS, RD)
-#define MOVAPSmr(MD, MB, MI, MS, RD)   _SSEPSmr(0x28, MD, MB, MI, MS, RD)
-#define MOVAPSrm(RS, MD, MB, MI, MS)   _SSEPSrm(0x29, RS, MD, MB, MI, MS)
-
-#define MOVAPDrr(RS, RD)               _SSEPDrr(0x28, RS, RD)
-#define MOVAPDmr(MD, MB, MI, MS, RD)   _SSEPDmr(0x28, MD, MB, MI, MS, RD)
-#define MOVAPDrm(RS, MD, MB, MI, MS)   _SSEPDrm(0x29, RS, MD, MB, MI, MS)
-
-#define CVTPS2PIrr(RS, RD)             __SSELrr(      X86_SSE_CVTSI, RS,_rX, 
RD,_rM)
-#define CVTPS2PImr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTSI, MD, MB, 
MI, MS, RD,_rM)
-#define CVTPD2PIrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTSI, RS,_rX, 
RD,_rM)
-#define CVTPD2PImr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTSI, MD, MB, 
MI, MS, RD,_rM)
-
-#define CVTPI2PSrr(RS, RD)             __SSELrr(      X86_SSE_CVTIS, RS,_rM, 
RD,_rX)
-#define CVTPI2PSmr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTIS, MD, MB, 
MI, MS, RD,_rX)
-#define CVTPI2PDrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTIS, RS,_rM, 
RD,_rX)
-#define CVTPI2PDmr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTIS, MD, MB, 
MI, MS, RD,_rX)
-
-#define CVTPS2PDrr(RS, RD)             __SSELrr(      X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
-#define CVTPS2PDmr(MD, MB, MI, MS, RD) __SSELmr(      X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
-#define CVTPD2PSrr(RS, RD)              _SSELrr(0x66, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
-#define CVTPD2PSmr(MD, MB, MI, MS, RD)  _SSELmr(0x66, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
-
-#define CVTSS2SDrr(RS, RD)              _SSELrr(0xf3, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
-#define CVTSS2SDmr(MD, MB, MI, MS, RD)  _SSELmr(0xf3, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
-#define CVTSD2SSrr(RS, RD)              _SSELrr(0xf2, X86_SSE_CVTSD, RS,_rX, 
RD,_rX)
-#define CVTSD2SSmr(MD, MB, MI, MS, RD)  _SSELmr(0xf2, X86_SSE_CVTSD, MD, MB, 
MI, MS, RD,_rX)
-
-#define CVTSS2SILrr(RS, RD)             _SSELrr(0xf3, X86_SSE_CVTSI, RS,_rX, 
RD,_r4)
-#define CVTSS2SILmr(MD, MB, MI, MS, RD)         _SSELmr(0xf3, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r4)
-#define CVTSD2SILrr(RS, RD)             _SSELrr(0xf2, X86_SSE_CVTSI, RS,_rX, 
RD,_r4)
-#define CVTSD2SILmr(MD, MB, MI, MS, RD)         _SSELmr(0xf2, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r4)
-
-#define CVTSI2SSLrr(RS, RD)             _SSELrr(0xf3, X86_SSE_CVTIS, RS,_r4, 
RD,_rX)
-#define CVTSI2SSLmr(MD, MB, MI, MS, RD)         _SSELmr(0xf3, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
-#define CVTSI2SDLrr(RS, RD)             _SSELrr(0xf2, X86_SSE_CVTIS, RS,_r4, 
RD,_rX)
-#define CVTSI2SDLmr(MD, MB, MI, MS, RD)         _SSELmr(0xf2, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
-
-#define CVTSS2SIQrr(RS, RD)             _SSEQrr(0xf3, X86_SSE_CVTSI, RS,_rX, 
RD,_r8)
-#define CVTSS2SIQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf3, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r8)
-#define CVTSD2SIQrr(RS, RD)             _SSEQrr(0xf2, X86_SSE_CVTSI, RS,_rX, 
RD,_r8)
-#define CVTSD2SIQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf2, X86_SSE_CVTSI, 
MD, MB, MI, MS, RD,_r8)
-
-#define CVTSI2SSQrr(RS, RD)             _SSEQrr(0xf3, X86_SSE_CVTIS, RS,_r8, 
RD,_rX)
-#define CVTSI2SSQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf3, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
-#define CVTSI2SDQrr(RS, RD)             _SSEQrr(0xf2, X86_SSE_CVTIS, RS,_r8, 
RD,_rX)
-#define CVTSI2SDQmr(MD, MB, MI, MS, RD)         _SSEQmr(0xf2, X86_SSE_CVTIS, 
MD, MB, MI, MS, RD,_rX)
-
-#define MOVDLXrr(RS, RD)                _SSELrr(0x66, 0x6e, RS,_r4, RD,_rX)
-#define MOVDLXmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x6e, MD, MB, MI, MS, 
RD,_rX)
-#define MOVDQXrr(RS, RD)                _SSEQrr(0x66, 0x6e, RS,_r8, RD,_rX)
-#define MOVDQXmr(MD, MB, MI, MS, RD)    _SSEQmr(0x66, 0x6e, MD, MB, MI, MS, 
RD,_rX)
-
-#define MOVDXLrr(RS, RD)                _SSELrr(0x66, 0x7e, RS,_rX, RD,_r4)
-#define MOVDXLrm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x7e, RS,_rX, MD, MB, 
MI, MS)
-#define MOVDXQrr(RS, RD)                _SSEQrr(0x66, 0x7e, RS,_rX, RD,_r8)
-#define MOVDXQrm(RS, MD, MB, MI, MS)    _SSEQrm(0x66, 0x7e, RS,_rX, MD, MB, 
MI, MS)
-
-#define MOVDLMrr(RS, RD)               __SSELrr(      0x6e, RS,_r4, RD,_rM)
-#define MOVDLMmr(MD, MB, MI, MS, RD)   __SSELmr(      0x6e, MD, MB, MI, MS, 
RD,_rM)
-#define MOVDQMrr(RS, RD)               __SSEQrr(      0x6e, RS,_r8, RD,_rM)
-#define MOVDQMmr(MD, MB, MI, MS, RD)   __SSEQmr(      0x6e, MD, MB, MI, MS, 
RD,_rM)
-
-#define MOVDMLrr(RS, RD)               __SSELrr(      0x7e, RS,_rM, RD,_r4)
-#define MOVDMLrm(RS, MD, MB, MI, MS)   __SSELrm(      0x7e, RS,_rM, MD, MB, 
MI, MS)
-#define MOVDMQrr(RS, RD)               __SSEQrr(      0x7e, RS,_rM, RD,_r8)
-#define MOVDMQrm(RS, MD, MB, MI, MS)   __SSEQrm(      0x7e, RS,_rM, MD, MB, 
MI, MS)
-
-#define MOVDQ2Qrr(RS, RD)               _SSELrr(0xf2, 0xd6, RS,_rX, RD,_rM)
-#define MOVHLPSrr(RS, RD)              __SSELrr(      0x12, RS,_rX, RD,_rX)
-#define MOVLHPSrr(RS, RD)              __SSELrr(      0x16, RS,_rX, RD,_rX)
-
-#define MOVDQArr(RS, RD)                _SSELrr(0x66, 0x6f, RS,_rX, RD,_rX)
-#define MOVDQAmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x6f, MD, MB, MI, MS, 
RD,_rX)
-#define MOVDQArm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x7f, RS,_rX, MD, MB, 
MI, MS)
-
-#define MOVDQUrr(RS, RD)                _SSELrr(0xf3, 0x6f, RS,_rX, RD,_rX)
-#define MOVDQUmr(MD, MB, MI, MS, RD)    _SSELmr(0xf3, 0x6f, MD, MB, MI, MS, 
RD,_rX)
-#define MOVDQUrm(RS, MD, MB, MI, MS)    _SSELrm(0xf3, 0x7f, RS,_rX, MD, MB, 
MI, MS)
-
-#define MOVHPDmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x16, MD, MB, MI, MS, 
RD,_rX)
-#define MOVHPDrm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x17, RS,_rX, MD, MB, 
MI, MS)
-#define MOVHPSmr(MD, MB, MI, MS, RD)   __SSELmr(      0x16, MD, MB, MI, MS, 
RD,_rX)
-#define MOVHPSrm(RS, MD, MB, MI, MS)   __SSELrm(      0x17, RS,_rX, MD, MB, 
MI, MS)
-
-#define MOVLPDmr(MD, MB, MI, MS, RD)    _SSELmr(0x66, 0x12, MD, MB, MI, MS, 
RD,_rX)
-#define MOVLPDrm(RS, MD, MB, MI, MS)    _SSELrm(0x66, 0x13, RS,_rX, MD, MB, 
MI, MS)
-#define MOVLPSmr(MD, MB, MI, MS, RD)   __SSELmr(      0x12, MD, MB, MI, MS, 
RD,_rX)
-#define MOVLPSrm(RS, MD, MB, MI, MS)   __SSELrm(      0x13, RS,_rX, MD, MB, 
MI, MS)
 
 /*** References:                                                               
                */
 /*                                                                             
                */
diff --git a/lightning/i386/core-32.h b/lightning/i386/core-32.h
index bc52231..25594d5 100644
--- a/lightning/i386/core-32.h
+++ b/lightning/i386/core-32.h
@@ -46,8 +46,8 @@ struct jit_local_state {
   int  alloca_slack;
 };
 
-#define jit_base_prolog() (PUSHLr(_EBP), MOVLrr(_ESP, _EBP), PUSHLr(_EBX), 
PUSHLr(_ESI), PUSHLr(_EDI))
-#define jit_prolog(n) (_jitl.framesize = 8, _jitl.alloca_offset = -12, 
jit_base_prolog())
+#define jit_base_prolog() (PUSHLr(_EBX), PUSHLr(_ESI), PUSHLr(_EDI), 
PUSHLr(_EBP), MOVLrr(_ESP, _EBP))
+#define jit_prolog(n) (_jitl.framesize = 20, _jitl.alloca_offset = 0, 
jit_base_prolog())
 
 /* Used internally.  SLACK is used by the Darwin ABI which keeps the stack
    aligned to 16-bytes.  */
@@ -63,10 +63,8 @@ struct jit_local_state {
    _jitl.alloca_offset -= (amount))
    
 /* Stack */
-#ifdef JIT_NEED_PUSH_POP
 #define jit_pushr_i(rs)                PUSHLr(rs)
 #define jit_popr_i(rs)         POPLr(rs)
-#endif
 
 /* The += in argssize allows for stack pollution */
 
@@ -103,7 +101,33 @@ struct jit_local_state {
 
 #define jit_patch_long_at(jump_pc,v)  (*_PSL((jump_pc) - sizeof(long)) = 
_jit_SL((jit_insn *)(v) - (jump_pc)))
 #define jit_patch_at(jump_pc,v)  jit_patch_long_at(jump_pc, v)
-#define jit_ret()              (POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), 
(_jitl.alloca_offset < -12 ? LEAVE_() : POPLr(_EBP)), RET_())
+#define jit_ret()              ((_jitl.alloca_offset < 0 ? LEAVE_() : 
POPLr(_EBP)), POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), RET_())
+
+/* Memory */
+
+#define jit_ldi_c(d, is)                MOVSBLmr((is), 0,    0,    0, (d))
+#define jit_ldxi_c(d, rs, is)           MOVSBLmr((is), (rs), 0,    0, (d))
+
+#define jit_ldi_uc(d, is)               MOVZBLmr((is), 0,    0,    0, (d))
+#define jit_ldxi_uc(d, rs, is)          MOVZBLmr((is), (rs), 0,    0, (d))
+
+#define jit_sti_c(id, rs)               jit_movbrm((rs), (id), 0,    0,    0)
+#define jit_stxi_c(id, rd, rs)          jit_movbrm((rs), (id), (rd), 0,    0)
+
+#define jit_ldi_s(d, is)                MOVSWLmr((is), 0,    0,    0, (d))
+#define jit_ldxi_s(d, rs, is)           MOVSWLmr((is), (rs), 0,    0, (d))
+
+#define jit_ldi_us(d, is)               MOVZWLmr((is), 0,    0,    0,  (d))
+#define jit_ldxi_us(d, rs, is)          MOVZWLmr((is), (rs), 0,    0,  (d))
+
+#define jit_sti_s(id, rs)               MOVWrm(jit_reg16(rs), (id), 0,    0,   
 0)
+#define jit_stxi_s(id, rd, rs)          MOVWrm(jit_reg16(rs), (id), (rd), 0,   
 0)
+
+#define jit_ldi_i(d, is)                MOVLmr((is), 0,    0,    0,  (d))
+#define jit_ldxi_i(d, rs, is)           MOVLmr((is), (rs), 0,    0,  (d))
+
+#define jit_sti_i(id, rs)               MOVLrm((rs), (id), 0,    0,    0)
+#define jit_stxi_i(id, rd, rs)          MOVLrm((rs), (id), (rd), 0,    0)
 
 #endif /* __lightning_core_h */
 
diff --git a/lightning/i386/core-64.h b/lightning/i386/core-64.h
index dfe845c..fefa421 100644
--- a/lightning/i386/core-64.h
+++ b/lightning/i386/core-64.h
@@ -36,6 +36,8 @@
 
 /* Used to implement ldc, stc, ... */
 #define JIT_CAN_16 0
+#define JIT_CALLTMPSTART 0x48
+#define JIT_REXTMP       0x4B
 
 #include "core-i386.h"
 
@@ -106,13 +108,11 @@ struct jit_local_state {
 #define jit_rshr_ul(d, r1, r2) jit_replace((r1), (r2), _ECX,                   
        jit_qop_ ((d), (r1), SHRQrr(_CL,  (d)) ))
 
 /* Stack */
-#ifdef JIT_NEED_PUSH_POP
 #define jit_pushr_i(rs)                PUSHQr(rs)
 #define jit_popr_i(rs)         POPQr(rs)
-#endif
 
-#define jit_base_prolog() (PUSHQr(_EBP), MOVQrr(_ESP, _EBP), PUSHQr(_EBX), 
PUSHQr(_R12), PUSHQr(_R13))
-#define jit_prolog(n) (_jitl.nextarg_geti = 0, _jitl.alloca_offset = -24, 
jit_base_prolog())
+#define jit_base_prolog() (PUSHQr(_EBX), PUSHQr(_R12), PUSHQr(_R13), 
PUSHQr(_EBP), MOVQrr(_ESP, _EBP))
+#define jit_prolog(n) (_jitl.nextarg_geti = 0, _jitl.alloca_offset = 0, 
jit_base_prolog())
 
 /* Stack isn't used for arguments: */
 #define jit_prepare_i(ni)      (_jitl.argssize = 0)
@@ -122,7 +122,7 @@ struct jit_local_state {
 #define jit_reg_is_arg(reg) ((reg == _EDI) || (reg ==_ESI) || (reg == _EDX))
 #define jit_finishr(reg)       ((jit_reg_is_arg((reg)) ? MOVQrr(reg, 
JIT_REXTMP) : (void)0), \
                                  jit_shift_args(), \
-                                 jit_reg_is_arg((reg)) ? CALQsr((JIT_REXTMP)) 
: jit_callr((reg)), \
+                                 jit_reg_is_arg((reg)) ? CALLsr((JIT_REXTMP)) 
: jit_callr((reg)), \
                                  jit_restore_locals())
 
 /* R12 and R13 are callee-save, instead of EDI and ESI.  Can be improved. */
@@ -162,22 +162,22 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX 
};
                                     : MOVQir((is), (d))) \
                                  : XORLrr ((d), (d)) )
 
-#define jit_bmsr_l(label, s1, s2)      (TESTQrr((s1), (s2)), 
JNZm(label,0,0,0), _jit.x.pc)
-#define jit_bmcr_l(label, s1, s2)      (TESTQrr((s1), (s2)), JZm(label,0,0,0), 
 _jit.x.pc)
-#define jit_boaddr_l(label, s1, s2)    (ADDQrr((s2), (s1)), JOm(label,0,0,0), 
_jit.x.pc)
-#define jit_bosubr_l(label, s1, s2)    (SUBQrr((s2), (s1)), JOm(label,0,0,0), 
_jit.x.pc)
-#define jit_boaddr_ul(label, s1, s2)   (ADDQrr((s2), (s1)), JCm(label,0,0,0), 
_jit.x.pc)
-#define jit_bosubr_ul(label, s1, s2)   (SUBQrr((s2), (s1)), JCm(label,0,0,0), 
_jit.x.pc)
+#define jit_bmsr_l(label, s1, s2)      (TESTQrr((s1), (s2)), JNZm(label), 
_jit.x.pc)
+#define jit_bmcr_l(label, s1, s2)      (TESTQrr((s1), (s2)), JZm(label),  
_jit.x.pc)
+#define jit_boaddr_l(label, s1, s2)    (ADDQrr((s2), (s1)), JOm(label), 
_jit.x.pc)
+#define jit_bosubr_l(label, s1, s2)    (SUBQrr((s2), (s1)), JOm(label), 
_jit.x.pc)
+#define jit_boaddr_ul(label, s1, s2)   (ADDQrr((s2), (s1)), JCm(label), 
_jit.x.pc)
+#define jit_bosubr_ul(label, s1, s2)   (SUBQrr((s2), (s1)), JCm(label), 
_jit.x.pc)
 
-#define jit_boaddi_l(label, rs, is)    (ADDQir((is), (rs)), JOm(label,0,0,0), 
_jit.x.pc)
-#define jit_bosubi_l(label, rs, is)    (SUBQir((is), (rs)), JOm(label,0,0,0), 
_jit.x.pc)
-#define jit_boaddi_ul(label, rs, is)   (ADDQir((is), (rs)), JCm(label,0,0,0), 
_jit.x.pc)
-#define jit_bosubi_ul(label, rs, is)   (SUBQir((is), (rs)), JCm(label,0,0,0), 
_jit.x.pc)
+#define jit_boaddi_l(label, rs, is)    (ADDQir((is), (rs)), JOm(label), 
_jit.x.pc)
+#define jit_bosubi_l(label, rs, is)    (SUBQir((is), (rs)), JOm(label), 
_jit.x.pc)
+#define jit_boaddi_ul(label, rs, is)   (ADDQir((is), (rs)), JCm(label), 
_jit.x.pc)
+#define jit_bosubi_ul(label, rs, is)   (SUBQir((is), (rs)), JCm(label), 
_jit.x.pc)
 
 #define jit_patch_long_at(jump_pc,v)  (*_PSL((jump_pc) - sizeof(long)) = 
_jit_SL((jit_insn *)(v)))
 #define jit_patch_short_at(jump_pc,v)  (*_PSI((jump_pc) - sizeof(int)) = 
_jit_SI((jit_insn *)(v) - (jump_pc)))
 #define jit_patch_at(jump_pc,v) (_jitl.long_jumps ? 
jit_patch_long_at((jump_pc)-3, v) : jit_patch_short_at(jump_pc, v))
-#define jit_ret() (POPQr(_R13), POPQr(_R12), POPQr(_EBX), (_jitl.alloca_offset 
< -24 ? LEAVE_() : POPQr(_EBP)), RET_())
+#define jit_ret() ((_jitl.alloca_offset < 0 ? LEAVE_() : POPQr(_EBP)), 
POPQr(_R13), POPQr(_R12), POPQr(_EBX), RET_()) /* jit_prolog sets alloca_offset to 0; use LEAVE once it has gone negative */
 
 #define _jit_ldi_l(d, is)              MOVQmr((is), 0,    0,    0,  (d))
 #define jit_ldr_l(d, rs)               MOVQmr(0,    (rs), 0,    0,  (d))
@@ -189,19 +189,44 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX 
};
 #define jit_stxr_l(d1, d2, rs)         MOVQrm((rs), 0,    (d1), (d2), 1)
 #define jit_stxi_l(id, rd, rs)         MOVQrm((rs), (id), (rd), 0,    0)
 
-#define jit_ldi_l(d, is) (_u32P((long)(is)) ? _jit_ldi_l(d, is) : 
(jit_movi_l(d, is), jit_ldr_l(d, d)))
-#define jit_sti_l(id, rs) (_u32P((long)(id)) ? _jit_sti_l(id, rs) : 
(jit_movi_l(JIT_REXTMP, id), MOVQrQm(rs, 0, JIT_REXTMP, 0, 0)))
-
-#define jit_blti_l(label, rs, is)      jit_bra_l0((rs), (is), JLm(label, 
0,0,0), JSm(label, 0,0,0) )
-#define jit_blei_l(label, rs, is)      jit_bra_l ((rs), (is), 
JLEm(label,0,0,0)                    )
-#define jit_bgti_l(label, rs, is)      jit_bra_l ((rs), (is), JGm(label, 
0,0,0)                    )
-#define jit_bgei_l(label, rs, is)      jit_bra_l0((rs), (is), 
JGEm(label,0,0,0), JNSm(label,0,0,0) )
-#define jit_beqi_l(label, rs, is)      jit_bra_l0((rs), (is), JEm(label, 
0,0,0), JEm(label, 0,0,0) )
-#define jit_bnei_l(label, rs, is)      jit_bra_l0((rs), (is), 
JNEm(label,0,0,0), JNEm(label,0,0,0) )
-#define jit_blti_ul(label, rs, is)     jit_bra_l ((rs), (is), JBm(label, 
0,0,0)                    )
-#define jit_blei_ul(label, rs, is)     jit_bra_l0((rs), (is), 
JBEm(label,0,0,0), JEm(label, 0,0,0) )
-#define jit_bgti_ul(label, rs, is)     jit_bra_l0((rs), (is), JAm(label, 
0,0,0), JNEm(label,0,0,0) )
-#define jit_bgei_ul(label, rs, is)     jit_bra_l ((rs), (is), 
JAEm(label,0,0,0)                    )
+#define jit_ldi_l(d, is)               (_u32P((long)(is)) ? _jit_ldi_l((d), 
(is)) : (jit_movi_l(JIT_REXTMP, (is)), jit_ldr_l((d), JIT_REXTMP))) /* address fails _u32P: move it into JIT_REXTMP, then load d through that register */
+#define jit_sti_l(id, rs)              (_u32P((long)(id)) ? _jit_sti_l(id, rs) 
: (jit_movi_l(JIT_REXTMP, id), jit_str_l (JIT_REXTMP, (rs))))
+
+/* Memory */
+#define jit_ldi_c(d, is)                (_u32P((long)(is)) ? MOVSBLmr((is), 0, 
   0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldr_c(d, JIT_REXTMP)))
+#define jit_ldxi_c(d, rs, is)           (_u32P((long)(is)) ? MOVSBLmr((is), 
(rs), 0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldxr_c(d, rs, 
JIT_REXTMP)))
+
+#define jit_ldi_uc(d, is)               (_u32P((long)(is)) ? MOVZBLmr((is), 0, 
   0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldr_uc(d, JIT_REXTMP)))
+#define jit_ldxi_uc(d, rs, is)          (_u32P((long)(is)) ? MOVZBLmr((is), 
(rs), 0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldxr_uc(d, rs, 
JIT_REXTMP)))
+
+#define jit_sti_c(id, rs)               (_u32P((long)(id)) ? jit_movbrm((rs), 
(id), 0,    0,    0) : (jit_movi_l(JIT_REXTMP, id), jit_str_c(JIT_REXTMP, rs)))
+#define jit_stxi_c(id, rd, rs)          (_u32P((long)(id)) ? jit_movbrm((rs), 
(id), (rd), 0,    0) : (jit_movi_l(JIT_REXTMP, id), jit_stxr_c(JIT_REXTMP, rd, 
rs)))
+
+#define jit_ldi_s(d, is)                (_u32P((long)(is)) ? MOVSWLmr((is), 0, 
   0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldr_s(d, JIT_REXTMP)))
+#define jit_ldxi_s(d, rs, is)           (_u32P((long)(is)) ? MOVSWLmr((is), 
(rs), 0,    0, (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldxr_s(d, rs, 
JIT_REXTMP)))
+
+#define jit_ldi_us(d, is)               (_u32P((long)(is)) ? MOVZWLmr((is), 0, 
   0,    0,  (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldr_us(d, JIT_REXTMP)))
+#define jit_ldxi_us(d, rs, is)          (_u32P((long)(is)) ? MOVZWLmr((is), 
(rs), 0,    0,  (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldxr_us(d, rs, 
JIT_REXTMP)))
+
+#define jit_sti_s(id, rs)               (_u32P((long)(id)) ? 
MOVWrm(jit_reg16(rs), (id), 0,    0,    0) : (jit_movi_l(JIT_REXTMP, id), 
jit_str_s(JIT_REXTMP, rs)))
+#define jit_stxi_s(id, rd, rs)          (_u32P((long)(id)) ? 
MOVWrm(jit_reg16(rs), (id), (rd), 0,    0) : (jit_movi_l(JIT_REXTMP, id), 
jit_stxr_s(JIT_REXTMP, rd, rs)))
+
+#define jit_ldi_i(d, is)                (_u32P((long)(is)) ? MOVLmr((is), 0,   
 0,    0,  (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldr_i(d, JIT_REXTMP)))
+#define jit_ldxi_i(d, rs, is)           (_u32P((long)(is)) ? MOVLmr((is), 
(rs), 0,    0,  (d)) :  (jit_movi_l(JIT_REXTMP, is), jit_ldxr_i(d, rs, 
JIT_REXTMP)))
+
+#define jit_sti_i(id, rs)               (_u32P((long)(id)) ? MOVLrm((rs), 
(id), 0,    0,    0) : (jit_movi_l(JIT_REXTMP, id), jit_str_i(JIT_REXTMP, rs)))
+#define jit_stxi_i(id, rd, rs)          (_u32P((long)(id)) ? MOVLrm((rs), 
(id), (rd), 0,    0) : (jit_movi_l(JIT_REXTMP, id), jit_stxr_i(JIT_REXTMP, rd, 
rs)))
+
+#define jit_blti_l(label, rs, is)      jit_bra_l0((rs), (is), JLm(label), 
JSm(label) )
+#define jit_blei_l(label, rs, is)      jit_bra_l ((rs), (is), JLEm(label)      
            )
+#define jit_bgti_l(label, rs, is)      jit_bra_l ((rs), (is), JGm(label)       
            )
+#define jit_bgei_l(label, rs, is)      jit_bra_l0((rs), (is), JGEm(label), 
JNSm(label) )
+#define jit_beqi_l(label, rs, is)      jit_bra_l0((rs), (is), JEm(label), 
JEm(label) )
+#define jit_bnei_l(label, rs, is)      jit_bra_l0((rs), (is), JNEm(label), 
JNEm(label) )
+#define jit_blti_ul(label, rs, is)     jit_bra_l ((rs), (is), JBm(label)       
            )
+#define jit_blei_ul(label, rs, is)     jit_bra_l0((rs), (is), JBEm(label), 
JEm(label) )
+#define jit_bgti_ul(label, rs, is)     jit_bra_l0((rs), (is), JAm(label), 
JNEm(label) )
+#define jit_bgei_ul(label, rs, is)     jit_bra_l ((rs), (is), JAEm(label)      
            )
 #define jit_bmsi_l(label, rs, is) jit_bmsi_i(label, rs, is)
 #define jit_bmci_l(label, rs, is) jit_bmci_i(label, rs, is)
 
@@ -210,16 +235,16 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX 
};
 
 #define jit_pusharg_l(rs) jit_pusharg_i(rs)
 #define jit_retval_l(rd)       ((void)jit_movr_l ((rd), _EAX))
-#define jit_bltr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JLm(label, 
0,0,0) )
-#define jit_bler_l(label, s1, s2)      jit_bra_qr((s1), (s2), 
JLEm(label,0,0,0) )
-#define jit_bgtr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JGm(label, 
0,0,0) )
-#define jit_bger_l(label, s1, s2)      jit_bra_qr((s1), (s2), 
JGEm(label,0,0,0) )
-#define jit_beqr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JEm(label, 
0,0,0) )
-#define jit_bner_l(label, s1, s2)      jit_bra_qr((s1), (s2), 
JNEm(label,0,0,0) )
-#define jit_bltr_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JBm(label, 
0,0,0) )
-#define jit_bler_ul(label, s1, s2)     jit_bra_qr((s1), (s2), 
JBEm(label,0,0,0) )
-#define jit_bgtr_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JAm(label, 
0,0,0) )
-#define jit_bger_ul(label, s1, s2)     jit_bra_qr((s1), (s2), 
JAEm(label,0,0,0) )
+#define jit_bltr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JLm(label) )
+#define jit_bler_l(label, s1, s2)      jit_bra_qr((s1), (s2), JLEm(label) )
+#define jit_bgtr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JGm(label) )
+#define jit_bger_l(label, s1, s2)      jit_bra_qr((s1), (s2), JGEm(label) )
+#define jit_beqr_l(label, s1, s2)      jit_bra_qr((s1), (s2), JEm(label) )
+#define jit_bner_l(label, s1, s2)      jit_bra_qr((s1), (s2), JNEm(label) )
+#define jit_bltr_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JBm(label) )
+#define jit_bler_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JBEm(label) )
+#define jit_bgtr_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JAm(label) )
+#define jit_bger_ul(label, s1, s2)     jit_bra_qr((s1), (s2), JAEm(label) )
 
 #endif /* __lightning_core_h */
 
diff --git a/lightning/i386/core-i386.h b/lightning/i386/core-i386.h
index 3f06c0e..7820972 100644
--- a/lightning/i386/core-i386.h
+++ b/lightning/i386/core-i386.h
@@ -66,13 +66,13 @@
 /* An operand is forced into a register */
 #define jit_replace(rd, rs, forced, op)                                        
\
        ((rd == forced) ? JITSORRY("Register conflict for " # op) :     \
-        (rs == forced) ? op : (PUSHLr(forced), MOVLrr(rs, forced), op, 
POPLr(forced)))
+        (rs == forced) ? op : (jit_pushr_i(forced), MOVLrr(rs, forced), op, 
jit_popr_i(forced)))
 
 /* For LT, LE, ... */
 #define jit_replace8(d, op)                                            \
        (jit_check8(d)                                                  \
          ? (MOVLir(0, d), op(d))                                       \
-         : (PUSHLr(_EAX), MOVLir(0, _EAX), op(_EAX), MOVLrr(_EAX, (d)), 
POPLr(_EAX)))
+         : (jit_pushr_i(_EAX), MOVLir(0, _EAX), op(_EAX), MOVLrr(_EAX, (d)), 
jit_popr_i(_EAX)))
 
 #define jit_bool_r(d, s1, s2, op)                                      \
        (CMPLrr(s2, s1), jit_replace8(d, op))
@@ -96,8 +96,8 @@
 
 /* Used to implement ldc, stc, ... */
 #define jit_check8(rs)         ( (rs) <= _EBX )
-#define jit_reg8(rs)           ( ((rs) == _SI || (rs) == _DI) ? _AL : ((rs) & 
_BH) | _AL )
-#define jit_reg16(rs)          ( ((rs) & _BH) | _AX )
+#define jit_reg8(rs)           ( ((rs) == _SI || (rs) == _DI) ? _AL : (_rN(rs) 
| _AL ))
+#define jit_reg16(rs)          ( _rN(rs) | _AX )
 
 /* In jit_replace below, _EBX is dummy */
 #define jit_movbrm(rs, dd, db, di, ds)                                         
       \
@@ -129,62 +129,62 @@
         IMULLr(rs == _EAX ? _EDX : rs))
 
 #define jit_divi_i_(result, d, rs, is)                 \
-       (jit_might (d,    _EAX, PUSHLr(_EAX)),          \
-       jit_might (d,    _ECX, PUSHLr(_ECX)),           \
-       jit_might (d,    _EDX, PUSHLr(_EDX)),           \
+       (jit_might (d,    _EAX, jit_pushr_i(_EAX)),             \
+       jit_might (d,    _ECX, jit_pushr_i(_ECX)),              \
+       jit_might (d,    _EDX, jit_pushr_i(_EDX)),              \
        jit_might (rs,   _EAX, MOVLrr(rs, _EAX)),       \
        jit_might (rs,   _EDX, MOVLrr(rs, _EDX)),       \
        MOVLir(is, _ECX),                               \
        SARLir(31, _EDX),                               \
        IDIVLr(_ECX),                                   \
        jit_might(d,    result, MOVLrr(result, d)),     \
-       jit_might(d,     _EDX,  POPLr(_EDX)),           \
-       jit_might(d,     _ECX,  POPLr(_ECX)),           \
-       jit_might(d,     _EAX,  POPLr(_EAX)))
+       jit_might(d,     _EDX,  jit_popr_i(_EDX)),              \
+       jit_might(d,     _ECX,  jit_popr_i(_ECX)),              \
+       jit_might(d,     _EAX,  jit_popr_i(_EAX)))
 
 #define jit_divr_i_(result, d, s1, s2)                 \
-       (jit_might (d,    _EAX, PUSHLr(_EAX)),          \
-       jit_might (d,    _ECX, PUSHLr(_ECX)),           \
-       jit_might (d,    _EDX, PUSHLr(_EDX)),           \
-       ((s1 == _ECX) ? PUSHLr(_ECX) : 0),              \
+       (jit_might (d,    _EAX, jit_pushr_i(_EAX)),             \
+       jit_might (d,    _ECX, jit_pushr_i(_ECX)),              \
+       jit_might (d,    _EDX, jit_pushr_i(_EDX)),              \
+       ((s1 == _ECX) ? jit_pushr_i(_ECX) : 0),         \
        jit_might (s2,   _ECX, MOVLrr(s2, _ECX)),       \
-       ((s1 == _ECX) ? POPLr(_EDX) :                   \
+       ((s1 == _ECX) ? jit_popr_i(_EDX) :                      \
        jit_might (s1,   _EDX, MOVLrr(s1, _EDX))),      \
        MOVLrr(_EDX, _EAX),                             \
        SARLir(31, _EDX),                               \
        IDIVLr(_ECX),                                   \
        jit_might(d,    result, MOVLrr(result, d)),     \
-       jit_might(d,     _EDX,  POPLr(_EDX)),           \
-       jit_might(d,     _ECX,  POPLr(_ECX)),           \
-       jit_might(d,     _EAX,  POPLr(_EAX)))
+       jit_might(d,     _EDX,  jit_popr_i(_EDX)),              \
+       jit_might(d,     _ECX,  jit_popr_i(_ECX)),              \
+       jit_might(d,     _EAX,  jit_popr_i(_EAX)))
 
 #define jit_divi_ui_(result, d, rs, is)                        \
-       (jit_might (d,    _EAX, PUSHLr(_EAX)),          \
-       jit_might (d,    _ECX, PUSHLr(_ECX)),           \
-       jit_might (d,    _EDX, PUSHLr(_EDX)),           \
+       (jit_might (d,    _EAX, jit_pushr_i(_EAX)),             \
+       jit_might (d,    _ECX, jit_pushr_i(_ECX)),              \
+       jit_might (d,    _EDX, jit_pushr_i(_EDX)),              \
        jit_might (rs,   _EAX, MOVLrr(rs, _EAX)),       \
        MOVLir(is, _ECX),                               \
        XORLrr(_EDX, _EDX),                             \
        DIVLr(_ECX),                                    \
        jit_might(d,    result, MOVLrr(result, d)),     \
-       jit_might(d,     _EDX,  POPLr(_EDX)),           \
-       jit_might(d,     _ECX,  POPLr(_ECX)),           \
-       jit_might(d,     _EAX,  POPLr(_EAX)))
+       jit_might(d,     _EDX,  jit_popr_i(_EDX)),              \
+       jit_might(d,     _ECX,  jit_popr_i(_ECX)),              \
+       jit_might(d,     _EAX,  jit_popr_i(_EAX)))
 
 #define jit_divr_ui_(result, d, s1, s2)                        \
-       (jit_might (d,    _EAX, PUSHLr(_EAX)),          \
-       jit_might (d,    _ECX, PUSHLr(_ECX)),           \
-       jit_might (d,    _EDX, PUSHLr(_EDX)),           \
-       ((s1 == _ECX) ? PUSHLr(_ECX) : 0),              \
+       (jit_might (d,    _EAX, jit_pushr_i(_EAX)),             \
+       jit_might (d,    _ECX, jit_pushr_i(_ECX)),              \
+       jit_might (d,    _EDX, jit_pushr_i(_EDX)),              \
+       ((s1 == _ECX) ? jit_pushr_i(_ECX) : 0),         \
        jit_might (s2,   _ECX, MOVLrr(s2, _ECX)),       \
-       ((s1 == _ECX) ? POPLr(_EAX) :                   \
+       ((s1 == _ECX) ? jit_popr_i(_EAX) :                      \
        jit_might (s1,   _EAX, MOVLrr(s1, _EAX))),      \
        XORLrr(_EDX, _EDX),                             \
        DIVLr(_ECX),                                    \
        jit_might(d,    result, MOVLrr(result, d)),     \
-       jit_might(d,     _EDX,  POPLr(_EDX)),           \
-       jit_might(d,     _ECX,  POPLr(_ECX)),           \
-       jit_might(d,     _EAX,  POPLr(_EAX)))
+       jit_might(d,     _EDX,  jit_popr_i(_EDX)),              \
+       jit_might(d,     _ECX,  jit_popr_i(_ECX)),              \
+       jit_might(d,     _EAX,  jit_popr_i(_EAX)))
 
 
 /* ALU */
@@ -217,24 +217,24 @@
 #define jit_mulr_ui(d, s1, s2) jit_opr_((d), (s1), (s2), IMULLrr((s1), (d)), IMULLrr((s2), (d)) )
 
 #define jit_hmuli_i(d, rs, is)                                                                                                         \
-       ((d) == _EDX ? (              PUSHLr(_EAX), jit_muli_i_((is), (rs)),                                 POPLr(_EAX)                ) :     \
-       ((d) == _EAX ? (PUSHLr(_EDX),               jit_muli_i_((is), (rs)), MOVLrr(_EDX, _EAX),             POPLr(_EDX) ) :    \
-                      (PUSHLr(_EDX), PUSHLr(_EAX), jit_muli_i_((is), (rs)), MOVLrr(_EDX, (d)), POPLr(_EAX), POPLr(_EDX) )))
+       ((d) == _EDX ? (              jit_pushr_i(_EAX), jit_muli_i_((is), (rs)),                                    jit_popr_i(_EAX)           ) :     \
+       ((d) == _EAX ? (jit_pushr_i(_EDX),                  jit_muli_i_((is), (rs)), MOVLrr(_EDX, _EAX),             jit_popr_i(_EDX) ) :       \
+                      (jit_pushr_i(_EDX), jit_pushr_i(_EAX), jit_muli_i_((is), (rs)), MOVLrr(_EDX, (d)), jit_popr_i(_EAX), jit_popr_i(_EDX) )))
 
 #define jit_hmulr_i(d, s1, s2)                                                                                                 \
-       ((d) == _EDX ? (              PUSHLr(_EAX), jit_mulr_i_((s1), (s2)),                      POPLr(_EAX)               ) : \
-       ((d) == _EAX ? (PUSHLr(_EDX),               jit_mulr_i_((s1), (s2)), MOVLrr(_EDX, _EAX),               POPLr(_EDX)  ) : \
-                      (PUSHLr(_EDX), PUSHLr(_EAX), jit_mulr_i_((s1), (s2)), MOVLrr(_EDX, (d)),   POPLr(_EAX), POPLr(_EDX)  )))
+       ((d) == _EDX ? (              jit_pushr_i(_EAX), jit_mulr_i_((s1), (s2)),                         jit_popr_i(_EAX)                  ) : \
+       ((d) == _EAX ? (jit_pushr_i(_EDX),                  jit_mulr_i_((s1), (s2)), MOVLrr(_EDX, _EAX),               jit_popr_i(_EDX)  ) :    \
+                      (jit_pushr_i(_EDX), jit_pushr_i(_EAX), jit_mulr_i_((s1), (s2)), MOVLrr(_EDX, (d)),   jit_popr_i(_EAX), jit_popr_i(_EDX)  )))
 
 #define jit_hmuli_ui(d, rs, is)                                                                                                                \
-       ((d) == _EDX ? (              PUSHLr(_EAX), jit_muli_ui_((is), (rs)),                                 POPLr(_EAX)               ) :     \
-       ((d) == _EAX ? (PUSHLr(_EDX),               jit_muli_ui_((is), (rs)), MOVLrr(_EDX, _EAX),             POPLr(_EDX) ) :   \
-                      (PUSHLr(_EDX), PUSHLr(_EAX), jit_muli_ui_((is), (rs)), MOVLrr(_EDX, (d)), POPLr(_EAX), POPLr(_EDX) )))
+       ((d) == _EDX ? (              jit_pushr_i(_EAX), jit_muli_ui_((is), (rs)),                                    jit_popr_i(_EAX)          ) :     \
+       ((d) == _EAX ? (jit_pushr_i(_EDX),                  jit_muli_ui_((is), (rs)), MOVLrr(_EDX, _EAX),             jit_popr_i(_EDX) ) :      \
+                      (jit_pushr_i(_EDX), jit_pushr_i(_EAX), jit_muli_ui_((is), (rs)), MOVLrr(_EDX, (d)), jit_popr_i(_EAX), jit_popr_i(_EDX) )))
 
 #define jit_hmulr_ui(d, s1, s2)                                                                                                        \
-       ((d) == _EDX ? (              PUSHLr(_EAX), jit_mulr_ui_((s1), (s2)),                     POPLr(_EAX)               ) : \
-       ((d) == _EAX ? (PUSHLr(_EDX),               jit_mulr_ui_((s1), (s2)), MOVLrr(_EDX, _EAX),              POPLr(_EDX)  ) : \
-                      (PUSHLr(_EDX), PUSHLr(_EAX), jit_mulr_ui_((s1), (s2)), MOVLrr(_EDX, (d)),  POPLr(_EAX), POPLr(_EDX)  )))
+       ((d) == _EDX ? (              jit_pushr_i(_EAX), jit_mulr_ui_((s1), (s2)),                        jit_popr_i(_EAX)                  ) : \
+       ((d) == _EAX ? (jit_pushr_i(_EDX),                  jit_mulr_ui_((s1), (s2)), MOVLrr(_EDX, _EAX),              jit_popr_i(_EDX)  ) :    \
+                      (jit_pushr_i(_EDX), jit_pushr_i(_EAX), jit_mulr_ui_((s1), (s2)), MOVLrr(_EDX, (d)),  jit_popr_i(_EAX), jit_popr_i(_EDX)  )))
 
 #define jit_divi_i(d, rs, is)  jit_divi_i_(_EAX, (d), (rs), (is))
 #define jit_divi_ui(d, rs, is) jit_divi_ui_(_EAX, (d), (rs), (is))
@@ -297,88 +297,71 @@
 #define jit_gei_ui(d, rs, is)  jit_bool_i0((d), (rs), (is), SETAEr, INCLr  )
 
 /* Jump */
-#define jit_bltr_i(label, s1, s2)      jit_bra_r((s1), (s2), JLm(label, 0,0,0) )
-#define jit_bler_i(label, s1, s2)      jit_bra_r((s1), (s2), JLEm(label,0,0,0) )
-#define jit_bgtr_i(label, s1, s2)      jit_bra_r((s1), (s2), JGm(label, 0,0,0) )
-#define jit_bger_i(label, s1, s2)      jit_bra_r((s1), (s2), JGEm(label,0,0,0) )
-#define jit_beqr_i(label, s1, s2)      jit_bra_r((s1), (s2), JEm(label, 0,0,0) )
-#define jit_bner_i(label, s1, s2)      jit_bra_r((s1), (s2), JNEm(label,0,0,0) )
-#define jit_bltr_ui(label, s1, s2)     jit_bra_r((s1), (s2), JBm(label, 0,0,0) )
-#define jit_bler_ui(label, s1, s2)     jit_bra_r((s1), (s2), JBEm(label,0,0,0) )
-#define jit_bgtr_ui(label, s1, s2)     jit_bra_r((s1), (s2), JAm(label, 0,0,0) )
-#define jit_bger_ui(label, s1, s2)     jit_bra_r((s1), (s2), JAEm(label,0,0,0) )
-#define jit_bmsr_i(label, s1, s2)      (TESTLrr((s1), (s2)), JNZm(label,0,0,0), _jit.x.pc)
-#define jit_bmcr_i(label, s1, s2)      (TESTLrr((s1), (s2)), JZm(label,0,0,0),  _jit.x.pc)
-#define jit_boaddr_i(label, s1, s2)    (ADDLrr((s2), (s1)), JOm(label,0,0,0), _jit.x.pc)
-#define jit_bosubr_i(label, s1, s2)    (SUBLrr((s2), (s1)), JOm(label,0,0,0), _jit.x.pc)
-#define jit_boaddr_ui(label, s1, s2)   (ADDLrr((s2), (s1)), JCm(label,0,0,0), _jit.x.pc)
-#define jit_bosubr_ui(label, s1, s2)   (SUBLrr((s2), (s1)), JCm(label,0,0,0), _jit.x.pc)
-
-#define jit_blti_i(label, rs, is)      jit_bra_i0((rs), (is), JLm(label, 0,0,0), JSm(label, 0,0,0) )
-#define jit_blei_i(label, rs, is)      jit_bra_i ((rs), (is), JLEm(label,0,0,0)                    )
-#define jit_bgti_i(label, rs, is)      jit_bra_i ((rs), (is), JGm(label, 0,0,0)                    )
-#define jit_bgei_i(label, rs, is)      jit_bra_i0((rs), (is), JGEm(label,0,0,0), JNSm(label,0,0,0) )
-#define jit_beqi_i(label, rs, is)      jit_bra_i0((rs), (is), JEm(label, 0,0,0), JEm(label, 0,0,0) )
-#define jit_bnei_i(label, rs, is)      jit_bra_i0((rs), (is), JNEm(label,0,0,0), JNEm(label,0,0,0) )
-#define jit_blti_ui(label, rs, is)     jit_bra_i ((rs), (is), JBm(label, 0,0,0)                    )
-#define jit_blei_ui(label, rs, is)     jit_bra_i0((rs), (is), JBEm(label,0,0,0), JEm(label, 0,0,0) )
-#define jit_bgti_ui(label, rs, is)     jit_bra_i0((rs), (is), JAm(label, 0,0,0), JNEm(label,0,0,0) )
-#define jit_bgei_ui(label, rs, is)     jit_bra_i ((rs), (is), JAEm(label,0,0,0)                    )
-#define jit_boaddi_i(label, rs, is)    (ADDLir((is), (rs)), JOm(label,0,0,0), _jit.x.pc)
-#define jit_bosubi_i(label, rs, is)    (SUBLir((is), (rs)), JOm(label,0,0,0), _jit.x.pc)
-#define jit_boaddi_ui(label, rs, is)   (ADDLir((is), (rs)), JCm(label,0,0,0), _jit.x.pc)
-#define jit_bosubi_ui(label, rs, is)   (SUBLir((is), (rs)), JCm(label,0,0,0), _jit.x.pc)
-
-#define jit_bmsi_i(label, rs, is)      (jit_reduce(TEST, (is), (rs)), JNZm(label,0,0,0), _jit.x.pc)
-#define jit_bmci_i(label, rs, is)      (jit_reduce(TEST, (is), (rs)), JZm(label,0,0,0),  _jit.x.pc)
-
-#define jit_jmpi(label)                (JMPm( ((unsigned long) (label)),       0, 0, 0), _jit.x.pc)
-#define jit_calli(label)       (CALLm( ((unsigned long) (label)),      0, 0, 0), _jit.x.pc)
-#define jit_callr(reg)         (CALLsr(reg))
-#define jit_jmpr(reg)          JMPsr(reg)
+#define jit_bltr_i(label, s1, s2)      jit_bra_r((s1), (s2), JLm(label) )
+#define jit_bler_i(label, s1, s2)      jit_bra_r((s1), (s2), JLEm(label) )
+#define jit_bgtr_i(label, s1, s2)      jit_bra_r((s1), (s2), JGm(label) )
+#define jit_bger_i(label, s1, s2)      jit_bra_r((s1), (s2), JGEm(label) )
+#define jit_beqr_i(label, s1, s2)      jit_bra_r((s1), (s2), JEm(label) )
+#define jit_bner_i(label, s1, s2)      jit_bra_r((s1), (s2), JNEm(label) )
+#define jit_bltr_ui(label, s1, s2)     jit_bra_r((s1), (s2), JBm(label) )
+#define jit_bler_ui(label, s1, s2)     jit_bra_r((s1), (s2), JBEm(label) )
+#define jit_bgtr_ui(label, s1, s2)     jit_bra_r((s1), (s2), JAm(label) )
+#define jit_bger_ui(label, s1, s2)     jit_bra_r((s1), (s2), JAEm(label) )
+#define jit_bmsr_i(label, s1, s2)      (TESTLrr((s1), (s2)), JNZm(label), _jit.x.pc)
+#define jit_bmcr_i(label, s1, s2)      (TESTLrr((s1), (s2)), JZm(label),  _jit.x.pc)
+#define jit_boaddr_i(label, s1, s2)    (ADDLrr((s2), (s1)), JOm(label), _jit.x.pc)
+#define jit_bosubr_i(label, s1, s2)    (SUBLrr((s2), (s1)), JOm(label), _jit.x.pc)
+#define jit_boaddr_ui(label, s1, s2)   (ADDLrr((s2), (s1)), JCm(label), _jit.x.pc)
+#define jit_bosubr_ui(label, s1, s2)   (SUBLrr((s2), (s1)), JCm(label), _jit.x.pc)
+
+#define jit_blti_i(label, rs, is)      jit_bra_i0((rs), (is), JLm(label), JSm(label) )
+#define jit_blei_i(label, rs, is)      jit_bra_i ((rs), (is), JLEm(label)                  )
+#define jit_bgti_i(label, rs, is)      jit_bra_i ((rs), (is), JGm(label)                   )
+#define jit_bgei_i(label, rs, is)      jit_bra_i0((rs), (is), JGEm(label), JNSm(label) )
+#define jit_beqi_i(label, rs, is)      jit_bra_i0((rs), (is), JEm(label), JEm(label) )
+#define jit_bnei_i(label, rs, is)      jit_bra_i0((rs), (is), JNEm(label), JNEm(label) )
+#define jit_blti_ui(label, rs, is)     jit_bra_i ((rs), (is), JBm(label)                   )
+#define jit_blei_ui(label, rs, is)     jit_bra_i0((rs), (is), JBEm(label), JEm(label) )
+#define jit_bgti_ui(label, rs, is)     jit_bra_i0((rs), (is), JAm(label), JNEm(label) )
+#define jit_bgei_ui(label, rs, is)     jit_bra_i ((rs), (is), JAEm(label)                  )
+#define jit_boaddi_i(label, rs, is)    (ADDLir((is), (rs)), JOm(label), _jit.x.pc)
+#define jit_bosubi_i(label, rs, is)    (SUBLir((is), (rs)), JOm(label), _jit.x.pc)
+#define jit_boaddi_ui(label, rs, is)   (ADDLir((is), (rs)), JCm(label), _jit.x.pc)
+#define jit_bosubi_ui(label, rs, is)   (SUBLir((is), (rs)), JCm(label), _jit.x.pc)
+
+#define jit_bmsi_i(label, rs, is)      (jit_reduce(TEST, (is), (rs)), JNZm(label), _jit.x.pc)
+#define jit_bmci_i(label, rs, is)      (jit_reduce(TEST, (is), (rs)), JZm(label),  _jit.x.pc)
+
+#define jit_jmpi(label)                        (JMPm( ((unsigned long) (label))), _jit.x.pc)
+#define jit_calli(label)               (CALLm( ((unsigned long) (label))), _jit.x.pc)
+#define jit_callr(reg)                 CALLsr(reg)
+#define jit_jmpr(reg)                  JMPsr(reg)
 
 /* Memory */
-#define jit_ldi_c(d, is)               MOVSBLmr((is), 0,    0,    0, (d))
-#define jit_ldr_c(d, rs)               MOVSBLmr(0,    (rs), 0,    0, (d))
-#define jit_ldxr_c(d, s1, s2)          MOVSBLmr(0,    (s1), (s2), 1, (d))
-#define jit_ldxi_c(d, rs, is)          MOVSBLmr((is), (rs), 0,    0, (d))
-
-#define jit_ldi_uc(d, is)              MOVZBLmr((is), 0,    0,    0, (d))
-#define jit_ldr_uc(d, rs)              MOVZBLmr(0,    (rs), 0,    0, (d))
-#define jit_ldxr_uc(d, s1, s2)         MOVZBLmr(0,    (s1), (s2), 1, (d))
-#define jit_ldxi_uc(d, rs, is)         MOVZBLmr((is), (rs), 0,    0, (d))
-
-#define jit_sti_c(id, rs)               jit_movbrm((rs), (id), 0,    0,    0)
+#define jit_ldr_c(d, rs)                MOVSBLmr(0,    (rs), 0,    0, (d))
+#define jit_ldxr_c(d, s1, s2)           MOVSBLmr(0,    (s1), (s2), 1, (d))
+                                                           
+#define jit_ldr_uc(d, rs)               MOVZBLmr(0,    (rs), 0,    0, (d))
+#define jit_ldxr_uc(d, s1, s2)          MOVZBLmr(0,    (s1), (s2), 1, (d))
+                                                           
 #define jit_str_c(rd, rs)               jit_movbrm((rs), 0,    (rd), 0,    0)
 #define jit_stxr_c(d1, d2, rs)          jit_movbrm((rs), 0,    (d1), (d2), 1)
-#define jit_stxi_c(id, rd, rs)          jit_movbrm((rs), (id), (rd), 0,    0)
-
-#define jit_ldi_s(d, is)               MOVSWLmr((is), 0,    0,    0, (d))
-#define jit_ldr_s(d, rs)               MOVSWLmr(0,    (rs), 0,    0, (d))
-#define jit_ldxr_s(d, s1, s2)          MOVSWLmr(0,    (s1), (s2), 1, (d))
-#define jit_ldxi_s(d, rs, is)          MOVSWLmr((is), (rs), 0,    0, (d))
-
-#define jit_ldi_us(d, is)              MOVZWLmr((is), 0,    0,    0,  (d))
-#define jit_ldr_us(d, rs)              MOVZWLmr(0,    (rs), 0,    0,  (d))
-#define jit_ldxr_us(d, s1, s2)         MOVZWLmr(0,    (s1), (s2), 1,  (d))
-#define jit_ldxi_us(d, rs, is)         MOVZWLmr((is), (rs), 0,    0,  (d))
-
-#define jit_sti_s(id, rs)              MOVWrm(jit_reg16(rs), (id), 0,    0,    0)
-#define jit_str_s(rd, rs)              MOVWrm(jit_reg16(rs), 0,    (rd), 0,    0)
-#define jit_stxr_s(d1, d2, rs)         MOVWrm(jit_reg16(rs), 0,    (d1), (d2), 1)
-#define jit_stxi_s(id, rd, rs)         MOVWrm(jit_reg16(rs), (id), (rd), 0,    0)
-
-#define jit_ldi_i(d, is)               MOVLmr((is), 0,    0,    0,  (d))
-#define jit_ldr_i(d, rs)               MOVLmr(0,    (rs), 0,    0,  (d))
-#define jit_ldxr_i(d, s1, s2)          MOVLmr(0,    (s1), (s2), 1,  (d))
-#define jit_ldxi_i(d, rs, is)          MOVLmr((is), (rs), 0,    0,  (d))
-
-#define jit_sti_i(id, rs)              MOVLrm((rs), (id), 0,    0,    0)
-#define jit_str_i(rd, rs)              MOVLrm((rs), 0,    (rd), 0,    0)
-#define jit_stxr_i(d1, d2, rs)         MOVLrm((rs), 0,    (d1), (d2), 1)
-#define jit_stxi_i(id, rd, rs)         MOVLrm((rs), (id), (rd), 0,    0)
-
-
+                                                           
+#define jit_ldr_s(d, rs)                MOVSWLmr(0,    (rs), 0,    0, (d))
+#define jit_ldxr_s(d, s1, s2)           MOVSWLmr(0,    (s1), (s2), 1, (d))
+                                                           
+#define jit_ldr_us(d, rs)               MOVZWLmr(0,    (rs), 0,    0,  (d))
+#define jit_ldxr_us(d, s1, s2)          MOVZWLmr(0,    (s1), (s2), 1,  (d))
+                                                           
+#define jit_str_s(rd, rs)               MOVWrm(jit_reg16(rs), 0,    (rd), 0,    0)
+#define jit_stxr_s(d1, d2, rs)          MOVWrm(jit_reg16(rs), 0,    (d1), (d2), 1)
+                                                           
+#define jit_ldr_i(d, rs)                MOVLmr(0,    (rs), 0,    0,  (d))
+#define jit_ldxr_i(d, s1, s2)           MOVLmr(0,    (s1), (s2), 1,  (d))
+                                                           
+#define jit_str_i(rd, rs)               MOVLrm((rs), 0,    (rd), 0,    0)
+#define jit_stxr_i(d1, d2, rs)          MOVLrm((rs), 0,    (d1), (d2), 1)
+                                                           
 /* Extra */
 #define jit_nop()                      NOP_()
 
diff --git a/lightning/i386/fp-32.h b/lightning/i386/fp-32.h
index dda7396..ab2d440 100644
--- a/lightning/i386/fp-32.h
+++ b/lightning/i386/fp-32.h
@@ -222,10 +222,10 @@ union jit_double_imm {
        POPLr(rd),                              \
        TESTLrr(aux, aux),                      \
        POPLr(aux),                             \
-       JSSm(_jit.x.pc + 11, 0, 0, 0),          \
+       JSSm(_jit.x.pc + 11),                   \
        ADDLir(0x7FFFFFFF, aux),        /* 6 */ \
        SBBLir(0, rd),                  /* 3 */ \
-       JMPSm(_jit.x.pc + 10, 0, 0, 0), /* 2 */ \
+       JMPSm(_jit.x.pc + 10),          /* 2 */ \
        TESTLrr(aux, aux),              /* 2 */ \
        SETGr(jit_reg8(aux)),           /* 3 */ \
        SHRLir(1, aux),                 /* 2 */ \
diff --git a/tests/ldxi.c b/tests/ldxi.c
index 753b1c9..bd5f1a5 100644
--- a/tests/ldxi.c
+++ b/tests/ldxi.c
@@ -31,7 +31,7 @@ generate_ldxi_big_operand (const void *operand)
   result = (loader_t)(jit_set_ip (buffer).iptr);
   jit_leaf (1);
   arg = jit_arg_i ();
-  jit_getarg_i (JIT_R1, arg);
+  jit_getarg_p (JIT_R1, arg);
 
   jit_ldxi_c (JIT_R0, JIT_R1, operand);
   jit_movr_i (JIT_RET, JIT_R0);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]