guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 182/437: Implement the qmul and qdiv instructions.


From: Andy Wingo
Subject: [Guile-commits] 182/437: Implement the qmul and qdiv instructions.
Date: Mon, 2 Jul 2018 05:14:13 -0400 (EDT)

wingo pushed a commit to branch lightning
in repository guile.

commit d91b25d1bed7fa285af4009d661a7f9537e422f3
Author: pcpa <address@hidden>
Date:   Mon Feb 4 18:54:37 2013 -0200

    Implement the qmul and qdiv instructions.
    
    2013-02-04 Paulo Andrade <address@hidden>
    
        * include/lightning.h, include/lightning/jit_private.h,
        lib/jit_arm-cpu.c, lib/jit_arm.c, lib/jit_mips-cpu.c,
        lib/jit_mips.c, lib/jit_ppc-cpu.c, lib/jit_ppc.c,
        lib/jit_x86-cpu.c, lib/jit_x86.c, lib/lightning.c:
        Implement the new qmul and qdiv instructions that return signed
        and unsigned lo/hi multiplication result and div/rem division result.
        These should be useful for jit translation of code that needs to
        know if a multiplication overflows (no branch opcode added) or if
        a division is exact (easy check if remainder is zero).
    
        * check/lightning.c, lib/jit_print.c, check/Makefile.am,
        check/all.tst: Update for the new qmul and qdiv instructions.
    
        * check/qalu.inc, check/qalu_div.ok, check/qalu_div.tst,
        check/qalu_mul.ok, check/qalu_mul.tst: New files implementing
        simple test cases for qmul and qdiv.
---
 ChangeLog                       |  19 +++
 TODO                            |  16 +++
 check/Makefile.am               |   6 +-
 check/all.tst                   |   8 ++
 check/lightning.c               |  28 ++++
 check/qalu.inc                  |  97 +++++++++++++
 check/qalu_div.ok               |   1 +
 check/qalu_div.tst              |  18 +++
 check/qalu_mul.ok               |   1 +
 check/qalu_mul.tst              |  25 ++++
 include/lightning.h             |  16 +++
 include/lightning/jit_private.h |  11 +-
 lib/jit_arm-cpu.c               | 128 +++++++++++++++++
 lib/jit_arm.c                   |  18 +++
 lib/jit_mips-cpu.c              |  66 +++++++++
 lib/jit_mips.c                  |  18 +++
 lib/jit_ppc-cpu.c               |  99 +++++++++++++
 lib/jit_ppc.c                   |  18 +++
 lib/jit_print.c                 |  22 ++-
 lib/jit_x86-cpu.c               | 310 ++++++++++++++++++++++++++++++++++------
 lib/jit_x86.c                   |  18 +++
 lib/lightning.c                 | 223 +++++++++++++++++++++++------
 22 files changed, 1072 insertions(+), 94 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index fe58a50..ee46e0b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2013-02-04 Paulo Andrade <address@hidden>
+
+       * include/lightning.h, include/lightning/jit_private.h,
+       lib/jit_arm-cpu.c, lib/jit_arm.c, lib/jit_mips-cpu.c,
+       lib/jit_mips.c, lib/jit_ppc-cpu.c, lib/jit_ppc.c,
+       lib/jit_x86-cpu.c, lib/jit_x86.c, lib/lightning.c:
+       Implement the new qmul and qdiv instructions that return signed
+       and unsigned lo/hi multiplication result and div/rem division result.
+       These should be useful for jit translation of code that needs to
+       know if a multiplication overflows (no branch opcode added) or if
+       a division is exact (easy check if remainder is zero).
+
+       * check/lightning.c, lib/jit_print.c, check/Makefile.am,
+       check/all.tst: Update for the new qmul and qdiv instructions.
+
+       * check/qalu.inc, check/qalu_div.ok, check/qalu_div.tst,
+       check/qalu_mul.ok, check/qalu_mul.tst: New files implementing
+       simple test cases for qmul and qdiv.
+
 2013-01-30 Paulo Andrade <address@hidden>
 
        * doc/body.texi: Correct "jmpi" description that incorrectly
diff --git a/TODO b/TODO
index 30c42c2..5fd2df3 100644
--- a/TODO
+++ b/TODO
@@ -12,3 +12,19 @@
        Suggested names for now are "qmul" and "qdiv", with "r"
        and "i" variants, and possibly unsigned version. Branches
        would use "bo" and "bx" prefix.
+
+       * Convert retr to an actual node, otherwise, code like:
+       movi %r0 1
+       divr %r1 %r2 %r3
+       retr %r0
+       will fail in x86 because, although "divr" reports that it clobbers
+       %rax (r0), this ends up being ignored because retr is a noop there
+       (removed "mov %rax,%rax" expansion) and the code checking for
+       live registers ends up not knowing about %rax being live after
+       the "divr". This affects only x86.
+
+       * Validate that divrem in jit_x86-cpu.c is not modifying
+       the non result arguments. This is not verified by clobber.tst,
+       as it only checks registers not involved in the operation
+       (because it does not know about values being set as input
+       for the operation).
diff --git a/check/Makefile.am b/check/Makefile.am
index 33ce213..34fbbcf 100644
--- a/check/Makefile.am
+++ b/check/Makefile.am
@@ -66,6 +66,9 @@ EXTRA_DIST =                          \
        carry.tst       carry.ok        \
        call.tst        call.ok         \
        float.tst       float.ok        \
+       qalu.inc                        \
+       qalu_mul.tst    qalu_mul.ok     \
+       qalu_div.tst    qalu_div.ok     \
        ccall.ok                        \
        check.sh                        \
        check.x87.sh                    \
@@ -88,7 +91,8 @@ base_TESTS =                          \
        fop_abs fop_sqrt                \
        varargs stack                   \
        clobber carry call              \
-       float
+       float                           \
+       qalu_mul qalu_div
 
 $(base_TESTS): check.sh
        $(LN_S) $(srcdir)/check.sh $@
diff --git a/check/all.tst b/check/all.tst
index d8d4769..9be38eb 100644
--- a/check/all.tst
+++ b/check/all.tst
@@ -34,10 +34,18 @@
        subxi %r0 %r1 2
        mulr %r0 %r1 %r2
        muli %r0 %r1 2
+       qmulr %r0 %r1 %r2 %v0
+       qmuli %r0 %r1 %r2 3
+       qmulr_u %r0 %r1 %r2 %v0
+       qmuli_u %r0 %r1 %r2 3
        divr %r0 %r1 %r2
        divi %r0 %r1 2
        divr_u %r0 %r1 %r2
        divi_u %r0 %r1 2
+       qdivr %r0 %r1 %r2 %v0
+       qdivi %r0 %r1 %r2 3
+       qdivr_u %r0 %r1 %r2 %v0
+       qdivi_u %r0 %r1 %r2 3
        remr %r0 %r1 %r2
        remi %r0 %r1 2
        remr_u %r0 %r1 %r2
diff --git a/check/lightning.c b/check/lightning.c
index 13abbfb..9722011 100644
--- a/check/lightning.c
+++ b/check/lightning.c
@@ -260,8 +260,12 @@ static void subr(void);            static void subi(void);
 static void subxr(void);       static void subxi(void);
 static void subcr(void);       static void subci(void);
 static void mulr(void);                static void muli(void);
+static void qmulr(void);       static void qmuli(void);
+static void qmulr_u(void);     static void qmuli_u(void);
 static void divr(void);                static void divi(void);
 static void divr_u(void);      static void divi_u(void);
+static void qdivr(void);       static void qdivi(void);
+static void qdivr_u(void);     static void qdivi_u(void);
 static void remr(void);                static void remi(void);
 static void remr_u(void);      static void remi_u(void);
 static void andr(void);                static void andi(void);
@@ -552,8 +556,12 @@ static instr_t               instr_vector[] = {
     entry(subxr),      entry(subxi),
     entry(subcr),      entry(subci),
     entry(mulr),       entry(muli),
+    entry(qmulr),      entry(qmuli),
+    entry(qmulr_u),    entry(qmuli_u),
     entry(divr),       entry(divi),
     entry(divr_u),     entry(divi_u),
+    entry(qdivr),      entry(qdivi),
+    entry(qdivr_u),    entry(qdivi_u),
     entry(remr),       entry(remi),
     entry(remr_u),     entry(remi_u),
     entry(andr),       entry(andi),
@@ -920,6 +928,22 @@ name(void)                                                 \
     jit_word_t im = get_imm();                                         \
     jit_##name(r0, r1, im);                                            \
 }
+#define entry_ir_ir_ir_ir(name)                                        \
+static void                                                            \
+name(void)                                                             \
+{                                                                      \
+    jit_gpr_t  r0 = get_ireg(), r1 = get_ireg(),                       \
+               r2 = get_ireg(), r3 = get_ireg();                       \
+    jit_##name(r0, r1, r2, r3);                                        \
+}
+#define entry_ir_ir_ir_im(name)                                        \
+static void                                                            \
+name(void)                                                             \
+{                                                                      \
+    jit_gpr_t  r0 = get_ireg(), r1 = get_ireg(), r2 = get_ireg();      \
+    jit_word_t im = get_imm();                                         \
+    jit_##name(r0, r1, r2, im);                                        \
+}
 #define entry_ir_ir(name)                                              \
 static void                                                            \
 name(void)                                                             \
@@ -1244,8 +1268,12 @@ entry_ir_ir_ir(subr)             entry_ir_ir_im(subi)
 entry_ir_ir_ir(subxr)          entry_ir_ir_im(subxi)
 entry_ir_ir_ir(subcr)          entry_ir_ir_im(subci)
 entry_ir_ir_ir(mulr)           entry_ir_ir_im(muli)
+entry_ir_ir_ir_ir(qmulr)       entry_ir_ir_ir_im(qmuli)
+entry_ir_ir_ir_ir(qmulr_u)     entry_ir_ir_ir_im(qmuli_u)
 entry_ir_ir_ir(divr)           entry_ir_ir_im(divi)
 entry_ir_ir_ir(divr_u)         entry_ir_ir_im(divi_u)
+entry_ir_ir_ir_ir(qdivr)       entry_ir_ir_ir_im(qdivi)
+entry_ir_ir_ir_ir(qdivr_u)     entry_ir_ir_ir_im(qdivi_u)
 entry_ir_ir_ir(remr)           entry_ir_ir_im(remi)
 entry_ir_ir_ir(remr_u)         entry_ir_ir_im(remi_u)
 entry_ir_ir_ir(andr)           entry_ir_ir_im(andi)
diff --git a/check/qalu.inc b/check/qalu.inc
new file mode 100644
index 0000000..a5e893f
--- /dev/null
+++ b/check/qalu.inc
@@ -0,0 +1,97 @@
+.data  8
+ok:
+.c     "ok\n"
+
+/* r0,r1 = r2 op r3 */
+#define QALUR(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R2 I0                                     \
+       movi %R3 I1                                     \
+       OP##r##T %R0 %R1 %R2 %R3                        \
+       bnei OP##T##N##rlo##R0##R1##R2##R3 %R0 LO       \
+       beqi OP##T##N##rhi##R0##R1##R2##R3 %R1 HI       \
+OP##T##N##rlo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##rhi##R0##R1##R2##R3:
+
+/* r0,r1 = r2 op i0 */
+#define QALUI(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R2 I0                                     \
+       movi %R3 HI                                     \
+       OP##i##T %R0 %R1 %R2 I1                         \
+       bnei OP##T##N##ilo##R0##R1##R2##R3 %R0 LO       \
+       beqr OP##T##N##ihi##R0##R1##R2##R3 %R1 %R3      \
+OP##T##N##ilo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##ihi##R0##R1##R2##R3:
+
+/* r0,r1 = r0 op r1 */
+#define QALUX(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R0 I0                                     \
+       movi %R1 I1                                     \
+       movi %R2 LO                                     \
+       movi %R3 HI                                     \
+       OP##r##T %R0 %R1 %R0 %R1                        \
+       bner OP##T##N##0lo##R0##R1##R2##R3 %R0 %R2      \
+       beqr OP##T##N##0hi##R0##R1##R2##R3 %R1 %R3      \
+OP##T##N##0lo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##0hi##R0##R1##R2##R3:
+
+/* r0,r1 = r1 op r0 */
+#define QALUY(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R1 I0                                     \
+       movi %R0 I1                                     \
+       movi %R2 LO                                     \
+       movi %R3 HI                                     \
+       OP##r##T %R0 %R1 %R1 %R0                        \
+       bner OP##T##N##1lo##R0##R1##R2##R3 %R0 %R2      \
+       beqr OP##T##N##1hi##R0##R1##R2##R3 %R1 %R3      \
+OP##T##N##1lo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##1hi##R0##R1##R2##R3:
+
+/* r0,r1 = r0 op r3 */
+#define QALUZ(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R0 I0                                     \
+       movi %R3 I1                                     \
+       movi %R2 LO                                     \
+       OP##r##T %R0 %R1 %R0 %R3                        \
+       bner OP##T##N##2lo##R0##R1##R2##R3 %R0 %R2      \
+       beqi OP##T##N##2hi##R0##R1##R2##R3 %R1 HI       \
+OP##T##N##2lo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##2hi##R0##R1##R2##R3:
+
+/* r0,r1 = r2 op r1 */
+#define QALUW(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       movi %R2 I0                                     \
+       movi %R1 I1                                     \
+       movi %R3 LO                                     \
+       OP##r##T %R0 %R1 %R2 %R1                        \
+       bner OP##T##N##3lo##R0##R1##R2##R3 %R0 %R3      \
+       beqi OP##T##N##3hi##R0##R1##R2##R3 %R1 HI       \
+OP##T##N##3lo##R0##R1##R2##R3:                         \
+       calli @abort                                    \
+OP##T##N##3hi##R0##R1##R2##R3:
+
+#define QALU2(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       QALUR(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALUI(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALUX(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALUY(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALUZ(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALUW(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)
+
+#define QALU1(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3)        \
+       QALU2(N, T, OP, I0, I1, LO, HI, R0, R1, R2, R3) \
+       QALU2(N, T, OP, I0, I1, LO, HI, R1, R2, R3, R0) \
+       QALU2(N, T, OP, I0, I1, LO, HI, R2, R3, R0, R1) \
+       QALU2(N, T, OP, I0, I1, LO, HI, R3, R0, R1, R2)
+
+#define QALU(N, T, OP, I0, I1, LO, HI)                 \
+       QALU1(N, T, OP, I0, I1, LO, HI, v0, v1, v2, r0) \
+       QALU1(N, T, OP, I0, I1, LO, HI, v0, v1, v2, r1) \
+       QALU1(N, T, OP, I0, I1, LO, HI, v0, v1, v2, r2) \
+       QALU1(N, T, OP, I0, I1, LO, HI, v1, v2, r0, r1) \
+       QALU1(N, T, OP, I0, I1, LO, HI, v1, v2, r0, r2) \
+       QALU1(N, T, OP, I0, I1, LO, HI, v2, r0, r1, r2)
diff --git a/check/qalu_div.ok b/check/qalu_div.ok
new file mode 100644
index 0000000..9766475
--- /dev/null
+++ b/check/qalu_div.ok
@@ -0,0 +1 @@
+ok
diff --git a/check/qalu_div.tst b/check/qalu_div.tst
new file mode 100644
index 0000000..198dfbb
--- /dev/null
+++ b/check/qalu_div.tst
@@ -0,0 +1,18 @@
+#include "qalu.inc"
+
+.code
+       prolog
+#define QDIV(N, I0, I1, LO, HI)                QALU(N, , qdiv, I0, I1, LO, HI)
+#define UQDIV(N, I0, I1, LO, HI)       QALU(N, _u, qdiv, I0, I1, LO, HI)
+        QDIV(0, 10, 3, 3, 1)
+        QDIV(1, -33, 9, -3, -6)
+        QDIV(2, -41, -7, 5, -6)
+        QDIV(3, 65536, 4096, 16, 0)
+       UQDIV(4, -1, -2, 1, 1)
+       UQDIV(5, -2, -5, 1, 3)
+       prepare
+               pushargi ok
+               ellipsis
+       finishi @printf
+       ret
+       epilog
diff --git a/check/qalu_mul.ok b/check/qalu_mul.ok
new file mode 100644
index 0000000..9766475
--- /dev/null
+++ b/check/qalu_mul.ok
@@ -0,0 +1 @@
+ok
diff --git a/check/qalu_mul.tst b/check/qalu_mul.tst
new file mode 100644
index 0000000..378d383
--- /dev/null
+++ b/check/qalu_mul.tst
@@ -0,0 +1,25 @@
+#include "qalu.inc"
+
+.code
+       prolog
+#define QMUL(N, I0, I1, LO, HI)                QALU(N, , qmul, I0, I1, LO, HI)
+#define UQMUL(N, I0, I1, LO, HI)       QALU(N, _u, qmul, I0, I1, LO, HI)
+       QMUL(0, -2, -1, 2, 0)
+       QMUL(1, 0, -1, 0, 0)
+       QMUL(2, -1, 0, 0, 0)
+       QMUL(3, 1, -1, -1, -1)
+#if __WORDSIZE == 32
+        QMUL(4, 0x7ffff, 0x7ffff, 0xfff00001, 0x3f)
+       UQMUL(5, 0xffffff, 0xffffff, 0xfe000001, 0xffff)
+        QMUL(6, 0x80000000, -2, 0, 1)
+#else
+        QMUL(4, 0x7ffffffff, 0x7ffffffff, 0xfffffff000000001, 0x3f)
+       UQMUL(5, 0xffffffffff, 0xffffffffff, 0xfffffe0000000001, 0xffff)
+        QMUL(6, 0x8000000000000000, -2, 0, 1)
+#endif
+       prepare
+               pushargi ok
+               ellipsis
+       finishi @printf
+       ret
+       epilog
diff --git a/include/lightning.h b/include/lightning.h
index 52ff539..3694616 100644
--- a/include/lightning.h
+++ b/include/lightning.h
@@ -153,12 +153,24 @@ typedef enum {
 #define jit_mulr(u,v,w)                jit_new_node_www(jit_code_mulr,u,v,w)
 #define jit_muli(u,v,w)                jit_new_node_www(jit_code_muli,u,v,w)
     jit_code_mulr,             jit_code_muli,
+#define jit_qmulr(l,h,v,w)     jit_new_node_qww(jit_code_qmulr,l,h,v,w)
+#define jit_qmuli(l,h,v,w)     jit_new_node_qww(jit_code_qmuli,l,h,v,w)
+    jit_code_qmulr,            jit_code_qmuli,
+#define jit_qmulr_u(l,h,v,w)   jit_new_node_qww(jit_code_qmulr_u,l,h,v,w)
+#define jit_qmuli_u(l,h,v,w)   jit_new_node_qww(jit_code_qmuli_u,l,h,v,w)
+    jit_code_qmulr_u,          jit_code_qmuli_u,
 #define jit_divr(u,v,w)                jit_new_node_www(jit_code_divr,u,v,w)
 #define jit_divi(u,v,w)                jit_new_node_www(jit_code_divi,u,v,w)
     jit_code_divr,             jit_code_divi,
 #define jit_divr_u(u,v,w)      jit_new_node_www(jit_code_divr_u,u,v,w)
 #define jit_divi_u(u,v,w)      jit_new_node_www(jit_code_divi_u,u,v,w)
     jit_code_divr_u,           jit_code_divi_u,
+#define jit_qdivr(l,h,v,w)     jit_new_node_qww(jit_code_qdivr,l,h,v,w)
+#define jit_qdivi(l,h,v,w)     jit_new_node_qww(jit_code_qdivi,l,h,v,w)
+    jit_code_qdivr,            jit_code_qdivi,
+#define jit_qdivr_u(l,h,v,w)   jit_new_node_qww(jit_code_qdivr_u,l,h,v,w)
+#define jit_qdivi_u(l,h,v,w)   jit_new_node_qww(jit_code_qdivi_u,l,h,v,w)
+    jit_code_qdivr_u,          jit_code_qdivi_u,
 #define jit_remr(u,v,w)                jit_new_node_www(jit_code_remr,u,v,w)
 #define jit_remi(u,v,w)                jit_new_node_www(jit_code_remi,u,v,w)
     jit_code_remr,             jit_code_remi,
@@ -839,6 +851,10 @@ extern jit_node_t *_jit_new_node_wd(jit_state_t*, jit_code_t,
 #define jit_new_node_www(c,u,v,w) _jit_new_node_www(_jit,c,u,v,w)
 extern jit_node_t *_jit_new_node_www(jit_state_t*, jit_code_t,
                                     jit_word_t, jit_word_t, jit_word_t);
+#define jit_new_node_qww(c,l,h,v,w) _jit_new_node_qww(_jit,c,l,h,v,w)
+extern jit_node_t *_jit_new_node_qww(jit_state_t*, jit_code_t,
+                                    jit_int32_t, jit_int32_t,
+                                    jit_word_t, jit_word_t);
 #define jit_new_node_wwf(c,u,v,w) _jit_new_node_wwf(_jit,c,u,v,w)
 extern jit_node_t *_jit_new_node_wwf(jit_state_t*, jit_code_t,
                                     jit_word_t, jit_word_t, jit_float32_t);
diff --git a/include/lightning/jit_private.h b/include/lightning/jit_private.h
index b0be3d9..202fc67 100644
--- a/include/lightning/jit_private.h
+++ b/include/lightning/jit_private.h
@@ -103,6 +103,7 @@
 #define jit_cc_a0_reg          0x00000001      /* arg0 is a register */
 #define jit_cc_a0_chg          0x00000002      /* arg0 is modified */
 #define jit_cc_a0_jmp          0x00000004      /* arg0 is a jump target */
+#define jit_cc_a0_rlh          0x00000008      /* arg0 is a register pair */
 #define jit_cc_a0_int          0x00000010      /* arg0 is immediate word */
 #define jit_cc_a0_flt          0x00000020      /* arg0 is immediate float */
 #define jit_cc_a0_dbl          0x00000040      /* arg0 is immediate double */
@@ -170,13 +171,13 @@ typedef struct jit_data_info      jit_data_info_t;
 union jit_data {
     struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-       jit_int32_t     l;
-       jit_int32_t     h;
+       jit_int32_t      l;
+       jit_int32_t      h;
 #else
-       jit_int32_t     h;
-       jit_int32_t     l;
+       jit_int32_t      h;
+       jit_int32_t      l;
 #endif
-    } pair;
+    } q;
     jit_word_t          w;
     jit_float32_t       f;
     jit_float64_t       d;
diff --git a/lib/jit_arm-cpu.c b/lib/jit_arm-cpu.c
index cf1be0f..499616d 100644
--- a/lib/jit_arm-cpu.c
+++ b/lib/jit_arm-cpu.c
@@ -132,7 +132,9 @@ extern unsigned     __aeabi_uidivmod(unsigned, unsigned);
 #  define THUMB_MUL                        0x4340
 #  define THUMB2_MUL                   0xfb00f000
 #  define ARM_UMULL                    0x00800090
+#  define THUMB2_UMULL                 0xfba00000
 #  define ARM_SMULL                    0x00c00090
+#  define THUMB2_SMULL                 0xfb800000
 #  define THUMB2_SDIV                  0xfb90f0f0
 #  define THUMB2_UDIV                  0xfbb0f0f0
 #  define ARM_AND                      0x00000000
@@ -868,6 +870,16 @@ static void _subxi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 static void _mulr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define muli(r0,r1,i0)               _muli(_jit,r0,r1,i0)
 static void _muli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qmulr(r0,r1,r2,r3)           iqmulr(r0,r1,r2,r3,1)
+#  define qmulr_u(r0,r1,r2,r3)         iqmulr(r0,r1,r2,r3,0)
+#  define iqmulr(r0,r1,r2,r3,cc)       _iqmulr(_jit,r0,r1,r2,r3,cc)
+static void _iqmulr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qmuli(r0,r1,r2,i0)           iqmuli(r0,r1,r2,i0,1)
+#  define qmuli_u(r0,r1,r2,i0)         iqmuli(r0,r1,r2,i0,0)
+#  define iqmuli(r0,r1,r2,i0,cc)       _iqmuli(_jit,r0,r1,r2,i0,cc)
+static void _iqmuli(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define divrem(d,s,r0,r1,r2)         _divrem(_jit,d,s,r0,r1,r2)
 static void _divrem(jit_state_t*,int,int,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define divr(r0,r1,r2)               _divr(_jit,r0,r1,r2)
@@ -878,6 +890,16 @@ static void _divi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 static void _divr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define divi_u(r0,r1,i0)             _divi_u(_jit,r0,r1,i0)
 static void _divi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qdivr(r0,r1,r2,r3)           iqdivr(r0,r1,r2,r3,1)
+#  define qdivr_u(r0,r1,r2,r3)         iqdivr(r0,r1,r2,r3,0)
+#  define iqdivr(r0,r1,r2,r3,cc)       _iqdivr(_jit,r0,r1,r2,r3,cc)
+static void _iqdivr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qdivi(r0,r1,r2,i0)           iqdivi(r0,r1,r2,i0,1)
+#  define qdivi_u(r0,r1,r2,i0)         iqdivi(r0,r1,r2,i0,0)
+#  define iqdivi(r0,r1,r2,i0,cc)       _iqdivi(_jit,r0,r1,r2,i0,cc)
+static void _iqdivi(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define remr(r0,r1,r2)               _remr(_jit,r0,r1,r2)
 static void _remr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define remi(r0,r1,i0)               _remi(_jit,r0,r1,i0)
@@ -1963,6 +1985,70 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{
+    jit_int32_t                reg;
+    if (jit_thumb_p()) {
+       if (r2 == r3) {
+           reg = jit_get_reg(jit_class_gpr);
+           movr(rn(reg), r2);
+           if (sign)
+               T2_SMULL(r0, r1, rn(reg), r2);
+           else
+               T2_UMULL(r0, r1, rn(reg), r2);
+           jit_unget_reg(reg);
+       }
+       else if (r0 != r2 && r1 != r2) {
+           if (sign)
+               T2_SMULL(r0, r1, r2, r3);
+           else
+               T2_UMULL(r0, r1, r2, r3);
+       }
+       else {
+           if (sign)
+               T2_SMULL(r0, r1, r3, r2);
+           else
+               T2_UMULL(r0, r1, r3, r2);
+       }
+    }
+    else {
+       if (r2 == r3) {
+           reg = jit_get_reg(jit_class_gpr);
+           movr(rn(reg), r2);
+           if (sign)
+               SMULL(r0, r1, rn(reg), r2);
+           else
+               UMULL(r0, r1, rn(reg), r2);
+           jit_unget_reg(reg);
+       }
+       else if (r0 != r2 && r1 != r2) {
+           if (sign)
+               SMULL(r0, r1, r2, r3);
+           else
+               UMULL(r0, r1, r2, r3);
+       }
+       else {
+           if (sign)
+               SMULL(r0, r1, r3, r2);
+           else
+               UMULL(r0, r1, r3, r2);
+       }
+    }
+}
+
+static void
+_iqmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);
+    movi(rn(reg), i0);
+    iqmulr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);
+}
+
+static void
 _divrem(jit_state_t *_jit, int div, int sign,
        jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
@@ -2032,6 +2118,48 @@ _divi_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqdivr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{
+    jit_word_t         d;
+    jit_word_t         w;
+    jit_get_reg_args();
+    movr(_R0_REGNO, r2);
+    movr(_R1_REGNO, r3);
+    if (sign)                  w = (jit_word_t)__aeabi_idivmod;
+    else                       w = (jit_word_t)__aeabi_uidivmod;
+    if (!jit_exchange_p()) {
+       if (jit_thumb_p())      d = ((w - _jit->pc.w) >> 1) - 2;
+       else                    d = ((w - _jit->pc.w) >> 2) - 2;
+       if (_s24P(d)) {
+           if (jit_thumb_p())  T2_BLI(encode_thumb_jump(d));
+           else                BLI(d & 0x00ffffff);
+       }
+       else                    goto fallback;
+    }
+    else {
+    fallback:
+       movi(_R2_REGNO, w);
+       if (jit_thumb_p())      T1_BLX(_R2_REGNO);
+       else                    BLX(_R2_REGNO);
+    }
+    movr(r0, _R0_REGNO);
+    movr(r1, _R1_REGNO);
+    jit_unget_reg_args();
+}
+
+static void
+_iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);
+    movi(rn(reg), i0);
+    iqdivr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);
+}
+
+static void
 _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
     divrem(0, 1, r0, r1, r2);
diff --git a/lib/jit_arm.c b/lib/jit_arm.c
index 95cbfac..d27d796 100644
--- a/lib/jit_arm.c
+++ b/lib/jit_arm.c
@@ -849,6 +849,11 @@ _emit_code(jit_state_t *_jit)
                name##r##type(rn(node->u.w),                            \
                              rn(node->v.w), rn(node->w.w));            \
                break
+#define case_rrrr(name, type)                                          \
+           case jit_code_##name##r##type:                              \
+               name##r##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), rn(node->w.w));            \
+               break
 #define case_vvv(name, type)                                           \
            case jit_code_##name##r##type:                              \
                if (jit_swf_p())                                        \
@@ -862,6 +867,11 @@ _emit_code(jit_state_t *_jit)
            case jit_code_##name##i##type:                              \
                name##i##type(rn(node->u.w), rn(node->v.w), node->w.w); \
                break
+#define case_rrrw(name, type)                                          \
+           case jit_code_##name##i##type:                              \
+               name##i##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), node->w.w);                \
+               break
 #define case_vvw(name, type)                                           \
            case jit_code_##name##i##type:                              \
                if (jit_swf_p())                                        \
@@ -1036,10 +1046,18 @@ _emit_code(jit_state_t *_jit)
                case_rrw(subx,);
                case_rrr(mul,);
                case_rrw(mul,);
+               case_rrrr(qmul,);
+               case_rrrw(qmul,);
+               case_rrrr(qmul, _u);
+               case_rrrw(qmul, _u);
                case_rrr(div,);
                case_rrw(div,);
                case_rrr(div, _u);
                case_rrw(div, _u);
+               case_rrrr(qdiv,);
+               case_rrrw(qdiv,);
+               case_rrrr(qdiv, _u);
+               case_rrrw(qdiv, _u);
                case_rrr(rem,);
                case_rrw(rem,);
                case_rrr(rem, _u);
diff --git a/lib/jit_mips-cpu.c b/lib/jit_mips-cpu.c
index 3113260..fc5356b 100644
--- a/lib/jit_mips-cpu.c
+++ b/lib/jit_mips-cpu.c
@@ -392,6 +392,16 @@ static void _subxi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 static void _mulr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define muli(r0,r1,i0)               _muli(_jit,r0,r1,i0)
 static void _muli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qmulr(r0,r1,r2,r3)           iqmulr(r0,r1,r2,r3,1)
+#  define qmulr_u(r0,r1,r2,r3)         iqmulr(r0,r1,r2,r3,0)
+#  define iqmulr(r0,r1,r2,r3,cc)       _iqmulr(_jit,r0,r1,r2,r3,cc)
+static void _iqmulr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qmuli(r0,r1,r2,i0)           iqmuli(r0,r1,r2,i0,1)
+#  define qmuli_u(r0,r1,r2,i0)         iqmuli(r0,r1,r2,i0,0)
+#  define iqmuli(r0,r1,r2,i0,cc)       _iqmuli(_jit,r0,r1,r2,i0,cc)
+static void _iqmuli(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define divr(r0,r1,r2)               _divr(_jit,r0,r1,r2)
 static void _divr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define divi(r0,r1,i0)               _divi(_jit,r0,r1,i0)
@@ -400,6 +410,16 @@ static void _divi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 static void _divr_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define divi_u(r0,r1,i0)             _divi_u(_jit,r0,r1,i0)
 static void _divi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qdivr(r0,r1,r2,r3)           iqdivr(r0,r1,r2,r3,1)
+#  define qdivr_u(r0,r1,r2,r3)         iqdivr(r0,r1,r2,r3,0)
+#  define iqdivr(r0,r1,r2,r3,cc)       _iqdivr(_jit,r0,r1,r2,r3,cc)
+static void _iqdivr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qdivi(r0,r1,r2,i0)           iqdivi(r0,r1,r2,i0,1)
+#  define qdivi_u(r0,r1,r2,i0)         iqdivi(r0,r1,r2,i0,0)
+#  define iqdivi(r0,r1,r2,i0,cc)       _iqdivi(_jit,r0,r1,r2,i0,cc)
+static void _iqdivi(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define remr(r0,r1,r2)               _remr(_jit,r0,r1,r2)
 static void _remr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define remi(r0,r1,i0)               _remi(_jit,r0,r1,i0)
@@ -915,6 +935,29 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{
+    if (sign)
+       MULT(r2, r3);
+    else
+       MULTU(r2, r3);
+    MFLO(r0);
+    MFHI(r1);
+}
+
+static void
+_iqmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);
+    movi(rn(reg), i0);
+    iqmulr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);
+}
+
+static void
 _divr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
     DIV(r1, r2);
@@ -949,6 +992,29 @@ _divi_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqdivr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{   /* emit r2 / r3; r0 receives LO (quotient), r1 receives HI (remainder) */
+    if (sign)
+       DIV(r2, r3);    /* signed variant */
+    else
+       DIVU(r2, r3);   /* unsigned variant */
+    MFLO(r0);          /* LO -> r0 */
+    MFHI(r1);          /* HI -> r1 */
+}
+
+static void
+_iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{   /* immediate form: load i0 into a scratch register, then defer to iqdivr */
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);  /* scratch register for the divisor */
+    movi(rn(reg), i0);
+    iqdivr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);        /* release the scratch register */
+}
+
+static void
 _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
     DIV(r1, r2);
diff --git a/lib/jit_mips.c b/lib/jit_mips.c
index 57be725..4c606c4 100644
--- a/lib/jit_mips.c
+++ b/lib/jit_mips.c
@@ -704,6 +704,16 @@ _emit_code(jit_state_t *_jit)
            case jit_code_##name##i##type:                              \
                name##i##type(rn(node->u.w), rn(node->v.w), node->w.w); \
                break
+#define case_rrrr(name, type)  /* dest (lo,hi) pair, two reg sources */ \
+           case jit_code_##name##r##type:                              \
+               name##r##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), rn(node->w.w));            \
+               break
+#define case_rrrw(name, type)  /* dest (lo,hi) pair, reg + immediate */ \
+           case jit_code_##name##i##type:                              \
+               name##i##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), node->w.w);                \
+               break
 #define case_rrf(name, type, size)                                     \
            case jit_code_##name##i##type:                              \
                assert(node->flag & jit_flag_data);                     \
@@ -785,10 +795,18 @@ _emit_code(jit_state_t *_jit)
                case_rrw(subx,);
                case_rrr(mul,);
                case_rrw(mul,);
+               case_rrrr(qmul,);
+               case_rrrw(qmul,);
+               case_rrrr(qmul, _u);
+               case_rrrw(qmul, _u);
                case_rrr(div,);
                case_rrw(div,);
                case_rrr(div, _u);
                case_rrw(div, _u);
+               case_rrrr(qdiv,);
+               case_rrrw(qdiv,);
+               case_rrrr(qdiv, _u);
+               case_rrrw(qdiv, _u);
                case_rrr(rem,);
                case_rrw(rem,);
                case_rrr(rem, _u);
diff --git a/lib/jit_ppc-cpu.c b/lib/jit_ppc-cpu.c
index 7f8a835..56bf56a 100644
--- a/lib/jit_ppc-cpu.c
+++ b/lib/jit_ppc-cpu.c
@@ -390,12 +390,32 @@ static void _subxi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  define mulr(r0,r1,r2)               MULLW(r0,r1,r2)
 #  define muli(r0,r1,i0)               _muli(_jit,r0,r1,i0)
 static void _muli(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qmulr(r0,r1,r2,r3)           iqmulr(r0,r1,r2,r3,1)
+#  define qmulr_u(r0,r1,r2,r3)         iqmulr(r0,r1,r2,r3,0)
+#  define iqmulr(r0,r1,r2,r3,cc)       _iqmulr(_jit,r0,r1,r2,r3,cc)
+static void _iqmulr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qmuli(r0,r1,r2,i0)           iqmuli(r0,r1,r2,i0,1)
+#  define qmuli_u(r0,r1,r2,i0)         iqmuli(r0,r1,r2,i0,0)
+#  define iqmuli(r0,r1,r2,i0,cc)       _iqmuli(_jit,r0,r1,r2,i0,cc)
+static void _iqmuli(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define divr(r0,r1,r2)               DIVW(r0,r1,r2)
 #  define divi(r0,r1,i0)               _divi(_jit,r0,r1,i0)
 static void _divi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
 #  define divr_u(r0,r1,r2)             DIVWU(r0,r1,r2)
 #  define divi_u(r0,r1,i0)             _divi_u(_jit,r0,r1,i0)
 static void _divi_u(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t);
+#  define qdivr(r0,r1,r2,r3)           iqdivr(r0,r1,r2,r3,1)
+#  define qdivr_u(r0,r1,r2,r3)         iqdivr(r0,r1,r2,r3,0)
+#  define iqdivr(r0,r1,r2,r3,cc)       _iqdivr(_jit,r0,r1,r2,r3,cc)
+static void _iqdivr(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_int32_t,jit_bool_t);
+#  define qdivi(r0,r1,r2,i0)           iqdivi(r0,r1,r2,i0,1)
+#  define qdivi_u(r0,r1,r2,i0)         iqdivi(r0,r1,r2,i0,0)
+#  define iqdivi(r0,r1,r2,i0,cc)       _iqdivi(_jit,r0,r1,r2,i0,cc)
+static void _iqdivi(jit_state_t*,jit_int32_t,jit_int32_t,
+                   jit_int32_t,jit_word_t,jit_bool_t);
 #  define remr(r0,r1,r2)               _remr(_jit,r0,r1,r2)
 static void _remr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t);
 #  define remi(r0,r1,i0)               _remi(_jit,r0,r1,i0)
@@ -872,6 +892,38 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{   /* emit r2 * r3; r0 receives the low word, r1 the high word */
+    jit_int32_t                reg;
+    if (r0 == r2 || r0 == r3) {        /* r0 aliases an operand still needed below */
+       reg = jit_get_reg(jit_class_gpr);
+       MULLW(rn(reg), r2, r3); /* low word into a temporary */
+    }
+    else
+       MULLW(r0, r2, r3);      /* low word directly into r0 */
+    if (sign)
+       MULLH(r1, r2, r3);      /* high word, signed */
+    else
+       MULLHU(r1, r2, r3);     /* high word, unsigned */
+    if (r0 == r2 || r0 == r3) {        /* operands no longer needed: commit low word */
+       movr(r0, rn(reg));
+       jit_unget_reg(reg);     /* plain call: do not assign to the handle */
+    }
+}
+
+static void
+_iqmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{   /* immediate form: load i0 into a scratch register, then defer to iqmulr */
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);  /* scratch register for the immediate */
+    movi(rn(reg), i0);
+    iqmulr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);        /* release the scratch register */
+}
+
+static void
 _divi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 {
     jit_int32_t                reg;
@@ -892,6 +944,53 @@ _divi_u(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
 }
 
 static void
+_iqdivr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{   /* r0 <- r2 / r3, r1 <- remainder, rebuilt as r2 - r3 * quotient */
+    jit_int32_t                sv0, rg0;
+    jit_int32_t                sv1, rg1;
+
+    if (r0 == r2 || r0 == r3) {        /* r0 aliases an input: use a temporary */
+       sv0 = jit_get_reg(jit_class_gpr);
+       rg0 = rn(sv0);
+    }
+    else
+       rg0 = r0;
+    if (r1 == r2 || r1 == r3) {        /* r1 aliases an input: use a temporary */
+       sv1 = jit_get_reg(jit_class_gpr);
+       rg1 = rn(sv1);
+    }
+    else
+       rg1 = r1;
+
+    if (sign)
+       divr(rg0, r2, r3);      /* signed quotient */
+    else
+       divr_u(rg0, r2, r3);    /* unsigned quotient */
+    mulr(rg1, r3, rg0);                /* rg1 = divisor * quotient (quotient lives in rg0) */
+    subr(rg1, r2, rg1);                /* rg1 = dividend - divisor * quotient */
+    if (rg0 != r0) {           /* commit quotient, release temporary */
+       movr(r0, rg0);
+       jit_unget_reg(sv0);
+    }
+    if (rg1 != r1) {           /* commit remainder, release temporary */
+       movr(r1, rg1);
+       jit_unget_reg(sv1);
+    }
+}
+
+static void
+_iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{   /* immediate form: load i0 into a scratch register, then defer to iqdivr */
+    jit_int32_t                reg;
+    reg = jit_get_reg(jit_class_gpr);  /* scratch register for the divisor */
+    movi(rn(reg), i0);
+    iqdivr(r0, r1, r2, rn(reg), sign);
+    jit_unget_reg(reg);        /* release the scratch register */
+}
+
+static void
 _remr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
     jit_int32_t                reg;
diff --git a/lib/jit_ppc.c b/lib/jit_ppc.c
index aaa2035..dafc36b 100644
--- a/lib/jit_ppc.c
+++ b/lib/jit_ppc.c
@@ -723,10 +723,20 @@ _emit_code(jit_state_t *_jit)
                name##r##type(rn(node->u.w),                            \
                              rn(node->v.w), rn(node->w.w));            \
                break
+#define case_rrrr(name, type)  /* dest (lo,hi) pair, two reg sources */ \
+           case jit_code_##name##r##type:                              \
+               name##r##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), rn(node->w.w));            \
+               break
 #define case_rrw(name, type)                                           \
            case jit_code_##name##i##type:                              \
                name##i##type(rn(node->u.w), rn(node->v.w), node->w.w); \
                break
+#define case_rrrw(name, type)  /* dest (lo,hi) pair, reg + immediate */ \
+           case jit_code_##name##i##type:                              \
+               name##i##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), node->w.w);                \
+               break
 #define case_rrf(name, type, size)                                     \
            case jit_code_##name##i##type:                              \
                assert(node->flag & jit_flag_data);                     \
@@ -808,10 +818,18 @@ _emit_code(jit_state_t *_jit)
                case_rrw(subx,);
                case_rrr(mul,);
                case_rrw(mul,);
+               case_rrrr(qmul,);
+               case_rrrw(qmul,);
+               case_rrrr(qmul, _u);
+               case_rrrw(qmul, _u);
                case_rrr(div,);
                case_rrw(div,);
                case_rrr(div, _u);
                case_rrw(div, _u);
+               case_rrrr(qdiv,);
+               case_rrrw(qdiv,);
+               case_rrrr(qdiv, _u);
+               case_rrrw(qdiv, _u);
                case_rrr(rem,);
                case_rrw(rem,);
                case_rrr(rem, _u);
diff --git a/lib/jit_print.c b/lib/jit_print.c
index fe4b165..ee7e08d 100644
--- a/lib/jit_print.c
+++ b/lib/jit_print.c
@@ -48,8 +48,12 @@ static char *code_name[] = {
     "subcr",           "subci",
     "subxr",           "subxi",
     "mulr",            "muli",
+    "qmulr",           "qmuli",
+    "qmulr_u",         "qmuli_u",
     "divr",            "divi",
     "divr_u",          "divi_u",
+    "qdivr",           "qdivi",
+    "qdivr_u",         "qdivi_u",
     "remr",            "remi",
     "remr_u",          "remi_u",
     "andr",            "andi",
@@ -248,7 +252,7 @@ _jit_print(jit_state_t *_jit)
            continue;
        }
        value = jit_classify(node->code) &
-           (jit_cc_a0_int|jit_cc_a0_jmp|jit_cc_a0_reg|
+           (jit_cc_a0_int|jit_cc_a0_jmp|jit_cc_a0_reg|jit_cc_a0_rlh|
             jit_cc_a1_reg|jit_cc_a1_int|jit_cc_a1_flt|jit_cc_a1_dbl|
             jit_cc_a2_reg|jit_cc_a2_int|jit_cc_a2_flt|jit_cc_a2_dbl);
        if (value & jit_cc_a0_jmp)
@@ -303,6 +307,16 @@ _jit_print(jit_state_t *_jit)
            print_chr(' ');             print_reg(node->u.w);
            print_chr(' ');             print_reg(node->v.w);
            print_chr(' ');             print_hex(node->w.w);   continue;
+       q_r_r:
+           print_str(" (");            print_reg(node->u.q.l);
+           print_chr(' ');             print_reg(node->u.q.h);
+           print_str(") ");            print_reg(node->v.w);
+           print_chr(' ');             print_reg(node->w.w);   continue;
+       q_r_w:
+           print_str(" (");            print_reg(node->u.q.l);
+           print_chr(' ');             print_reg(node->u.q.h);
+           print_str(") ");            print_reg(node->v.w);
+           print_chr(' ');             print_hex(node->w.w);   continue;
        r_r_f:
            print_chr(' ');             print_reg(node->u.w);
            print_chr(' ');             print_reg(node->v.w);
@@ -419,6 +433,12 @@ _jit_print(jit_state_t *_jit)
                        goto r_r_r;
                    case jit_cc_a0_reg|jit_cc_a1_reg|jit_cc_a2_int:
                        goto r_r_w;
+                   case jit_cc_a0_reg|jit_cc_a0_rlh|
+                        jit_cc_a1_reg|jit_cc_a2_reg:
+                       goto q_r_r;
+                   case jit_cc_a0_reg|jit_cc_a0_rlh|
+                        jit_cc_a1_reg|jit_cc_a2_int:
+                       goto q_r_w;
                    case jit_cc_a0_reg|jit_cc_a1_reg|jit_cc_a2_flt:
                        goto r_r_f;
                    case jit_cc_a0_reg|jit_cc_a1_reg|jit_cc_a2_dbl:
diff --git a/lib/jit_x86-cpu.c b/lib/jit_x86-cpu.c
index 9ba93c3..708c680 100644
--- a/lib/jit_x86-cpu.c
+++ b/lib/jit_x86-cpu.c
@@ -158,6 +158,10 @@ static void _alur(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
 #  define alui(code, r0, i0)           _alui(_jit, code, r0, i0)
 static void _alui(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
 #  define iaddr(r0, r1)                        alur(X86_ADD, r0, r1)
+#  define save(r0)                     _save(_jit, r0)
+static void _save(jit_state_t*, jit_int32_t);
+#  define load(r0)                     _load(_jit, r0)
+static void _load(jit_state_t*, jit_int32_t);
 #  define addr(r0, r1, r2)             _addr(_jit, r0, r1, r2)
 static void _addr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
 #  define iaddi(r0, i0)                        alui(X86_ADD, r0, i0)
@@ -197,10 +201,22 @@ static void _imuli(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
 static void _mulr(jit_state_t*, jit_int32_t, jit_int32_t, jit_int32_t);
 #  define muli(r0, r1, i0)             _muli(_jit, r0, r1, i0)
 static void _muli(jit_state_t*, jit_int32_t, jit_int32_t, jit_word_t);
-#  define idivr(r0)                    unr(X86_IDIV, r0)
-#  define idivr_u(r0)                  unr(X86_DIV, r0)
+#  define umulr(r0)                    unr(X86_IMUL, r0)
+#  define umulr_u(r0)                  unr(X86_MUL, r0)
+#  define qmulr(r0, r1, r2, r3)                _iqmulr(_jit, r0, r1, r2, r3, 1)
+#  define qmulr_u(r0, r1, r2, r3)      _iqmulr(_jit, r0, r1, r2, r3, 0)
+#  define iqmulr(r0, r1, r2, r3, sign) _iqmulr(_jit, r0, r1, r2, r3, sign)
+static void _iqmulr(jit_state_t*, jit_int32_t, jit_int32_t,
+                   jit_int32_t,jit_int32_t, jit_bool_t);
+#  define qmuli(r0, r1, r2, i0)                _iqmuli(_jit, r0, r1, r2, i0, 1)
+#  define qmuli_u(r0, r1, r2, i0)      _iqmuli(_jit, r0, r1, r2, i0, 0)
+#  define iqmuli(r0, r1, r2, i0, sign) _iqmuli(_jit, r0, r1, r2, i0, sign)
+static void _iqmuli(jit_state_t*, jit_int32_t, jit_int32_t,
+                   jit_int32_t,jit_word_t, jit_bool_t);
 #  define sign_extend_rdx_rax()                _sign_extend_rdx_rax(_jit)
 static void _sign_extend_rdx_rax(jit_state_t*);
+#  define idivr(r0)                    unr(X86_IDIV, r0)
+#  define idivr_u(r0)                  unr(X86_DIV, r0)
 #  define divremr(r0, r1, r2, i0, i1)  _divremr(_jit, r0, r1, r2, i0, i1)
 static void
 _divremr(jit_state_t*,jit_int32_t,jit_int32_t,jit_int32_t,
@@ -212,6 +228,16 @@ _divremi(jit_state_t*,jit_int32_t,jit_int32_t,jit_word_t,jit_bool_t,jit_bool_t);
 #  define divi(r0, r1, i0)             divremi(r0, r1, i0, 1, 1)
 #  define divr_u(r0, r1, r2)           divremr(r0, r1, r2, 0, 1)
 #  define divi_u(r0, r1, i0)           divremi(r0, r1, i0, 0, 1)
+#  define qdivr(r0, r1, r2, r3)                _iqdivr(_jit, r0, r1, r2, r3, 1)
+#  define qdivr_u(r0, r1, r2, r3)      _iqdivr(_jit, r0, r1, r2, r3, 0)
+#  define iqdivr(r0, r1, r2, r3, sign) _iqdivr(_jit, r0, r1, r2, r3, sign)
+static void _iqdivr(jit_state_t*, jit_int32_t, jit_int32_t,
+                   jit_int32_t,jit_int32_t, jit_bool_t);
+#  define qdivi(r0, r1, r2, i0)                _iqdivi(_jit, r0, r1, r2, i0, 1)
+#  define qdivi_u(r0, r1, r2, i0)      _iqdivi(_jit, r0, r1, r2, i0, 0)
+#  define iqdivi(r0, r1, r2, i0, sign) _iqdivi(_jit, r0, r1, r2, i0, sign)
+static void _iqdivi(jit_state_t*, jit_int32_t, jit_int32_t,
+                   jit_int32_t,jit_word_t, jit_bool_t);
 #  define remr(r0, r1, r2)             divremr(r0, r1, r2, 1, 0)
 #  define remi(r0, r1, i0)             divremi(r0, r1, i0, 1, 0)
 #  define remr_u(r0, r1, r2)           divremr(r0, r1, r2, 0, 0)
@@ -835,6 +861,27 @@ _alui(jit_state_t *_jit, jit_int32_t code, jit_int32_t r0, jit_word_t i0)
 }
 
 static void
+_save(jit_state_t *_jit, jit_int32_t r0)
+{   /* store r0 into its per-function slot and mark it in regsav */
+    if (!_jit->function->regoff[r0]) { /* no slot recorded yet: allocate one */
+       _jit->function->regoff[r0] = jit_allocai(sizeof(jit_word_t));
+       _jit->again = 1;        /* NOTE(review): presumably requests a further pass -- confirm */
+    }
+    assert(!jit_regset_tstbit(_jit->regsav, r0));      /* must not already be saved */
+    jit_regset_setbit(_jit->regsav, r0);
+    stxi(_jit->function->regoff[r0], _RBP_REGNO, r0);  /* slot is addressed off RBP */
+}
+
+static void
+_load(jit_state_t *_jit, jit_int32_t r0)
+{   /* counterpart of _save: reload r0 from its slot and clear its regsav bit */
+    assert(_jit->function->regoff[r0]);                        /* a slot must exist */
+    assert(jit_regset_tstbit(_jit->regsav, r0));       /* and r0 must be marked saved */
+    jit_regset_clrbit(_jit->regsav, r0);
+    ldxi(r0, _RBP_REGNO, _jit->function->regoff[r0]);  /* slot is addressed off RBP */
+}
+
+static void
 _addr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
 {
     if (r0 == r1)
@@ -1130,6 +1177,99 @@ _muli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0)
     }
 }
 
+#define savset(rn)     /* r0 dest, r1/r2 sources; uses locals sav and set */   \
+    if (r0 != rn) {    /* rn is not the destination: candidate for spill */    \
+       sav |= 1 << rn;                                                 \
+       if (r1 != rn && r2 != rn)       /* nor a source: reserve it too */      \
+           set |= 1 << rn;                                             \
+    }
+#define isavset(rn)    /* immediate form: r0 dest, r1 the only reg source */   \
+    if (r0 != rn) {    /* rn is not the destination: candidate for spill */    \
+       sav |= 1 << rn;                                                 \
+       if (r1 != rn)           /* nor the source: reserve it too */            \
+           set |= 1 << rn;                                             \
+    }
+#define qsavset(rn)    /* q form: r0/r1 dests, r2/r3 sources */                \
+    if (r0 != rn && r1 != rn) {        /* rn is neither destination */         \
+       sav |= 1 << rn;                                                 \
+       if (r2 != rn && r3 != rn)       /* nor a source: reserve it too */      \
+           set |= 1 << rn;                                             \
+    }
+#define allocr(rn, rv) /* act on the set/sav bits computed for rn */           \
+    if (set & (1 << rn))       /* reserve the named register */                \
+       (void)jit_get_reg(rv|jit_class_gpr|jit_class_named);            \
+    if (sav & (1 << rn)) {                                             \
+       if ( jit_regset_tstbit(_jit->regsav, rv) ||                     \
+           !jit_regset_tstbit(_jit->reglive, rv))                      \
+           sav &= ~(1 << rn);  /* already saved or not live: skip spill */     \
+       else                                                            \
+           save(rv);                                                   \
+    }
+#define clear(rn, rv)  /* undo allocr: release reservation, reload spill */    \
+    if (set & (1 << rn))                                               \
+       jit_unget_reg(rv);                                              \
+    if (sav & (1 << rn))                                               \
+       load(rv);
+static void
+_iqmulr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{   /* emit r2 * r3 through RAX/RDX; r0 is loaded from RAX, r1 from RDX */
+    jit_int32_t                mul;
+    jit_int32_t                sav;
+    jit_int32_t                set;
+
+    sav = set = 0;
+    qsavset(_RDX_REGNO);       /* work out which of RDX/RAX to spill and/or reserve */
+    qsavset(_RAX_REGNO);
+    allocr(_RDX_REGNO, _RDX);
+    allocr(_RAX_REGNO, _RAX);
+
+    if (r3 == _RAX_REGNO)      /* one factor already sits in RAX: multiply by the other */
+       mul = r2;
+    else {
+       mul = r3;
+       movr(_RAX_REGNO, r2);
+    }
+    if (sign)
+       umulr(mul);     /* unr(X86_IMUL, mul) */
+    else
+       umulr_u(mul);   /* unr(X86_MUL, mul) */
+
+    if (r0 == _RDX_REGNO && r1 == _RAX_REGNO)  /* destinations are exactly swapped */
+       xchgr(_RAX_REGNO, _RDX_REGNO);
+    else {
+       if (r0 != _RDX_REGNO)
+           movr(r0, _RAX_REGNO);
+       movr(r1, _RDX_REGNO);   /* read RDX before it is overwritten when r0 == RDX */
+       if (r0 == _RDX_REGNO)
+           movr(r0, _RAX_REGNO);
+    }
+
+    clear(_RDX_REGNO, _RDX);   /* release reservations / reload spilled values */
+    clear(_RAX_REGNO, _RAX);
+}
+
+static void
+_iqmuli(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{   /* immediate form of the lo/hi multiply */
+    jit_int32_t                reg;
+
+    if (i0 == 0) {     /* multiplying by zero: both result words are zero */
+       ixorr(r0, r0);
+       ixorr(r1, r1);
+    }
+    else {
+       reg = jit_get_reg(jit_class_gpr);       /* scratch register for the immediate */
+       movi(rn(reg), i0);
+       if (sign)
+           qmulr(r0, r1, r2, rn(reg));
+       else
+           qmulr_u(r0, r1, r2, rn(reg));
+       jit_unget_reg(reg);     /* release the scratch register */
+    }
+}
+
 static void
 _sign_extend_rdx_rax(jit_state_t *_jit)
 {
@@ -1144,17 +1284,14 @@ _divremr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2,
     jit_int32_t                div;
     jit_int32_t                reg;
     jit_int32_t                set;
+    jit_int32_t                sav;
     jit_int32_t                use;
 
-    set = use = 0;
-    if (r0 != _RDX_REGNO && r1 != _RDX_REGNO && r2 != _RDX_REGNO)
-       set |= 1 << _RDX_REGNO;
-    if (r0 != _RAX_REGNO && r1 != _RAX_REGNO && r2 != _RAX_REGNO)
-       set |= 1 << _RAX_REGNO;
-    if (set & (1 <<_RDX_REGNO))
-       (void)jit_get_reg(_RDX|jit_class_gpr|jit_class_named);
-    if (set & (1 << _RAX_REGNO))
-       (void)jit_get_reg(_RAX|jit_class_gpr|jit_class_named);
+    sav = set = use = 0;
+    savset(_RDX_REGNO);
+    savset(_RAX_REGNO);
+    allocr(_RDX_REGNO, _RDX);
+    allocr(_RAX_REGNO, _RAX);
 
     if (r2 == _RAX_REGNO) {
        if (r0 == _RAX_REGNO || r0 == _RDX_REGNO) {
@@ -1215,18 +1352,13 @@ _divremr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2,
     if (use)
        jit_unget_reg(reg);
 
-    if (r0 != _RAX_REGNO) {
-       if (divide)
-           movr(r0, _RAX_REGNO);
-    }
-    if (r0 != _RDX_REGNO) {
-       if (!divide)
-           movr(r0, _RDX_REGNO);
-    }
-    if (set & (1 <<_RDX_REGNO))
-       jit_unget_reg(_RDX);
-    if (set & (1 << _RAX_REGNO))
-       jit_unget_reg(_RAX);
+    if (divide)
+       movr(r0, _RAX_REGNO);
+    else
+       movr(r0, _RDX_REGNO);
+
+    clear(_RDX_REGNO, _RDX);
+    clear(_RAX_REGNO, _RAX);
 }
 
 static void
@@ -1235,6 +1367,7 @@ _divremi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0,
 {
     jit_int32_t                reg;
     jit_int32_t                div;
+    jit_int32_t                sav;
     jit_int32_t                set;
     jit_int32_t                use;
 
@@ -1283,15 +1416,11 @@ _divremi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0,
        return;
     }
 
-    set = use = 0;
-    if (r0 != _RDX_REGNO && r1 != _RDX_REGNO)
-       set |= 1 << _RDX_REGNO;
-    if (r0 != _RAX_REGNO && r1 != _RAX_REGNO)
-       set |= 1 << _RAX_REGNO;
-    if (set & (1 <<_RDX_REGNO))
-       (void)jit_get_reg(_RDX|jit_class_gpr|jit_class_named);
-    if (set & (1 << _RAX_REGNO))
-       (void)jit_get_reg(_RAX|jit_class_gpr|jit_class_named);
+    sav = set = use = 0;
+    isavset(_RDX_REGNO);
+    isavset(_RAX_REGNO);
+    allocr(_RDX_REGNO, _RDX);
+    allocr(_RAX_REGNO, _RAX);
 
     if (r0 == _RAX_REGNO || r0 == _RDX_REGNO || r0 == r1) {
        if ((reg = jit_get_reg(jit_class_gpr|jit_class_chk)) == JIT_NOREG)
@@ -1318,19 +1447,118 @@ _divremi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_word_t i0,
     if (use)
        jit_unget_reg(reg);
 
-    if (r0 != _RAX_REGNO) {
-       if (divide)
-           movr(r0, _RAX_REGNO);
+    if (divide)
+       movr(r0, _RAX_REGNO);
+    else
+       movr(r0, _RDX_REGNO);
+
+    clear(_RDX_REGNO, _RDX);
+    clear(_RAX_REGNO, _RAX);
+}
+
+static void
+_iqdivr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_int32_t r3, jit_bool_t sign)
+{
+    jit_int32_t                div;
+    jit_int32_t                reg;
+    jit_int32_t                sav;
+    jit_int32_t                set;
+    jit_int32_t                use;
+
+    sav = set = use = 0;
+    qsavset(_RDX_REGNO);
+    qsavset(_RAX_REGNO);
+    allocr(_RDX_REGNO, _RDX);
+    allocr(_RAX_REGNO, _RAX);
+    if (r3 == _RAX_REGNO) {
+       if (r0 == _RAX_REGNO || r0 == _RDX_REGNO) {
+           if ((reg = jit_get_reg(jit_class_gpr|jit_class_chk)) == JIT_NOREG)
+               reg = jit_get_reg((r1 == _RCX_REGNO ? _RBX : _RCX) |
+                                 jit_class_gpr|jit_class_named);
+           use = 1;
+           div = rn(reg);
+           movr(div, _RAX_REGNO);
+           if (r2 != _RAX_REGNO)
+               movr(_RAX_REGNO, r2);
+       }
+       else {
+           if (r0 == r2)
+               xchgr(r0, _RAX_REGNO);
+           else {
+               if (r0 != _RAX_REGNO)
+                   movr(r0, _RAX_REGNO);
+               if (r2 != _RAX_REGNO)
+                   movr(_RAX_REGNO, r2);
+           }
+           div = r0;
+       }
     }
-    if (r0 != _RDX_REGNO) {
-       if (!divide)
+    else if (r3 == _RDX_REGNO) {
+       if (r0 == _RAX_REGNO || r0 == _RDX_REGNO) {
+           if ((reg = jit_get_reg(jit_class_gpr|jit_class_chk)) == JIT_NOREG)
+               reg = jit_get_reg((r1 == _RCX_REGNO ? _RBX : _RCX) |
+                                 jit_class_gpr|jit_class_named);
+           use = 1;
+           div = rn(reg);
+           movr(div, _RDX_REGNO);
+           if (r2 != _RAX_REGNO)
+               movr(_RAX_REGNO, r2);
+       }
+       else {
+           if (r2 != _RAX_REGNO)
+               movr(_RAX_REGNO, r2);
            movr(r0, _RDX_REGNO);
+           div = r0;
+       }
+    }
+    else {
+       if (r2 != _RAX_REGNO)
+           movr(_RAX_REGNO, r2);
+       div = r3;
+    }
+    if (sign) {
+       sign_extend_rdx_rax();
+       idivr(div);
+    }
+    else {
+       ixorr(_RDX_REGNO, _RDX_REGNO);
+       idivr_u(div);
+    }
+    if (use)
+       jit_unget_reg(reg);
+
+    if (r0 == _RDX_REGNO && r1 == _RAX_REGNO)
+       xchgr(_RAX_REGNO, _RDX_REGNO);
+    else {
+       if (r0 != _RDX_REGNO)
+           movr(r0, _RAX_REGNO);
+       movr(r1, _RDX_REGNO);
+       if (r0 == _RDX_REGNO)
+           movr(r0, _RAX_REGNO);
     }
-    if (set & (1 <<_RDX_REGNO))
-       jit_unget_reg(_RDX);
-    if (set & (1 << _RAX_REGNO))
-       jit_unget_reg(_RAX);
+
+    clear(_RDX_REGNO, _RDX);
+    clear(_RAX_REGNO, _RAX);
+}
+
+static void
+_iqdivi(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
+       jit_int32_t r2, jit_word_t i0, jit_bool_t sign)
+{   /* immediate form: load i0 into a scratch register, then use the register form */
+    jit_int32_t                reg;
+
+    reg = jit_get_reg(jit_class_gpr);  /* scratch register for the divisor */
+    movi(rn(reg), i0);
+    if (sign)
+       qdivr(r0, r1, r2, rn(reg));
+    else
+       qdivr_u(r0, r1, r2, rn(reg));
+    jit_unget_reg(reg);        /* release the scratch register */
 }
+#undef clear
+#undef allocr
+#undef savset
 
 static void
 _andr(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1, jit_int32_t r2)
diff --git a/lib/jit_x86.c b/lib/jit_x86.c
index 4ff1dab..bac6459 100644
--- a/lib/jit_x86.c
+++ b/lib/jit_x86.c
@@ -941,6 +941,11 @@ _emit_code(jit_state_t *_jit)
                name##r##type(rn(node->u.w),                            \
                              rn(node->v.w), rn(node->w.w));            \
                break
+#define case_rrrr(name, type)  /* dest (lo,hi) pair, two reg sources */ \
+           case jit_code_##name##r##type:                              \
+               name##r##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), rn(node->w.w));            \
+               break
 #define case_frr(name, type)                                           \
            case jit_code_##name##r##type:                              \
                if (jit_x87_reg_p(node->u.w))                           \
@@ -963,6 +968,11 @@ _emit_code(jit_state_t *_jit)
            case jit_code_##name##i##type:                              \
                name##i##type(rn(node->u.w), rn(node->v.w), node->w.w); \
                break
+#define case_rrrw(name, type)  /* dest (lo,hi) pair, reg + immediate */ \
+           case jit_code_##name##i##type:                              \
+               name##i##type(rn(node->u.q.l), rn(node->u.q.h),         \
+                             rn(node->v.w), node->w.w);                \
+               break
 #define case_frw(name, type)                                           \
            case jit_code_##name##i##type:                              \
                if (jit_x87_reg_p(node->u.w))                           \
@@ -1139,10 +1149,18 @@ _emit_code(jit_state_t *_jit)
                case_rrw(subc,);
                case_rrr(mul,);
                case_rrw(mul,);
+               case_rrrr(qmul,);
+               case_rrrw(qmul,);
+               case_rrrr(qmul, _u);
+               case_rrrw(qmul, _u);
                case_rrr(div,);
                case_rrw(div,);
                case_rrr(div, _u);
                case_rrw(div, _u);
+               case_rrrr(qdiv,);
+               case_rrrw(qdiv,);
+               case_rrrr(qdiv, _u);
+               case_rrrw(qdiv, _u);
                case_rrr(rem,);
                case_rrw(rem,);
                case_rrr(rem, _u);
diff --git a/lib/lightning.c b/lib/lightning.c
index e9de59d..c2ee090 100644
--- a/lib/lightning.c
+++ b/lib/lightning.c
@@ -635,6 +635,21 @@ _jit_new_node_www(jit_state_t *_jit, jit_code_t code,
 }
 
 jit_node_t *
+_jit_new_node_qww(jit_state_t *_jit, jit_code_t code,
+                 jit_int32_t l, jit_int32_t h,
+                 jit_word_t v, jit_word_t w)
+{   /* create and link a node whose first operand is the (l, h) register pair */
+    jit_node_t         *node = new_node(code);
+    assert(!_jit->emit);
+    assert(l != h);    /* the two result registers must be distinct */
+    node->u.q.l = l;   /* low-part destination */
+    node->u.q.h = h;   /* high-part destination */
+    node->v.w = v;
+    node->w.w = w;
+    return (link_node(node));
+}
+
+jit_node_t *
 _jit_new_node_wwf(jit_state_t *_jit, jit_code_t code,
                  jit_word_t u, jit_word_t v, jit_float32_t w)
 {
@@ -857,6 +872,11 @@ _jit_classify(jit_state_t *_jit, jit_code_t code)
        case jit_code_ldxi_l:   case jit_code_ldxi_f:   case jit_code_ldxi_d:
            mask = jit_cc_a0_reg|jit_cc_a0_chg|jit_cc_a1_reg|jit_cc_a2_int;
            break;
+       case jit_code_qmuli:    case jit_code_qmuli_u:
+       case jit_code_qdivi:    case jit_code_qdivi_u:
+           mask = jit_cc_a0_reg|jit_cc_a0_rlh|jit_cc_a0_chg|
+                  jit_cc_a1_reg|jit_cc_a2_int;
+           break;
        case jit_code_addi_f:   case jit_code_subi_f:   case jit_code_muli_f:
        case jit_code_divi_f:   case jit_code_lti_f:    case jit_code_lei_f:
        case jit_code_eqi_f:    case jit_code_gei_f:    case jit_code_gti_f:
@@ -900,6 +920,11 @@ _jit_classify(jit_state_t *_jit, jit_code_t code)
        case jit_code_movr_ww_d:
            mask = jit_cc_a0_reg|jit_cc_a0_chg|jit_cc_a1_reg|jit_cc_a2_reg;
            break;
+       case jit_code_qmulr:    case jit_code_qmulr_u:
+       case jit_code_qdivr:    case jit_code_qdivr_u:
+           mask = jit_cc_a0_reg|jit_cc_a0_rlh|jit_cc_a0_chg|
+                  jit_cc_a1_reg|jit_cc_a2_reg;
+           break;
        case jit_code_sti_c:    case jit_code_sti_s:    case jit_code_sti_i:
        case jit_code_sti_l:    case jit_code_sti_f:    case jit_code_sti_d:
            mask = jit_cc_a0_int|jit_cc_a1_reg;
@@ -1185,13 +1210,35 @@ _jit_reglive(jit_state_t *_jit, jit_node_t *node)
            break;
        default:
            value = jit_classify(node->code);
-           if ((value & jit_cc_a0_reg) && !(node->u.w & jit_regno_patch)) {
-               if (value & jit_cc_a0_chg) {
-                   jit_regset_clrbit(_jit->reglive, node->u.w);
-                   jit_regset_setbit(_jit->regmask, node->u.w);
+           if (value & jit_cc_a0_reg) {
+               if (value & jit_cc_a0_rlh) {
+                   if (!(node->u.q.l & jit_regno_patch)) {
+                       if (value & jit_cc_a0_chg) {
+                           jit_regset_clrbit(_jit->reglive, node->u.q.l);
+                           jit_regset_setbit(_jit->regmask, node->u.q.l);
+                       }
+                       else
+                           jit_regset_setbit(_jit->reglive, node->u.q.l);
+                   }
+                   if (!(node->u.q.h & jit_regno_patch)) {
+                       if (value & jit_cc_a0_chg) {
+                           jit_regset_clrbit(_jit->reglive, node->u.q.h);
+                           jit_regset_setbit(_jit->regmask, node->u.q.h);
+                       }
+                       else
+                           jit_regset_setbit(_jit->reglive, node->u.q.h);
+                   }
+               }
+               else {
+                   if (!(node->u.w & jit_regno_patch)) {
+                       if (value & jit_cc_a0_chg) {
+                           jit_regset_clrbit(_jit->reglive, node->u.w);
+                           jit_regset_setbit(_jit->regmask, node->u.w);
+                       }
+                       else
+                           jit_regset_setbit(_jit->reglive, node->u.w);
+                   }
                }
-               else
-                   jit_regset_setbit(_jit->reglive, node->u.w);
            }
            if ((value & jit_cc_a1_reg) && !(node->v.w & jit_regno_patch)) {
                if (value & jit_cc_a1_chg) {
@@ -1219,8 +1266,14 @@ _jit_reglive(jit_state_t *_jit, jit_node_t *node)
 void
 _jit_regarg_set(jit_state_t *_jit, jit_node_t *node, jit_int32_t value)
 {
-    if (value & jit_cc_a0_reg)
-       jit_regset_setbit(_jit->regarg, jit_regno(node->u.w));
+    if (value & jit_cc_a0_reg) {
+       if (value & jit_cc_a0_rlh) {
+           jit_regset_setbit(_jit->regarg, jit_regno(node->u.q.l));
+           jit_regset_setbit(_jit->regarg, jit_regno(node->u.q.h));
+       }
+       else
+           jit_regset_setbit(_jit->regarg, jit_regno(node->u.w));
+    }
     if (value & jit_cc_a1_reg)
        jit_regset_setbit(_jit->regarg, jit_regno(node->v.w));
     if (value & jit_cc_a2_reg)
@@ -1230,8 +1283,14 @@ _jit_regarg_set(jit_state_t *_jit, jit_node_t *node, jit_int32_t value)
 void
 _jit_regarg_clr(jit_state_t *_jit, jit_node_t *node, jit_int32_t value)
 {
-    if (value & jit_cc_a0_reg)
-       jit_regset_clrbit(_jit->regarg, jit_regno(node->u.w));
+    if (value & jit_cc_a0_reg) {
+       if (value & jit_cc_a0_rlh) {
+           jit_regset_clrbit(_jit->regarg, jit_regno(node->u.q.l));
+           jit_regset_clrbit(_jit->regarg, jit_regno(node->u.q.h));
+       }
+       else
+           jit_regset_clrbit(_jit->regarg, jit_regno(node->u.w));
+    }
     if (value & jit_cc_a1_reg)
        jit_regset_clrbit(_jit->regarg, jit_regno(node->v.w));
     if (value & jit_cc_a2_reg)
@@ -1325,14 +1384,33 @@ _jit_setup(jit_state_t *_jit, jit_block_t *block)
                return;
            default:
                value = jit_classify(node->code);
-               if ((value & jit_cc_a0_reg) &&
-                   !(node->u.w & jit_regno_patch) &&
-                   jit_regset_tstbit(regmask, node->u.w)) {
+               if (value & jit_cc_a0_reg) {
                    live = !(value & jit_cc_a0_chg);
-                   if (live || !jump)
-                       jit_regset_clrbit(regmask, node->u.w);
-                   if (live)
-                       jit_regset_setbit(reglive, node->u.w);
+                   if (value & jit_cc_a0_rlh) {
+                       if (!(node->u.q.l & jit_regno_patch) &&
+                           jit_regset_tstbit(regmask, node->u.q.l)) {
+                           if (live || !jump)
+                               jit_regset_clrbit(regmask, node->u.q.l);
+                           if (live)
+                               jit_regset_setbit(reglive, node->u.q.l);
+                       }
+                       if (!(node->u.q.h & jit_regno_patch) &&
+                           jit_regset_tstbit(regmask, node->u.q.h)) {
+                           if (live || !jump)
+                               jit_regset_clrbit(regmask, node->u.q.h);
+                           if (live)
+                               jit_regset_setbit(reglive, node->u.q.h);
+                       }
+                   }
+                   else {
+                       if (!(node->u.w & jit_regno_patch) &&
+                           jit_regset_tstbit(regmask, node->u.w)) {
+                           if (live || !jump)
+                               jit_regset_clrbit(regmask, node->u.w);
+                           if (live)
+                               jit_regset_setbit(reglive, node->u.w);
+                       }
+                   }
                }
                if ((value & jit_cc_a1_reg) &&
                    !(node->v.w & jit_regno_patch) &&
@@ -1453,11 +1531,29 @@ _jit_update(jit_state_t *_jit, jit_node_t *node,
                    }
                }
                if (value & jit_cc_a0_reg) {
-                   if (!(node->u.w & jit_regno_patch)) {
-                       if (jit_regset_tstbit(*mask, node->u.w)) {
-                           jit_regset_clrbit(*mask, node->u.w);
-                           if (!(value & jit_cc_a0_chg))
-                               jit_regset_setbit(*live, node->u.w);
+                   if (value & jit_cc_a0_rlh) {
+                       if (!(node->u.q.l & jit_regno_patch)) {
+                           if (jit_regset_tstbit(*mask, node->u.q.l)) {
+                               jit_regset_clrbit(*mask, node->u.q.l);
+                               if (!(value & jit_cc_a0_chg))
+                                   jit_regset_setbit(*live, node->u.q.l);
+                           }
+                       }
+                       if (!(node->u.q.h & jit_regno_patch)) {
+                           if (jit_regset_tstbit(*mask, node->u.q.h)) {
+                               jit_regset_clrbit(*mask, node->u.q.h);
+                               if (!(value & jit_cc_a0_chg))
+                                   jit_regset_setbit(*live, node->u.q.h);
+                           }
+                       }
+                   }
+                   else {
+                       if (!(node->u.w & jit_regno_patch)) {
+                           if (jit_regset_tstbit(*mask, node->u.w)) {
+                               jit_regset_clrbit(*mask, node->u.w);
+                               if (!(value & jit_cc_a0_chg))
+                                   jit_regset_setbit(*live, node->u.w);
+                           }
                        }
                    }
                }
@@ -1887,14 +1983,25 @@ _redundant_store(jit_state_t *_jit, jit_node_t *node, jit_bool_t jump)
                break;
            default:
                spec = jit_classify(iter->code);
-               if ((spec & jit_cc_a0_jmp) ||
-                   (((spec & (jit_cc_a0_reg|jit_cc_a0_chg)) ==
-                     (jit_cc_a0_reg|jit_cc_a0_chg)) &&
-                    regno == jit_regno(iter->u.w)) ||
-                   (((spec & (jit_cc_a1_reg|jit_cc_a1_chg)) ==
-                     (jit_cc_a1_reg|jit_cc_a1_chg)) &&
-                    regno == jit_regno(iter->v.w)))
+               if (spec & jit_cc_a0_jmp)
                    return;
+               if ((spec & (jit_cc_a0_reg|jit_cc_a0_chg)) ==
+                   (jit_cc_a0_reg|jit_cc_a0_chg)) {
+                   if (spec & jit_cc_a0_rlh) {
+                       if (regno == jit_regno(iter->u.q.l) ||
+                           regno == jit_regno(iter->u.q.h))
+                           return;
+                   }
+                   else {
+                       if (regno == jit_regno(iter->u.w))
+                           return;
+                   }
+               }
+               if ((spec & (jit_cc_a1_reg|jit_cc_a1_chg)) ==
+                   (jit_cc_a1_reg|jit_cc_a1_chg)) {
+                   if (regno == jit_regno(iter->v.w))
+                       return;
+               }
                break;
        }
     }
@@ -1912,8 +2019,8 @@ _simplify_movr(jit_state_t *_jit, jit_node_t *prev, jit_node_t *node,
     right = jit_regno(node->v.w);
     value = _jit->values + regno;
     if ((value->kind == jit_kind_register &&
-        jit_regno(value->base.pair.l) == right &&
-        value->base.pair.h == _jit->gen[right]) ||
+        jit_regno(value->base.q.l) == right &&
+        value->base.q.h == _jit->gen[right]) ||
        (value->kind == kind && _jit->values[right].kind == kind &&
         memcmp(&value->base.w, &_jit->values[right].base.w, size) == 0)) {
        del_node(prev, node);
@@ -1923,8 +2030,8 @@ _simplify_movr(jit_state_t *_jit, jit_node_t *prev, jit_node_t *node,
        memcpy(value, _jit->values + right, sizeof(jit_value_t));
     else {
        value->kind = jit_kind_register;
-       value->base.pair.l = right;
-       value->base.pair.h = _jit->gen[regno];
+       value->base.q.l = right;
+       value->base.q.h = _jit->gen[regno];
     }
     ++_jit->gen[regno];
 
@@ -1996,15 +2103,15 @@ _simplify_ldxi(jit_state_t *_jit, jit_node_t *prev, jit_node_t *node)
     right = jit_regno(node->v.w);
     value = _jit->values + regno;
     if (value->kind == jit_kind_code && value->code == node->code &&
-       value->base.pair.l == right && value->base.pair.h == _jit->gen[right] &&
+       value->base.q.l == right && value->base.q.h == _jit->gen[right] &&
        node->w.w == value->disp.w) {
        del_node(prev, node);
        return (1);
     }
     value->kind = jit_kind_code;
     value->code = node->code;
-    value->base.pair.l = right;
-    value->base.pair.h = _jit->gen[right];
+    value->base.q.l = right;
+    value->base.q.h = _jit->gen[right];
     value->disp.w = node->w.w;
     ++_jit->gen[regno];
 
@@ -2025,7 +2132,7 @@ _simplify_stxi(jit_state_t *_jit, jit_node_t *prev, jit_node_t *node)
 
     /* check for redundant store after load */
     if (value->kind == jit_kind_code && value->code == node->code &&
-       value->base.pair.l == right && value->base.pair.h == _jit->gen[right] &&
+       value->base.q.l == right && value->base.q.h == _jit->gen[right] &&
        node->w.w == value->disp.w) {
        del_node(prev, node);
        return (1);
@@ -2055,8 +2162,8 @@ _simplify_stxi(jit_state_t *_jit, jit_node_t *prev, jit_node_t *node)
        }
        value->kind = jit_kind_code;
        value->code = node->code;
-       value->base.pair.l = right;
-       value->base.pair.h = _jit->gen[right];
+       value->base.q.l = right;
+       value->base.q.h = _jit->gen[right];
        value->disp.w = node->u.w;
     }
 
@@ -2188,9 +2295,19 @@ _simplify(jit_state_t *_jit)
                     * a conditional branch */
                    goto reset;
                if (info & jit_cc_a0_chg) {
-                   regno = jit_regno(node->u.w);
-                   _jit->values[regno].kind = 0;
-                   ++_jit->gen[regno];
+                   if (info & jit_cc_a0_rlh) {
+                       regno = jit_regno(node->u.q.l);
+                       _jit->values[regno].kind = 0;
+                       ++_jit->gen[regno];
+                       regno = jit_regno(node->u.q.h);
+                       _jit->values[regno].kind = 0;
+                       ++_jit->gen[regno];
+                   }
+                   else {
+                       regno = jit_regno(node->u.w);
+                       _jit->values[regno].kind = 0;
+                       ++_jit->gen[regno];
+                   }
                }
                if (info & jit_cc_a1_chg) {
                    regno = jit_regno(node->v.w);
@@ -2222,8 +2339,12 @@ _register_change_p(jit_state_t *_jit, jit_node_t *node, jit_node_t *link,
                /* lack of extra information */
                if (value & jit_cc_a0_jmp)
                    return (jit_reg_change);
-               else if ((value & jit_cc_a0_reg) && node->u.w == regno &&
-                        (value & jit_cc_a0_chg))
+               else if ((value & (jit_cc_a0_reg|jit_cc_a0_chg)) ==
+                        (jit_cc_a0_reg|jit_cc_a0_chg) &&
+                        (((value & jit_cc_a0_rlh) &&
+                          (node->u.q.l == regno || node->u.q.h == regno)) ||
+                         (!(value & jit_cc_a0_rlh) &&
+                          node->u.w == regno)))
                    return (jit_reg_change);
                else if ((value & jit_cc_a1_reg) && node->v.w == regno &&
                         (value & jit_cc_a1_chg))
@@ -2380,8 +2501,18 @@ _patch_register(jit_state_t *_jit, jit_node_t *node, jit_node_t *link,
 
     for (; node != link; node = node->next) {
        value = jit_classify(node->code);
-       if ((value & jit_cc_a0_reg) && node->u.w == regno)
-           node->u.w = patch;
+       if (value & jit_cc_a0_reg) {
+           if (value & jit_cc_a0_rlh) {
+               if (node->u.q.l == regno)
+                   node->u.q.l = patch;
+               if (node->u.q.h == regno)
+                   node->u.q.h = patch;
+           }
+           else {
+               if (node->u.w == regno)
+                   node->u.w = patch;
+           }
+       }
        if ((value & jit_cc_a1_reg) && node->v.w == regno)
            node->v.w = patch;
        if ((value & jit_cc_a2_reg) && node->w.w == regno)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]