qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 4/8] target/riscv: 128-bit arithmetic and logic instructions


From: Richard Henderson
Subject: Re: [PATCH 4/8] target/riscv: 128-bit arithmetic and logic instructions
Date: Mon, 30 Aug 2021 20:30:28 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.11.0

On 8/30/21 10:16 AM, Frédéric Pétrot wrote:
Adding the support for the 128-bit arithmetic and logic instructions.
Remember that all (i) instructions are now acting on 128-bit registers, that
a few others are added to cope with values that are held on 64 bits within
the 128-bit registers, and that the ones that cope with values on 32-bit
must also be modified for proper sign extension.
Most algorithms taken from Hackers' delight.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
---
  target/riscv/insn32.decode              |  13 +
  target/riscv/insn_trans/trans_rvi.c.inc | 955 +++++++++++++++++++++++-
  target/riscv/translate.c                |  25 +
  3 files changed, 976 insertions(+), 17 deletions(-)

diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 225669e277..2fe7e1dd36 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -22,6 +22,7 @@
  %rs1       15:5
  %rd        7:5
  %sh5       20:5
+%sh6       20:6
%sh7 20:7
  %csr    20:12
@@ -91,6 +92,9 @@
  # Formats 64:
  @sh5     .......  ..... .....  ... ..... ....... &shift  shamt=%sh5      %rs1 
%rd
+# Formats 128:
+@sh6       ...... ...... ..... ... ..... ....... &shift shamt=%sh6 %rs1 %rd
+
  # *** Privileged Instructions ***
  ecall       000000000000     00000 000 00000 1110011
  ebreak      000000000001     00000 000 00000 1110011
@@ -166,6 +170,15 @@ sraw     0100000 .....  ..... 101 ..... 0111011 @r
  ldu      ............   ..... 111 ..... 0000011 @i
  lq       ............   ..... 010 ..... 0001111 @i
  sq       ............   ..... 100 ..... 0100011 @s
+addid    ............  .....  000 ..... 1011011 @i
+sllid    000000 ......  ..... 001 ..... 1011011 @sh6
+srlid    000000 ......  ..... 101 ..... 1011011 @sh6
+sraid    010000 ......  ..... 101 ..... 1011011 @sh6
+addd     0000000 ..... .....  000 ..... 1111011 @r
+subd     0100000 ..... .....  000 ..... 1111011 @r
+slld     0000000 ..... .....  001 ..... 1111011 @r
+srld     0000000 ..... .....  101 ..... 1111011 @r
+srad     0100000 ..... .....  101 ..... 1111011 @r
# *** RV32M Standard Extension ***
  mul      0000001 .....  ..... 000 ..... 0110011 @r
diff --git a/target/riscv/insn_trans/trans_rvi.c.inc 
b/target/riscv/insn_trans/trans_rvi.c.inc
index 772330a766..0401ba3d69 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -26,14 +26,20 @@ static bool trans_illegal(DisasContext *ctx, arg_empty *a)
static bool trans_c64_illegal(DisasContext *ctx, arg_empty *a)
  {
-     REQUIRE_64BIT(ctx);
-     return trans_illegal(ctx, a);
+    REQUIRE_64_OR_128BIT(ctx);
+    return trans_illegal(ctx, a);
  }
static bool trans_lui(DisasContext *ctx, arg_lui *a)
  {
      if (a->rd != 0) {
          tcg_gen_movi_tl(cpu_gpr[a->rd], a->imm);
+#if defined(TARGET_RISCV128)
+        if (is_128bit(ctx)) {
+            tcg_gen_ext_i64_i128(cpu_gpr[a->rd], cpu_gprh[a->rd],
+                                 cpu_gpr[a->rd]);
+        }
+#endif
      }
      return true;
  }

I think it is a mistake to introduce all of these ifdefs.

If 128-bit is not enabled, then is_128bit should evaluate to false, and all should be well. As for cpu_gprh[], that should be hidden behind some function/macro so that if 128-bit is not enabled we get qemu_build_not_reached().

Finally, you can compute this immediate value directly. Don't leave it to the optimizer to provide the extension.

  static bool trans_auipc(DisasContext *ctx, arg_auipc *a)
  {
      if (a->rd != 0) {
+#if defined(TARGET_RISCV128)
+        if (is_128bit(ctx)) {
+            /* TODO : when pc is 128 bits, use all its bits */
+            TCGv pc = tcg_const_tl(ctx->base.pc_next),
+                 imm = tcg_const_tl(a->imm),
+                 immh = tcg_const_tl((a->imm & 0x80000)
+                         ? 0xffffffffffffffff : 0),

No need to test bits here: -(a->imm < 0) will do fine.

+                 cnst_zero = tcg_const_tl(0);
+            tcg_gen_add2_tl(cpu_gpr[a->rd], cpu_gprh[a->rd], pc, cnst_zero,
+                            imm, immh);
+            tcg_temp_free(pc);
+            tcg_temp_free(imm);
+            tcg_temp_free(immh);
+            tcg_temp_free(cnst_zero);
+            return true;

tcg_constant_tl, not tcg_const_tl and no tcg_temp_free.

+    case TCG_COND_LT:
+    {
+        TCGv tmp1 = tcg_temp_new(),
+             tmp2 = tcg_temp_new();
+
+        tcg_gen_xor_tl(tmp1, rh, ah);
+        tcg_gen_xor_tl(tmp2, ah, bh);
+        tcg_gen_and_tl(tmp1, tmp1, tmp2);
+        tcg_gen_xor_tl(tmp1, rh, tmp1);
+        tcg_gen_setcondi_tl(TCG_COND_LT, rl, tmp1, 0); /* Check sign bit */
+
+        tcg_temp_free(tmp1);
+        tcg_temp_free(tmp2);
+        break;
+    }

Incorrect, as you're not examining the low parts at all.


+
+    case TCG_COND_GE:
+        /* We invert the result of TCG_COND_LT */
+        gen_setcond_128(rl, rh, al, ah, bl, bh, TCG_COND_LT);
+        tcg_gen_setcondi_tl(TCG_COND_EQ, rl, rl, 0);

Inversion of a boolean is better as xor with 1.

+    case TCG_COND_LTU:
+    {
+        TCGv tmp1 = tcg_temp_new(),
+             tmp2 = tcg_temp_new();
+
+        tcg_gen_eqv_tl(tmp1, ah, bh);
+        tcg_gen_and_tl(tmp1, tmp1, rh);
+        tcg_gen_not_tl(tmp2, ah);
+        tcg_gen_and_tl(tmp2, tmp2, bh);
+        tcg_gen_or_tl(tmp1, tmp1, tmp2);
+
+        tcg_gen_setcondi_tl(TCG_COND_LT, rl, tmp1, 0); /* Check sign bit */
+
+        tcg_temp_free(tmp1);
+        tcg_temp_free(tmp2);
+        break;
+    }

Again, missing comparison of low parts.

@@ -93,7 +205,28 @@ static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond 
cond)
      gen_get_gpr(source1, a->rs1);
      gen_get_gpr(source2, a->rs2);
+#if defined(TARGET_RISCV128)
+    if (is_128bit(ctx)) {
+        TCGv source1h, source2h, tmph, tmpl;
+        source1h = tcg_temp_new();
+        source2h = tcg_temp_new();
+        tmph = tcg_temp_new();
+        tmpl = tcg_temp_new();
+        gen_get_gprh(source1h, a->rs1);
+        gen_get_gprh(source2h, a->rs2);
+
+        gen_setcond_128(tmpl, tmph, source1, source1h, source2, source2h, 
cond);
+        tcg_gen_brcondi_tl(TCG_COND_NE, tmpl, 0, l);
+        tcg_temp_free(source1h);
+        tcg_temp_free(source2h);
+        tcg_temp_free(tmph);
+        tcg_temp_free(tmpl);

setcond feeding brcond results in too many comparisons, usually.

In this instance it may be just as easy to generate multiple branches, in general. But EQ/NE should not be

        setcond t1, al, bl, eq
        setcond t2, ah, bh, eq
        and     t3, t1, t1
        brcond  t3, 0, ne

but

        xor     t1, al, bl
        xor     t2, ah, bh
        or      t3, t1, t2
        brcond  t3, 0, eq

  static bool trans_srli(DisasContext *ctx, arg_srli *a)
  {
+#if defined(TARGET_RISCV128)
+    if (is_128bit(ctx)) {
+        if (a->shamt >= 128) {
+            return false;
+        }
+
+        if (a->rd != 0 && a->shamt != 0) {
+            TCGv rs = tcg_temp_new(),
+                 rsh = tcg_temp_new(),
+                 res = tcg_temp_new(),
+                 resh = tcg_temp_new(),
+                 tmp = tcg_temp_new();
+            gen_get_gpr(rs, a->rs1);
+            gen_get_gprh(rsh, a->rs1);
+
+            /*
+             * Computation of double-length right logical shift,
+             * adapted for immediates from section 2.17 of Hacker's Delight
+             */
+            if (a->shamt >= 64) {
+                tcg_gen_movi_tl(res, 0);
+            } else {
+                tcg_gen_shri_tl(res, rs, a->shamt);
+            }
+            if (64 - a->shamt < 0) {
+                tcg_gen_movi_tl(tmp, 0);
+            } else {
+                tcg_gen_shli_tl(tmp, rsh, 64 - a->shamt);
+            }
+            tcg_gen_or_tl(res, res, tmp);
+            if (a->shamt - 64 < 0) {
+                tcg_gen_movi_tl(tmp, 0);
+            } else {
+                tcg_gen_shri_tl(tmp, rsh, a->shamt - 64);
+            }
+            tcg_gen_or_tl(res, res, tmp);
+
+            if (a->shamt >= 64) {
+                tcg_gen_movi_tl(resh, 0);
+            } else {
+                tcg_gen_shri_tl(resh, rsh, a->shamt);
+            }

We have tcg_gen_extract2_i64 for the purpose of double-word immediate shifts. You should be doing

    if (shamt >= 64) {
        tcg_gen_shri_tl(resl, srch, shamt - 64);
        tcg_gen_movi_tl(resh, 0);
    } else {
        tcg_gen_extract2_tl(resl, srcl, srch, shamt);
        tcg_gen_shri_tl(resh, srch, shamt);
    }

  static bool trans_srai(DisasContext *ctx, arg_srai *a)

    if (shamt >= 64) {
        tcg_gen_sari_tl(resl, srch, shamt - 64);
        tcg_gen_sari_tl(resh, srch, 63);
    } else {
        tcg_gen_extract2_tl(resl, srcl, srch, shamt);
        tcg_gen_sari_tl(resh, srch, shamt);
    }

 static bool trans_slli(DisasContext *ctx, arg_slli *a)

    if (shamt >= 64) {
        tcg_gen_shli_tl(resh, srcl, shamt - 64);
        tcg_gen_movi_tl(resl, 0);
    } else {
        tcg_gen_extract2_tl(resh, srcl, srch, 64 - shamt);
        tcg_gen_shli_tl(resl, srcl, shamt);
    }

C.f. tcg/tcg-op.c, tcg_gen_shifti_i64, which is doing the same thing for i32.

+#if defined(TARGET_RISCV128)
+enum M128_DIR { M128_LEFT, M128_RIGHT, M128_RIGHT_ARITH };
+static void gen_shift_mod128(TCGv ret, TCGv arg1, TCGv arg2, enum M128_DIR dir)
+{
+    TCGv tmp1 = tcg_temp_new(),
+         tmp2 = tcg_temp_new(),
+         cnst_zero = tcg_const_tl(0),
+         sgn = tcg_temp_new();
+
+    tcg_gen_setcondi_tl(TCG_COND_GE, tmp1, arg2, 64);
+    tcg_gen_setcondi_tl(TCG_COND_LT, tmp2, arg2, 0);
+    tcg_gen_or_tl(tmp1, tmp1, tmp2);

What in the world are you doing with signed comparisons?

+    tcg_gen_andi_tl(tmp2, arg2, 0x3f);

You should have one test with 0x3f and one with 0x40.



r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]