qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PULL 11/26] target/riscv: Add orc.b instruction for Zbb, removing g


From: Vincent Palatin
Subject: Re: [PULL 11/26] target/riscv: Add orc.b instruction for Zbb, removing gorc/gorci
Date: Wed, 13 Oct 2021 15:44:15 +0200

On Wed, Oct 13, 2021 at 3:13 PM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> I had a much simpler version initially (using 3 x mask/shift/or, for
> 12 instructions after setup of constants), but took up the suggestion
> to optimize based on haszero(v)...
> Indeed this appears to not do what we expect, when there's only 0x01
> set in a byte.
>
> The less optimized form, with a single constant, that would still do
> what we want is:
>    /* set high-bit for non-zero bytes */
>    constant = dup_const_tl(MO_8, 0x7f);
>    tmp = v & constant;   // AND
>    tmp += constant;       // ADD
>    tmp |= v;                    // OR
>    /* extract high-bit to low-bit, for each word */
>    tmp &= ~constant;     // ANDC
>    tmp >>= 7;                 // SHR
>    /* multiply with 0xff to populate entire byte where the low-bit is set */
>    tmp *= 0xff;                // MUL
>
> I'll submit a patch with this one later today, once I had a chance to
> pass this through a full test.


Thanks for the insight.

I have tried it, implemented as:
```
static void gen_orc_b(TCGv ret, TCGv source1)
{
    TCGv  tmp = tcg_temp_new();
    TCGv  constant = tcg_constant_tl(dup_const_tl(MO_8, 0x7f));

    /* set high-bit for non-zero bytes */
    tcg_gen_and_tl(tmp, source1, constant);
    tcg_gen_add_tl(tmp, tmp, constant);
    tcg_gen_or_tl(tmp, tmp, source1);
    /* extract high-bit to low-bit, for each word */
    tcg_gen_andc_tl(tmp, tmp, constant);
    tcg_gen_shri_tl(tmp, tmp, 7);

    /* Replicate the lsb of each byte across the byte. */
    tcg_gen_muli_tl(ret, tmp, 0xff);

    tcg_temp_free(tmp);
}
```

It does pass my own test sequences.


>
> On Wed, 13 Oct 2021 at 11:36, Vincent Palatin <vpalatin@rivosinc.com> wrote:
> >
> > On Thu, Oct 7, 2021 at 8:58 AM Alistair Francis
> > <alistair.francis@opensource.wdc.com> wrote:
> > >
> > > From: Philipp Tomsich <philipp.tomsich@vrull.eu>
> > >
> > > The 1.0.0 version of Zbb does not contain gorc/gorci.  Instead, a
> > > orc.b instruction (equivalent to the orc.b pseudo-instruction built on
> > > gorci from pre-0.93 draft-B) is available, mainly targeting
> > > string-processing workloads.
> > >
> > > This commit adds the new orc.b instruction and removed gorc/gorci.
> > >
> > > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> > > Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> > > Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
> > > Message-id: 20210911140016.834071-12-philipp.tomsich@vrull.eu
> > > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> > > ---
> > >  target/riscv/helper.h                   |  2 --
> > >  target/riscv/insn32.decode              |  6 +---
> > >  target/riscv/bitmanip_helper.c          | 26 -----------------
> > >  target/riscv/insn_trans/trans_rvb.c.inc | 39 +++++++++++--------------
> > >  4 files changed, 18 insertions(+), 55 deletions(-)
> > >
> > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h
> > > index 8a318a2dbc..a9bda2c8ac 100644
> > > --- a/target/riscv/helper.h
> > > +++ b/target/riscv/helper.h
> > > @@ -61,8 +61,6 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, 
> > > i64)
> > >  /* Bitmanip */
> > >  DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > >  DEF_HELPER_FLAGS_2(grevw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > -DEF_HELPER_FLAGS_2(gorc, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > > -DEF_HELPER_FLAGS_2(gorcw, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > >  DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > >  DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
> > >
> > > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
> > > index a509cfee11..59202196dc 100644
> > > --- a/target/riscv/insn32.decode
> > > +++ b/target/riscv/insn32.decode
> > > @@ -681,6 +681,7 @@ max        0000101 .......... 110 ..... 0110011 @r
> > >  maxu       0000101 .......... 111 ..... 0110011 @r
> > >  min        0000101 .......... 100 ..... 0110011 @r
> > >  minu       0000101 .......... 101 ..... 0110011 @r
> > > +orc_b      001010 000111 ..... 101 ..... 0010011 @r2
> > >  orn        0100000 .......... 110 ..... 0110011 @r
> > >  rol        0110000 .......... 001 ..... 0110011 @r
> > >  ror        0110000 .......... 101 ..... 0110011 @r
> > > @@ -702,19 +703,14 @@ pack       0000100 .......... 100 ..... 0110011 @r
> > >  packu      0100100 .......... 100 ..... 0110011 @r
> > >  packh      0000100 .......... 111 ..... 0110011 @r
> > >  grev       0110100 .......... 101 ..... 0110011 @r
> > > -gorc       0010100 .......... 101 ..... 0110011 @r
> > > -
> > >  grevi      01101. ........... 101 ..... 0010011 @sh
> > > -gorci      00101. ........... 101 ..... 0010011 @sh
> > >
> > >  # *** RV64B Standard Extension (in addition to RV32B) ***
> > >  packw      0000100 .......... 100 ..... 0111011 @r
> > >  packuw     0100100 .......... 100 ..... 0111011 @r
> > >  grevw      0110100 .......... 101 ..... 0111011 @r
> > > -gorcw      0010100 .......... 101 ..... 0111011 @r
> > >
> > >  greviw     0110100 .......... 101 ..... 0011011 @sh5
> > > -gorciw     0010100 .......... 101 ..... 0011011 @sh5
> > >
> > >  # *** RV32 Zbc Standard Extension ***
> > >  clmul      0000101 .......... 001 ..... 0110011 @r
> > > diff --git a/target/riscv/bitmanip_helper.c 
> > > b/target/riscv/bitmanip_helper.c
> > > index 73be5a81c7..bb48388fcd 100644
> > > --- a/target/riscv/bitmanip_helper.c
> > > +++ b/target/riscv/bitmanip_helper.c
> > > @@ -64,32 +64,6 @@ target_ulong HELPER(grevw)(target_ulong rs1, 
> > > target_ulong rs2)
> > >      return do_grev(rs1, rs2, 32);
> > >  }
> > >
> > > -static target_ulong do_gorc(target_ulong rs1,
> > > -                            target_ulong rs2,
> > > -                            int bits)
> > > -{
> > > -    target_ulong x = rs1;
> > > -    int i, shift;
> > > -
> > > -    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
> > > -        if (rs2 & shift) {
> > > -            x |= do_swap(x, adjacent_masks[i], shift);
> > > -        }
> > > -    }
> > > -
> > > -    return x;
> > > -}
> > > -
> > > -target_ulong HELPER(gorc)(target_ulong rs1, target_ulong rs2)
> > > -{
> > > -    return do_gorc(rs1, rs2, TARGET_LONG_BITS);
> > > -}
> > > -
> > > -target_ulong HELPER(gorcw)(target_ulong rs1, target_ulong rs2)
> > > -{
> > > -    return do_gorc(rs1, rs2, 32);
> > > -}
> > > -
> > >  target_ulong HELPER(clmul)(target_ulong rs1, target_ulong rs2)
> > >  {
> > >      target_ulong result = 0;
> > > diff --git a/target/riscv/insn_trans/trans_rvb.c.inc 
> > > b/target/riscv/insn_trans/trans_rvb.c.inc
> > > index bdfb495f24..d32af5915a 100644
> > > --- a/target/riscv/insn_trans/trans_rvb.c.inc
> > > +++ b/target/riscv/insn_trans/trans_rvb.c.inc
> > > @@ -295,16 +295,27 @@ static bool trans_grevi(DisasContext *ctx, 
> > > arg_grevi *a)
> > >      return gen_shift_imm_fn(ctx, a, EXT_NONE, gen_grevi);
> > >  }
> > >
> > > -static bool trans_gorc(DisasContext *ctx, arg_gorc *a)
> > > +static void gen_orc_b(TCGv ret, TCGv source1)
> > >  {
> > > -    REQUIRE_EXT(ctx, RVB);
> > > -    return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > +    TCGv  tmp = tcg_temp_new();
> > > +    TCGv  ones = tcg_constant_tl(dup_const_tl(MO_8, 0x01));
> > > +
> > > +    /* Set lsb in each byte if the byte was zero. */
> > > +    tcg_gen_sub_tl(tmp, source1, ones);
> > > +    tcg_gen_andc_tl(tmp, tmp, source1);
> > > +    tcg_gen_shri_tl(tmp, tmp, 7);
> > > +    tcg_gen_andc_tl(tmp, ones, tmp);
> > > +
> > > +    /* Replicate the lsb of each byte across the byte. */
> > > +    tcg_gen_muli_tl(ret, tmp, 0xff);
> > > +
> > > +    tcg_temp_free(tmp);
> > >  }
> >
> > It seems there is a bug in the current orc.b implementation,
> > the following 7 hexadecimal patterns return one wrong byte (0x00
> > instead of 0xff):
> > orc.b(0x............01..) = 0x............00.. (instead of 
> > 0x............ff..)
> > orc.b(0x..........01....) = 0x..........00.... (instead of 
> > 0x..........ff....)
> > orc.b(0x........01......) = 0x........00...... (instead of 
> > 0x........ff......)
> > orc.b(0x......01........) = 0x......00........ (instead of 
> > 0x......ff........)
> > orc.b(0x....01..........) = 0x....00.......... (instead of 
> > 0x....ff..........)
> > orc.b(0x..01............) = 0x..00............ (instead of 
> > 0x..ff............)
> > orc.b(0x01..............) = 0x00.............. (instead of 
> > 0xff..............)
> > (see test cases below)
> >
> > The issue seems to be related to the propagation of the carry.
> > I had a hard time fixing it. With some help, I have added a prolog
> > which basically computes:
> > (source1 | ((source1 << 1) & ~ones)) in order to avoid the carry.
> > I will send the patch as a follow-up in this thread as 'Patch 1A'.
> >
> > But it's notably less optimized than the current code,  so feel free
> > to come up with better options.
> > Actually my initial stab at fixing it was implementing a more
> > straightforward but less astute 'divide and conquer' method
> > where bits are or'ed by pairs, then the pairs are or'ed by pair ...
> > using the following formula:
> > tmp = source1 | (source1 >> 1)
> > tmp = tmp | (tmp >> 2)
> > tmp = tmp | (tmp >> 4)
> > ret = tmp & 0x0101010101010101
> > ret = tmp * 0xff
> > as it's notably less optimized than the current code when converted in
> > tcg_gen_ primitives but de par with the fixed version.
> > I'm also sending in this thread as 'Patch 1B' as I feel it's slightly
> > easier to grasp.
> >
> >
> > Test cases run on current implementation:
> > orc.b(0x0000000000000000) = 0x0000000000000000   OK (expect 
> > 0x0000000000000000)
> > orc.b(0x0000000000000001) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000002) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000004) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000008) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000010) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000020) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000040) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000080) = 0x00000000000000ff   OK (expect 
> > 0x00000000000000ff)
> > orc.b(0x0000000000000100) = 0x0000000000000000 FAIL (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000000200) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000000400) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000000800) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000001000) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000002000) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000004000) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000008000) = 0x000000000000ff00   OK (expect 
> > 0x000000000000ff00)
> > orc.b(0x0000000000010000) = 0x0000000000000000 FAIL (expect 
> > 0x0000000000ff0000)
> > orc.b(0x0000000000020000) = 0x0000000000ff0000   OK (expect 
> > 0x0000000000ff0000)
> > orc.b(0x0000000001000000) = 0x0000000000000000 FAIL (expect 
> > 0x00000000ff000000)
> > orc.b(0x0000000002000000) = 0x00000000ff000000   OK (expect 
> > 0x00000000ff000000)
> > orc.b(0x0000000100000000) = 0x0000000000000000 FAIL (expect 
> > 0x000000ff00000000)
> > orc.b(0x0000000200000000) = 0x000000ff00000000   OK (expect 
> > 0x000000ff00000000)
> > orc.b(0x0000010000000000) = 0x0000000000000000 FAIL (expect 
> > 0x0000ff0000000000)
> > orc.b(0x0000020000000000) = 0x0000ff0000000000   OK (expect 
> > 0x0000ff0000000000)
> > orc.b(0x0001000000000000) = 0x0000000000000000 FAIL (expect 
> > 0x00ff000000000000)
> > orc.b(0x0002000000000000) = 0x00ff000000000000   OK (expect 
> > 0x00ff000000000000)
> > orc.b(0x0100000000000000) = 0x0000000000000000 FAIL (expect 
> > 0xff00000000000000)
> > orc.b(0x0200000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x0400000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x0800000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x1000000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x2000000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x4000000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0x8000000000000000) = 0xff00000000000000   OK (expect 
> > 0xff00000000000000)
> > orc.b(0xffffffffffffffff) = 0xffffffffffffffff   OK (expect 
> > 0xffffffffffffffff)
> > orc.b(0x00ff00ff00ff00ff) = 0x00ff00ff00ff00ff   OK (expect 
> > 0x00ff00ff00ff00ff)
> > orc.b(0xff00ff00ff00ff00) = 0xff00ff00ff00ff00   OK (expect 
> > 0xff00ff00ff00ff00)
> > orc.b(0x0001000100010001) = 0x00000000000000ff FAIL (expect 
> > 0x00ff00ff00ff00ff)
> > orc.b(0x0100010001000100) = 0x0000000000000000 FAIL (expect 
> > 0xff00ff00ff00ff00)
> > orc.b(0x8040201008040201) = 0xffffffffffffffff   OK (expect 
> > 0xffffffffffffffff)
> > orc.b(0x0804020180402010) = 0xffffffffffffffff   OK (expect 
> > 0xffffffffffffffff)
> > orc.b(0x010055aa00401100) = 0x0000ffff00ffff00 FAIL (expect 
> > 0xff00ffff00ffff00)
> >
> >
> > >
> > > -static bool trans_gorci(DisasContext *ctx, arg_gorci *a)
> > > +static bool trans_orc_b(DisasContext *ctx, arg_orc_b *a)
> > >  {
> > > -    REQUIRE_EXT(ctx, RVB);
> > > -    return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > +    REQUIRE_ZBB(ctx);
> > > +    return gen_unary(ctx, a, EXT_ZERO, gen_orc_b);
> > >  }
> > >
> > >  #define GEN_SHADD(SHAMT)                                       \
> > > @@ -476,22 +487,6 @@ static bool trans_greviw(DisasContext *ctx, 
> > > arg_greviw *a)
> > >      return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_grev);
> > >  }
> > >
> > > -static bool trans_gorcw(DisasContext *ctx, arg_gorcw *a)
> > > -{
> > > -    REQUIRE_64BIT(ctx);
> > > -    REQUIRE_EXT(ctx, RVB);
> > > -    ctx->w = true;
> > > -    return gen_shift(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > -}
> > > -
> > > -static bool trans_gorciw(DisasContext *ctx, arg_gorciw *a)
> > > -{
> > > -    REQUIRE_64BIT(ctx);
> > > -    REQUIRE_EXT(ctx, RVB);
> > > -    ctx->w = true;
> > > -    return gen_shift_imm_tl(ctx, a, EXT_ZERO, gen_helper_gorc);
> > > -}
> > > -
> > >  #define GEN_SHADD_UW(SHAMT)                                       \
> > >  static void gen_sh##SHAMT##add_uw(TCGv ret, TCGv arg1, TCGv arg2) \
> > >  {                                                                 \
> > > --
> > > 2.31.1
> > >
> > >



reply via email to

[Prev in Thread] Current Thread [Next in Thread]