Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and

qemu-riscv

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and

From:	LIU Zhiwei
Subject:	Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Date:	Mon, 16 Mar 2020 07:23:28 +0800
User-agent:	Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Thunderbird/68.5.0

On 2020/3/15 9:00, Richard Henderson wrote:

On 3/14/20 4:12 PM, LIU Zhiwei wrote:

I am not sure whether I get it. In my opinion, the code should be modified like

static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = res & 0x1;
    res   = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x3);
    res   = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    res   = (res >> 1);
    return res;
}

static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x1);
   res   = (res >> 1) + round;
    return res;
}

RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)

void do_vext_vv_env(void *vd, void *v0, void *vs1,
                    void *vs2, CPURISCVState *env, uint32_t desc,
                    uint32_t esz, uint32_t dsz,
                    opivv2_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t i;
    for (i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, env);
    }
    if (i != 0) {
        clear_fn(vd, vl, vl * dsz,  vlmax * dsz);
    }
}

#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    static opivv2_fn *fns[4] = {                          \
        NAME##_rnu, NAME##_rne,                           \
        NAME##_rdn, NAME##_rod                            \
    }                                                     \
    return do_vext_vv_env(vd, v0, vs1, vs2, env, desc,    \
                          ESZ, DSZ, fns[env->vxrm],       \
              CLEAR_FN);                      \
}

Is it true?

While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.

My suggestion was

static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}

RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)

static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, vxrm);
    }
}

static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }

    clear_fn(vd, vl, vl * dsz,  vlmax * dsz);
}

>From vext_vv_rm_2, a constant is passed down all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain.  At
which point all of the expressions get folded by the compiler and we *should*
get very similar generated code as to what you have above.

Yes, it will be much better.

I still have one question here.

Many other fixed point instructions also need vxsat besides vxsrm.

In that cases, can I just define OPIVV2_RM like this:

#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}

static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env, 
			     int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

In this way， I can write just one OPIVV2_RM instead of (OPIVV2_RM, OPIVV2_RM_ENV, OPIVV2_ENV).

Zhiwei

r~

[Prev in Thread]

Current Thread

[Next in Thread]

Re: [PATCH v5 22/60] target/riscv: vector integer merge and move instructions, (continued)
- [PATCH v5 23/60] target/riscv: vector single-width saturating add and subtract, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 23/60] target/riscv: vector single-width saturating add and subtract, Richard Henderson, 2020/03/14
- [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, Richard Henderson, 2020/03/14
    - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, Richard Henderson, 2020/03/14
    - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, LIU Zhiwei, 2020/03/14
    - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, Richard Henderson, 2020/03/14
    - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, LIU Zhiwei <=
    - Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract, Richard Henderson, 2020/03/15
- [PATCH v5 25/60] target/riscv: vector single-width fractional multiply with rounding and saturation, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 25/60] target/riscv: vector single-width fractional multiply with rounding and saturation, Richard Henderson, 2020/03/14
- [PATCH v5 26/60] target/riscv: vector widening saturating scaled multiply-add, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 26/60] target/riscv: vector widening saturating scaled multiply-add, Richard Henderson, 2020/03/14
- [PATCH v5 27/60] target/riscv: vector single-width scaling shift instructions, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 27/60] target/riscv: vector single-width scaling shift instructions, Richard Henderson, 2020/03/14
- [PATCH v5 28/60] target/riscv: vector narrowing fixed-point clip instructions, LIU Zhiwei, 2020/03/12
  - Re: [PATCH v5 28/60] target/riscv: vector narrowing fixed-point clip instructions, Richard Henderson, 2020/03/14
- [PATCH v5 29/60] target/riscv: vector single-width floating-point add/subtract instructions, LIU Zhiwei, 2020/03/12

Prev by Date: Re: [PATCH 01/14] Makefile: Only build virtiofsd if system-mode is enabled
Next by Date: Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Previous by thread: Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Next by thread: Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Index(es):
- Date
- Thread