qemu-riscv
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and


From: LIU Zhiwei
Subject: Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Date: Mon, 16 Mar 2020 07:23:28 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Thunderbird/68.5.0



On 2020/3/15 9:00, Richard Henderson wrote:
On 3/14/20 4:12 PM, LIU Zhiwei wrote:
I am not sure whether I get it. In my opinion, the code should be modified like

static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = res & 0x1;
    res   = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x3);
    res   = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    res   = (res >> 1);
    return res;
}

static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x1);
   res   = (res >> 1) + round;
    return res;
}

RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)

void do_vext_vv_env(void *vd, void *v0, void *vs1,
                    void *vs2, CPURISCVState *env, uint32_t desc,
                    uint32_t esz, uint32_t dsz,
                    opivv2_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t i;
    for (i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, env);
    }
    if (i != 0) {
        clear_fn(vd, vl, vl * dsz,  vlmax * dsz);
    }
}

#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    static opivv2_fn *fns[4] = {                          \
        NAME##_rnu, NAME##_rne,                           \
        NAME##_rdn, NAME##_rod                            \
    }                                                     \
    return do_vext_vv_env(vd, v0, vs1, vs2, env, desc,    \
                          ESZ, DSZ, fns[env->vxrm],       \
              CLEAR_FN);                      \
}

Is it true?
While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.

My suggestion was

static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}

RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)

static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, vxrm);
    }
}

static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }

    clear_fn(vd, vl, vl * dsz,  vlmax * dsz);
}

>From vext_vv_rm_2, a constant is passed down all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain.  At
which point all of the expressions get folded by the compiler and we *should*
get very similar generated code as to what you have above.
Yes, it will be much better.

I still have one question here.

Many other fixed point instructions also need vxsat besides vxsrm.

In that cases, can I just define OPIVV2_RM like this:

#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}

static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env, 
			     int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

In this way, I can write just one OPIVV2_RM instead of (OPIVV2_RM, OPIVV2_RM_ENV, OPIVV2_ENV).

Zhiwei


r~


reply via email to

[Prev in Thread] Current Thread [Next in Thread]