[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-arm] [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
From: Laurent Desnogues
Subject: Re: [Qemu-arm] [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
Date: Thu, 14 Feb 2019 10:16:14 +0100
Hello,
On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson
<address@hidden> wrote:
>
> Note that float16_to_float32 rightly squashes SNaN to QNaN.
> But of course pickNaNMulAdd, for ARM, selects SNaNs first.
> So we have to preserve SNaN long enough for the correct NaN
> to be selected. Thus float16_to_float32_by_bits.
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> target/arm/helper.h | 9 +++
> target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 163 insertions(+)
>
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index 53a38188c6..0302e13604 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
> DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
> void, ptr, ptr, ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
> + void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
> + void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
> + void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
> + void, ptr, ptr, ptr, ptr, i32)
> +
> #ifdef TARGET_AARCH64
> #include "helper-a64.h"
> #include "helper-sve.h"
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index 37f338732e..0c3b3de961 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
> DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
>
> #undef DO_FMLA_IDX
> +
> +/*
> + * Convert float16 to float32, raising no exceptions and
> + * preserving exceptional values, including SNaN.
> + * This is effectively an unpack+repack operation.
> + */
> +static float32 float16_to_float32_by_bits(uint32_t f16)
> +{
> + const int f16_bias = 15;
> + const int f32_bias = 127;
> + uint32_t sign = extract32(f16, 15, 1);
> + uint32_t exp = extract32(f16, 10, 5);
> + uint32_t frac = extract32(f16, 0, 10);
> +
> + if (exp == 0x1f) {
> + /* Inf or NaN */
> + exp = 0xff;
> + } else if (exp == 0) {
> + /* Zero or denormal. */
> + if (frac != 0) {
> + /*
> + * Denormal; these are all normal float32.
> + * Shift the fraction so that the msb is at bit 11,
> + * then remove bit 11 as the implicit bit of the
> + * normalized float32. Note that we still go through
> + * the shift for normal numbers below, to put the
> + * float32 fraction at the right place.
> + */
> + int shift = clz32(frac) - 21;
> + frac = (frac << shift) & 0x3ff;
> + exp = f32_bias - f16_bias - shift + 1;
If FZ16 is set, this denormal input should be flushed to zero instead
of being normalized.
This means you will have to use both fp_status (for the muladd) and
fp_status_f16 (for this function), so you should pass cpu_env to
the helpers rather than the fp_status.
Thanks,
Laurent
> + }
> + } else {
> + /* Normal number; adjust the bias. */
> + exp += f32_bias - f16_bias;
> + }
> + sign <<= 31;
> + exp <<= 23;
> + frac <<= 23 - 10;
> +
> + return sign | exp | frac;
> +}
> +
> +static float32 fmlal(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> + float32 n = float16_to_float32_by_bits(n16);
> + float32 m = float16_to_float32_by_bits(m16);
> + return float32_muladd(n, m, a, 0, fpst);
> +}
> +
> +static float32 fmlsl(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> + float32 n = float16_to_float32_by_bits(n16);
> + float32 m = float16_to_float32_by_bits(m16);
> + return float32_muladd(float32_chs(n), m, a, 0, fpst);
> +}
> +
> +static inline uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
> +{
> + /*
> + * Branchless load of u32[0], u64[0], u32[1], or u64[1].
> + * Load the 2nd qword iff is_q & is_2.
> + * Shift to the 2nd dword iff !is_q & is_2.
> + * For !is_q & !is_2, the upper bits of the result are garbage.
> + */
> + return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
> +}
> +
> +/*
> + * Note that FMLAL and FMLSL require oprsz == 8 or oprsz == 16,
> + * as there are not yet SVE versions that might use blocking.
> + */
> +
> +void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm,
> + void *fpst, uint32_t desc)
> +{
> + intptr_t i, oprsz = simd_oprsz(desc);
> + int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> + int is_q = oprsz == 16;
> + float32 *d = vd;
> + uint64_t n_4, m_4;
> +
> + /* Pre-load all of the f16 data, avoiding overlap issues. */
> + n_4 = load4_f16(vn, is_q, is_2);
> + m_4 = load4_f16(vm, is_q, is_2);
> +
> + for (i = 0; i < oprsz / 4; i++) {
> + d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16),
> + extract64(m_4, i*16, 16), fpst);
> + }
> + clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_h)(void *vd, void *vn, void *vm,
> + void *fpst, uint32_t desc)
> +{
> + intptr_t i, oprsz = simd_oprsz(desc);
> + int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> + int is_q = oprsz == 16;
> + float32 *d = vd;
> + uint64_t n_4, m_4;
> +
> + /* Pre-load all of the f16 data, avoiding overlap issues. */
> + n_4 = load4_f16(vn, is_q, is_2);
> + m_4 = load4_f16(vm, is_q, is_2);
> +
> + for (i = 0; i < oprsz / 4; i++) {
> + d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16),
> + extract64(m_4, i*16, 16), fpst);
> + }
> + clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlal_idx_h)(void *vd, void *vn, void *vm,
> + void *fpst, uint32_t desc)
> +{
> + intptr_t i, oprsz = simd_oprsz(desc);
> + int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> + int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> + int is_q = oprsz == 16;
> + float32 *d = vd;
> + uint64_t n_4;
> + float16 m_1;
> +
> + /* Pre-load all of the f16 data, avoiding overlap issues. */
> + n_4 = load4_f16(vn, is_q, is_2);
> + m_1 = ((float16 *)vm)[H2(index)];
> +
> + for (i = 0; i < oprsz / 4; i++) {
> + d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i * 16, 16), m_1, fpst);
> + }
> + clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_idx_h)(void *vd, void *vn, void *vm,
> + void *fpst, uint32_t desc)
> +{
> + intptr_t i, oprsz = simd_oprsz(desc);
> + int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> + int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> + int is_q = oprsz == 16;
> + float32 *d = vd;
> + uint64_t n_4;
> + float16 m_1;
> +
> + /* Pre-load all of the f16 data, avoiding overlap issues. */
> + n_4 = load4_f16(vn, is_q, is_2);
> + m_1 = ((float16 *)vm)[H2(index)];
> +
> + for (i = 0; i < oprsz / 4; i++) {
> + d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16), m_1, fpst);
> + }
> + clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> --
> 2.17.2
>
>