[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PULL 12/16] target/arm: Add helpers for FMLAL
From: |
Peter Maydell |
Subject: |
[Qemu-devel] [PULL 12/16] target/arm: Add helpers for FMLAL |
Date: |
Thu, 28 Feb 2019 11:08:31 +0000 |
From: Richard Henderson <address@hidden>
Note that float16_to_float32 rightly squashes SNaN to QNaN.
But of course pickNaNMulAdd, for ARM, selects SNaNs first.
So we have to preserve SNaN long enough for the correct NaN
to be selected; hence the new float16_to_float32_by_bits helper.
Signed-off-by: Richard Henderson <address@hidden>
Message-id: address@hidden
Reviewed-by: Peter Maydell <address@hidden>
Signed-off-by: Peter Maydell <address@hidden>
---
target/arm/helper.h | 9 +++
target/arm/vec_helper.c | 148 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 157 insertions(+)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 747cb64d29f..d363904278a 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -677,6 +677,15 @@ DEF_HELPER_FLAGS_5(gvec_sqsub_s, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_sqsub_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlal_a32, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32) /* vd, vn, vm, venv, desc */
+DEF_HELPER_FLAGS_5(gvec_fmlal_a64, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32) /* vd, vn, vm, venv, desc */
+DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a32, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32) /* vd, vn, vm, venv, desc */
+DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a64, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32) /* vd, vn, vm, venv, desc */
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index dfc635cf9a5..dedef62403a 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -898,3 +898,151 @@ void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
}
clear_tail(d, oprsz, simd_maxsz(desc));
}
+
+/*
+ * Convert the float16 in the low 16 bits of f16 to float32 by unpacking
+ * and repacking its fields: no exceptions are raised and exceptional
+ * values, including SNaN, are preserved. If fz16, denormals become zero.
+ */
+static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
+{
+ const int f16_bias = 15;
+ const int f32_bias = 127;
+ uint32_t sign = extract32(f16, 15, 1);
+ uint32_t exp = extract32(f16, 10, 5);
+ uint32_t frac = extract32(f16, 0, 10);
+
+ if (exp == 0x1f) {
+ /* Inf or NaN: all-ones exponent; the fraction is carried over as is. */
+ exp = 0xff;
+ } else if (exp == 0) {
+ /* Zero or denormal. */
+ if (frac != 0) {
+ if (fz16) {
+ frac = 0; /* flush the denormal input to zero; sign is kept */
+ } else {
+ /*
+ * Denormal; these are all normal float32.
+ * Shift the fraction so that the msb is at bit 10,
+ * then mask off bit 10 as the implicit bit of the
+ * normalized float32. Note that we still go through
+ * the common shift below, as normal numbers do, to put
+ * the float32 fraction at the right place.
+ */
+ int shift = clz32(frac) - 21;
+ frac = (frac << shift) & 0x3ff;
+ exp = f32_bias - f16_bias - shift + 1;
+ }
+ }
+ } else {
+ /* Normal number; adjust the bias. */
+ exp += f32_bias - f16_bias;
+ }
+ sign <<= 31;
+ exp <<= 23;
+ frac <<= 23 - 10; /* widen the 10-bit fraction to 23 bits */
+
+ return sign | exp | frac;
+}
+
+static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
+{
+ /*
+ * Branchless load of u32[0], u64[0], u32[1], or u64[1], chosen by
+ * is_q and is_2 (each must be 0 or 1): load the 2nd qword iff
+ * is_q & is_2; shift to the 2nd dword iff !is_q & is_2.
+ * For !is_q & !is_2, the upper bits of the result are garbage.
+ */
+ return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); /* shift is 0 or 32 */
+}
+
+/*
+ * Note that FMLAL requires oprsz == 8 or oprsz == 16,
+ * as there are not yet SVE versions that might use blocking.
+ */
+
+static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
+ uint32_t desc, bool fz16)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); /* negate n (FMLSL) */
+ int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); /* 2nd half of inputs */
+ int is_q = oprsz == 16;
+ uint64_t n_4, m_4; /* up to four packed f16 inputs each */
+
+ /* Pre-load all of the f16 data, avoiding overlap issues. */
+ n_4 = load4_f16(vn, is_q, is_2);
+ m_4 = load4_f16(vm, is_q, is_2);
+
+ /* Negate all inputs for FMLSL at once, by flipping each f16 sign bit. */
+ if (is_s) {
+ n_4 ^= 0x8000800080008000ull;
+ }
+
+ for (i = 0; i < oprsz / 4; i++) { /* one float32 result per f16 pair */
+ float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+ float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv; /* muladd below uses standard_fp_status */
+ do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); /* fz16 */
+}
+
+void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv; /* muladd below uses fp_status */
+ do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); /* fz16 */
+}
+
+static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
+ uint32_t desc, bool fz16)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); /* negate n (FMLSL) */
+ int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); /* 2nd half of n */
+ int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); /* f16 element of vm */
+ int is_q = oprsz == 16;
+ uint64_t n_4; /* up to four packed f16 inputs */
+ float32 m_1; /* vm[index], widened once and shared by every lane */
+
+ /* Pre-load all of the f16 data, avoiding overlap issues. */
+ n_4 = load4_f16(vn, is_q, is_2);
+
+ /* Negate all inputs for FMLSL at once, by flipping each f16 sign bit. */
+ if (is_s) {
+ n_4 ^= 0x8000800080008000ull;
+ }
+
+ m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
+
+ for (i = 0; i < oprsz / 4; i++) { /* one float32 result per f16 of n */
+ float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv; /* muladd below uses standard_fp_status */
+ do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); /* fz16 */
+}
+
+void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv; /* muladd below uses fp_status */
+ do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); /* fz16 */
+}
--
2.20.1
- [Qemu-devel] [PULL 05/16] hw/misc/iotkit-sysctl: Correct typo in INITSVTOR0 register name, (continued)
- [Qemu-devel] [PULL 05/16] hw/misc/iotkit-sysctl: Correct typo in INITSVTOR0 register name, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 03/16] target/arm/cpu: Allow init-svtor property to be set after realize, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 01/16] hw/misc/armsse-mhu.c: Model the SSE-200 Message Handling Unit, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 06/16] hw/arm/iotkit-sysctl: Add SSE-200 registers, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 07/16] hw/arm/iotkit-sysctl: Implement CPUWAIT and INITSVTOR*, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 02/16] hw/arm/armsse: Wire up the MHUs, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 09/16] target/arm: Use MVFR1 feature bits to gate A32/T32 FP16 instructions, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 08/16] hw/arm/armsse: Unify init-svtor and cpuwait handling, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 10/16] target/arm: Gate "miscellaneous FP" insns by ID register field, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 11/16] Revert "arm: Allow system registers for KVM guests to be changed by QEMU code", Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 12/16] target/arm: Add helpers for FMLAL,
Peter Maydell <=
- [Qemu-devel] [PULL 13/16] target/arm: Implement FMLAL and FMLSL for aarch64, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 15/16] target/arm: Enable ARMv8.2-FHM for -cpu max, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 14/16] target/arm: Implement VFMAL and VFMSL for aarch32, Peter Maydell, 2019/02/28
- [Qemu-devel] [PULL 16/16] linux-user: Enable HWCAP_ASIMDFHM, HWCAP_JSCVT, Peter Maydell, 2019/02/28
- Re: [Qemu-devel] [PULL 00/16] target-arm queue, no-reply, 2019/02/28
- Re: [Qemu-devel] [PULL 00/16] target-arm queue, Peter Maydell, 2019/02/28