[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 43/61] target/arm: Implement SME2 BFMLA, BFMLS
From: Richard Henderson
Subject: [PATCH 43/61] target/arm: Implement SME2 BFMLA, BFMLS
Date: Thu, 6 Feb 2025 11:56:57 -0800
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 9 +++++++++
target/arm/tcg/translate-sme.c | 14 ++++++++++++++
target/arm/tcg/vec_helper.c | 26 ++++++++++++++++++++++++++
target/arm/tcg/sme.decode | 18 ++++++++++++++++++
4 files changed, 67 insertions(+)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index b00b79c12c..ac2372bbe7 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -785,14 +785,17 @@ DEF_HELPER_FLAGS_5(gvec_fmls_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i
DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfma_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfmla, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, fpst, i32)
@@ -824,6 +827,8 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmla_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmls_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
@@ -831,6 +836,8 @@ DEF_HELPER_FLAGS_6(gvec_fmls_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmls_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmls_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
@@ -838,6 +845,8 @@ DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmls_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_uqadd_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index 1989dedab5..5dce2dfd15 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -971,6 +971,15 @@ TRANS_FEAT(FMLA_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
TRANS_FEAT(FMLS_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d)
+TRANS_FEAT(BFMLA_n1, aa64_sme2_b16b16, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_bfmla)
+TRANS_FEAT(BFMLS_n1, aa64_sme2_b16b16, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls)
+TRANS_FEAT(BFMLA_nn, aa64_sme2_b16b16, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_bfmla)
+TRANS_FEAT(BFMLS_nn, aa64_sme2_b16b16, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls)
+
static bool do_fmla_nx(DisasContext *s, arg_azx_n *a,
ARMFPStatusFlavour fpst, gen_helper_gvec_4_ptr *fn)
{
@@ -991,6 +1000,11 @@ TRANS_FEAT(FMLA_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
TRANS_FEAT(FMLS_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_d :
gen_helper_gvec_fmls_idx_d)
+TRANS_FEAT(BFMLA_nx, aa64_sme2_b16b16, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_bfmla_idx)
+TRANS_FEAT(BFMLS_nx, aa64_sme2_b16b16, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls_idx : gen_helper_gvec_bfmls_idx)
+
/*
* Expand array multi-vector single (n1), array multi-vector (nn),
* and array multi-vector indexed (nx), for integer accumulate.
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 2d2a000a4a..e5a4f56ef7 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -1607,6 +1607,12 @@ static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, 0, stat);
}
+static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, 0, stat);
+}
+
static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1625,6 +1631,12 @@ static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}
+static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
+}
+
static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1643,6 +1655,12 @@ static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}
+static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
+}
+
static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1676,14 +1694,19 @@ DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
+DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16)
DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
+DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16)
DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
+DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16)
+
+#undef DO_MULADD
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
@@ -1802,14 +1825,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
+DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
+DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
+DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product)
#undef DO_FMLA_IDX
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index 30fa60f9a0..0d592bb467 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -373,16 +373,22 @@ SUMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=2
SUMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4
SUMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4
+BFMLA_n1 11000001 011 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2
FMLA_n1_h 11000001 001 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2
FMLA_n1_s 11000001 001 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
FMLA_n1_d 11000001 011 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
+
+BFMLA_n1 11000001 011 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4
FMLA_n1_h 11000001 001 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4
FMLA_n1_s 11000001 001 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
FMLA_n1_d 11000001 011 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
+BFMLS_n1 11000001 011 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2
FMLS_n1_h 11000001 001 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2
FMLS_n1_s 11000001 001 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
FMLS_n1_d 11000001 011 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
+
+BFMLS_n1 11000001 011 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4
FMLS_n1_h 11000001 001 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4
FMLS_n1_s 11000001 001 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
FMLS_n1_d 11000001 011 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
@@ -489,16 +495,22 @@ USMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 1010 . @azz_2x2_o1x4
USMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 1010 . @azz_4x4_o1x4
USMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 1010 . @azz_4x4_o1x4
+BFMLA_nn 11000001 111 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3
FMLA_nn_h 11000001 101 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3
FMLA_nn_s 11000001 101 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
FMLA_nn_d 11000001 111 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
+
+BFMLA_nn 11000001 111 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3
FMLA_nn_h 11000001 101 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3
FMLA_nn_s 11000001 101 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
FMLA_nn_d 11000001 111 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
+BFMLS_nn 11000001 111 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3
FMLS_nn_h 11000001 101 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3
FMLS_nn_s 11000001 101 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
FMLS_nn_d 11000001 111 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
+
+BFMLS_nn 11000001 111 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3
FMLS_nn_h 11000001 101 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3
FMLS_nn_s 11000001 101 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
FMLS_nn_d 11000001 111 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
@@ -663,16 +675,22 @@ SUMLALL_nx_d 11000001 1001 .... 1 .. 00. ...01 10 ... @azx_4x1_i3_o1
@azx_4x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \
&azx_n n=4 rv=%mova_rv zn=%zn_ax4 idx=%idx3_10_3
+BFMLA_nx 11000001 0001 .... 0 .. 1.. ....1 0 .... @azx_2x1_i3_o3
FMLA_nx_h 11000001 0001 .... 0 .. 1.. ....0 0 .... @azx_2x1_i3_o3
FMLA_nx_s 11000001 0101 .... 0 .. 0.. ....0 00 ... @azx_2x1_i2_o3
FMLA_nx_d 11000001 1101 .... 0 .. 00. ....0 00 ... @azx_2x1_i1_o3
+
+BFMLA_nx 11000001 0001 .... 1 .. 1.. ...01 0 .... @azx_4x1_i3_o3
FMLA_nx_h 11000001 0001 .... 1 .. 1.. ...00 0 .... @azx_4x1_i3_o3
FMLA_nx_s 11000001 0101 .... 1 .. 0.. ...00 00 ... @azx_4x1_i2_o3
FMLA_nx_d 11000001 1101 .... 1 .. 00. ...00 00 ... @azx_4x1_i1_o3
+BFMLS_nx 11000001 0001 .... 0 .. 1.. ....1 1 .... @azx_2x1_i3_o3
FMLS_nx_h 11000001 0001 .... 0 .. 1.. ....0 1 .... @azx_2x1_i3_o3
FMLS_nx_s 11000001 0101 .... 0 .. 0.. ....0 10 ... @azx_2x1_i2_o3
FMLS_nx_d 11000001 1101 .... 0 .. 00. ....0 10 ... @azx_2x1_i1_o3
+
+BFMLS_nx 11000001 0001 .... 1 .. 1.. ...01 1 .... @azx_4x1_i3_o3
FMLS_nx_h 11000001 0001 .... 1 .. 1.. ...00 1 .... @azx_4x1_i3_o3
FMLS_nx_s 11000001 0101 .... 1 .. 0.. ...00 10 ... @azx_4x1_i2_o3
FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ... @azx_4x1_i1_o3
--
2.43.0
- [PATCH 25/61] target/arm: Introduce gen_gvec_sve2_sqdmulh, (continued)
- [PATCH 25/61] target/arm: Introduce gen_gvec_sve2_sqdmulh, Richard Henderson, 2025/02/06
- [PATCH 28/61] target/arm: Implement SME2 ADD/SUB (array results, multiple and single vector), Richard Henderson, 2025/02/06
- [PATCH 30/61] target/arm: Pass ZA to helper_sve2_fmlal_zz[zx]w_s, Richard Henderson, 2025/02/06
- [PATCH 32/61] target/arm: Implement SME2 FDOT, Richard Henderson, 2025/02/06
- [PATCH 33/61] target/arm: Implement SME2 BFDOT, Richard Henderson, 2025/02/06
- [PATCH 36/61] target/arm: Remove helper_gvec_sudot_idx_4b, Richard Henderson, 2025/02/06
- [PATCH 18/61] target/arm: Split get_tile_rowcol argument tile_index, Richard Henderson, 2025/02/06
- [PATCH 41/61] target/arm: Rename gvec_fml[as]_[hs] with _nf_ infix, Richard Henderson, 2025/02/06
- [PATCH 42/61] target/arm: Implement SME2 FMLA, FMLS, Richard Henderson, 2025/02/06
- [PATCH 40/61] target/arm: Implement SME2 SMLALL, SMLSLL, UMLALL, UMLSLL, Richard Henderson, 2025/02/06
- [PATCH 43/61] target/arm: Implement SME2 BFMLA, BFMLS,
Richard Henderson <=
- [PATCH 46/61] target/arm: Implement SME2 BFCVT, BFCVTN, FCVT, FCVTN, Richard Henderson, 2025/02/06
- [PATCH 54/61] target/arm: Implement SME2 SUNPK, UUNPK, Richard Henderson, 2025/02/06
- [PATCH 48/61] target/arm: Implement SME2 FCVTZS, FCVTZU, Richard Henderson, 2025/02/06
- [PATCH 51/61] target/arm: Introduce do_[us]sat_[bhs] macros, Richard Henderson, 2025/02/06
- [PATCH 45/61] target/arm: Remove CPUARMState.vfp.scratch, Richard Henderson, 2025/02/06
- [PATCH 49/61] target/arm: Implement SME2 SCVTF, UCVTF, Richard Henderson, 2025/02/06
- [PATCH 50/61] target/arm: Implement SME2 FRINTN, FRINTP, FRINTM, FRINTA, Richard Henderson, 2025/02/06
- [PATCH 60/61] target/arm: Implement SME2 SEL, Richard Henderson, 2025/02/06
- [PATCH 61/61] target/arm: Enable FEAT_SME2, FEAT_SME_F16F16, FEAT_SVE_B16B16 on -cpu max, Richard Henderson, 2025/02/06
- [PATCH 55/61] target/arm: Implement SME2 ZIP, UZP (four registers), Richard Henderson, 2025/02/06
- Prev by Date:
[PATCH 40/61] target/arm: Implement SME2 SMLALL, SMLSLL, UMLALL, UMLSLL
- Next by Date:
[PATCH 46/61] target/arm: Implement SME2 BFCVT, BFCVTN, FCVT, FCVTN
- Previous by thread:
[PATCH 40/61] target/arm: Implement SME2 SMLALL, SMLSLL, UMLALL, UMLSLL
- Next by thread:
[PATCH 46/61] target/arm: Implement SME2 BFCVT, BFCVTN, FCVT, FCVTN
- Index(es):