[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 42/61] target/arm: Implement SME2 FMLA, FMLS
From: |
Richard Henderson |
Subject: |
[PATCH 42/61] target/arm: Implement SME2 FMLA, FMLS |
Date: |
Thu, 6 Feb 2025 11:56:56 -0800 |
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-sme.c | 95 ++++++++++++++++++++++++++++++++++
target/arm/tcg/sme.decode | 48 +++++++++++++++++
2 files changed, 143 insertions(+)
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index 9c0989baff..1989dedab5 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -761,6 +761,47 @@ TRANS_FEAT(SUB_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a,
MO_64, tcg_gen_gvec_sub
*/
#define FPST_ENV -1
+static bool do_azz_fp(DisasContext *s, int nreg, int nsel,
+ int rv, int off, int zn, int zm,
+ int data, int shsel, bool multi, int fpst,
+ gen_helper_gvec_3_ptr *fn)
+{
+ if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / nreg;
+ TCGv_ptr t_za = get_zarray(s, rv, off, nreg);
+ TCGv_ptr t, ptr;
+
+ if (fpst >= 0) {
+ ptr = fpstatus_ptr(fpst);
+ } else {
+ ptr = tcg_env;
+ }
+ t = tcg_temp_new_ptr();
+
+ for (int r = 0; r < nreg; ++r) {
+ TCGv_ptr t_zn = vec_full_reg_ptr(s, zn);
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, zm);
+
+ for (int i = 0; i < nsel; ++i) {
+ int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, data | (i << shsel));
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t_zn, t_zm, ptr, tcg_constant_i32(desc));
+ }
+
+ /*
+ * For multiple-and-single vectors, Zn may wrap.
+ * For multiple vectors, both Zn and Zm are aligned.
+ */
+ zn = (zn + 1) % 32;
+ zm += multi;
+ }
+ }
+ return true;
+}
+
static bool do_azz_acc_fp(DisasContext *s, int nreg, int nsel,
int rv, int off, int zn, int zm,
int data, int shsel, bool multi, int fpst,
@@ -896,6 +937,60 @@ static bool trans_BFVDOT(DisasContext *s, arg_azx_n *a)
gen_helper_sme2_bfvdot_idx);
}
+static bool do_fmla(DisasContext *s, arg_azz_n *a, bool multi,
+ ARMFPStatusFlavour fpst, gen_helper_gvec_3_ptr *fn)
+{
+ return do_azz_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, fpst, fn);
+}
+
+TRANS_FEAT(FMLA_n1_h, aa64_sme2_f16f16, do_fmla, a, false, FPST_ZA_F16,
+ gen_helper_gvec_vfma_h)
+TRANS_FEAT(FMLS_n1_h, aa64_sme2_f16f16, do_fmla, a, false, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h)
+TRANS_FEAT(FMLA_nn_h, aa64_sme2_f16f16, do_fmla, a, true, FPST_ZA_F16,
+ gen_helper_gvec_vfma_h)
+TRANS_FEAT(FMLS_nn_h, aa64_sme2_f16f16, do_fmla, a, true, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h)
+
+TRANS_FEAT(FMLA_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_vfma_s)
+TRANS_FEAT(FMLS_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s)
+TRANS_FEAT(FMLA_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_vfma_s)
+TRANS_FEAT(FMLS_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s)
+
+TRANS_FEAT(FMLA_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_vfma_d)
+TRANS_FEAT(FMLS_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d)
+TRANS_FEAT(FMLA_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_vfma_d)
+TRANS_FEAT(FMLS_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d)
+
+static bool do_fmla_nx(DisasContext *s, arg_azx_n *a,
+ ARMFPStatusFlavour fpst, gen_helper_gvec_4_ptr *fn)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ a->idx, 0, false, fpst, fn);
+}
+
+TRANS_FEAT(FMLA_nx_h, aa64_sme2_f16f16, do_fmla_nx, a, FPST_ZA_F16,
+ gen_helper_gvec_fmla_idx_h)
+TRANS_FEAT(FMLS_nx_h, aa64_sme2_f16f16, do_fmla_nx, a, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_h :
gen_helper_gvec_fmls_idx_h)
+TRANS_FEAT(FMLA_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_fmla_idx_s)
+TRANS_FEAT(FMLS_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_s :
gen_helper_gvec_fmls_idx_s)
+TRANS_FEAT(FMLA_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_fmla_idx_d)
+TRANS_FEAT(FMLS_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_d :
gen_helper_gvec_fmls_idx_d)
+
/*
* Expand array multi-vector single (n1), array multi-vector (nn),
* and array multi-vector indexed (nx), for integer accumulate.
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index 49a45612fd..30fa60f9a0 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -373,6 +373,20 @@ SUMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1010 .
@azz_nx1_o1x4 n=2
SUMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4
SUMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4
+FMLA_n1_h 11000001 001 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_s 11000001 001 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_d 11000001 011 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_h 11000001 001 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4
+FMLA_n1_s 11000001 001 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
+FMLA_n1_d 11000001 011 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
+
+FMLS_n1_h 11000001 001 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_s 11000001 001 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_d 11000001 011 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_h 11000001 001 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4
+FMLS_n1_s 11000001 001 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
+FMLS_n1_d 11000001 011 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
+
### SME2 Multi-vector Multiple Array Vectors
%zn_ax2 6:4 !function=times_2
@@ -475,6 +489,20 @@ USMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 1010 .
@azz_2x2_o1x4
USMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 1010 . @azz_4x4_o1x4
USMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 1010 . @azz_4x4_o1x4
+FMLA_nn_h 11000001 101 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3
+FMLA_nn_s 11000001 101 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
+FMLA_nn_d 11000001 111 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
+FMLA_nn_h 11000001 101 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3
+FMLA_nn_s 11000001 101 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
+FMLA_nn_d 11000001 111 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
+
+FMLS_nn_h 11000001 101 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3
+FMLS_nn_s 11000001 101 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
+FMLS_nn_d 11000001 111 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
+FMLS_nn_h 11000001 101 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3
+FMLS_nn_s 11000001 101 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
+FMLS_nn_d 11000001 111 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
+
### SME2 Multi-vector Indexed
&azx_n n off rv zn zm idx
@@ -628,3 +656,23 @@ SUMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....1 10 ...
@azx_2x1_i4_o1
SUMLALL_nx_d 11000001 1001 .... 0 .. 00. ....1 10 ... @azx_2x1_i3_o1
SUMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...01 10 ... @azx_4x1_i4_o1
SUMLALL_nx_d 11000001 1001 .... 1 .. 00. ...01 10 ... @azx_4x1_i3_o1
+
+%idx3_10_3 10:2 3:1
+@azx_2x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax2 idx=%idx3_10_3
+@azx_4x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \
+ &azx_n n=4 rv=%mova_rv zn=%zn_ax4 idx=%idx3_10_3
+
+FMLA_nx_h 11000001 0001 .... 0 .. 1.. ....0 0 .... @azx_2x1_i3_o3
+FMLA_nx_s 11000001 0101 .... 0 .. 0.. ....0 00 ... @azx_2x1_i2_o3
+FMLA_nx_d 11000001 1101 .... 0 .. 00. ....0 00 ... @azx_2x1_i1_o3
+FMLA_nx_h 11000001 0001 .... 1 .. 1.. ...00 0 .... @azx_4x1_i3_o3
+FMLA_nx_s 11000001 0101 .... 1 .. 0.. ...00 00 ... @azx_4x1_i2_o3
+FMLA_nx_d 11000001 1101 .... 1 .. 00. ...00 00 ... @azx_4x1_i1_o3
+
+FMLS_nx_h 11000001 0001 .... 0 .. 1.. ....0 1 .... @azx_2x1_i3_o3
+FMLS_nx_s 11000001 0101 .... 0 .. 0.. ....0 10 ... @azx_2x1_i2_o3
+FMLS_nx_d 11000001 1101 .... 0 .. 00. ....0 10 ... @azx_2x1_i1_o3
+FMLS_nx_h 11000001 0001 .... 1 .. 1.. ...00 1 .... @azx_4x1_i3_o3
+FMLS_nx_s 11000001 0101 .... 1 .. 0.. ...00 10 ... @azx_4x1_i2_o3
+FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ... @azx_4x1_i1_o3
--
2.43.0
- [PATCH 38/61] target/arm: Implement SME2 SVDOT, UVDOT, SUVDOT, USVDOT, (continued)
- [PATCH 38/61] target/arm: Implement SME2 SVDOT, UVDOT, SUVDOT, USVDOT, Richard Henderson, 2025/02/06
- [PATCH 23/61] target/arm: Implement SME2 BMOPA, Richard Henderson, 2025/02/06
- [PATCH 25/61] target/arm: Introduce gen_gvec_sve2_sqdmulh, Richard Henderson, 2025/02/06
- [PATCH 28/61] target/arm: Implement SME2 ADD/SUB (array results, multiple and single vector), Richard Henderson, 2025/02/06
- [PATCH 30/61] target/arm: Pass ZA to helper_sve2_fmlal_zz[zx]w_s, Richard Henderson, 2025/02/06
- [PATCH 32/61] target/arm: Implement SME2 FDOT, Richard Henderson, 2025/02/06
- [PATCH 33/61] target/arm: Implement SME2 BFDOT, Richard Henderson, 2025/02/06
- [PATCH 36/61] target/arm: Remove helper_gvec_sudot_idx_4b, Richard Henderson, 2025/02/06
- [PATCH 18/61] target/arm: Split get_tile_rowcol argument tile_index, Richard Henderson, 2025/02/06
- [PATCH 41/61] target/arm: Rename gvec_fml[as]_[hs] with _nf_ infix, Richard Henderson, 2025/02/06
- [PATCH 42/61] target/arm: Implement SME2 FMLA, FMLS,
Richard Henderson <=
- [PATCH 40/61] target/arm: Implement SME2 SMLALL, SMLSLL, UMLALL, UMLSLL, Richard Henderson, 2025/02/06
- [PATCH 43/61] target/arm: Implement SME2 BFMLA, BFMLS, Richard Henderson, 2025/02/06
- [PATCH 46/61] target/arm: Implement SME2 BFCVT, BFCVTN, FCVT, FCVTN, Richard Henderson, 2025/02/06
- [PATCH 54/61] target/arm: Implement SME2 SUNPK, UUNPK, Richard Henderson, 2025/02/06
- [PATCH 48/61] target/arm: Implement SME2 FCVTZS, FCVTZU, Richard Henderson, 2025/02/06
- [PATCH 51/61] target/arm: Introduce do_[us]sat_[bhs] macros, Richard Henderson, 2025/02/06
- [PATCH 45/61] target/arm: Remove CPUARMState.vfp.scratch, Richard Henderson, 2025/02/06
- [PATCH 49/61] target/arm: Implement SME2 SCVTF, UCVTF, Richard Henderson, 2025/02/06
- [PATCH 50/61] target/arm: Implement SME2 FRINTN, FRINTP, FRINTM, FRINTA, Richard Henderson, 2025/02/06
- [PATCH 60/61] target/arm: Implement SME2 SEL, Richard Henderson, 2025/02/06