[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 54/61] target/arm: Implement SME2 SUNPK, UUNPK
From: |
Richard Henderson |
Subject: |
[PATCH 54/61] target/arm: Implement SME2 SUNPK, UUNPK |
Date: |
Thu, 6 Feb 2025 11:57:08 -0800 |
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/helper-sme.h | 13 ++++++++++++
target/arm/tcg/sme_helper.c | 38 ++++++++++++++++++++++++++++++++++
target/arm/tcg/translate-sme.c | 16 ++++++++++++++
target/arm/tcg/sme.decode | 18 ++++++++++++++++
4 files changed, 85 insertions(+)
diff --git a/target/arm/tcg/helper-sme.h b/target/arm/tcg/helper-sme.h
index 7ae3fd172d..3d3eb393b3 100644
--- a/target/arm/tcg/helper-sme.h
+++ b/target/arm/tcg/helper-sme.h
@@ -238,3 +238,16 @@ DEF_HELPER_FLAGS_3(sme2_sqcvtun_sb, TCG_CALL_NO_RWG, void,
ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_sqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_uqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_sqcvtun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c
index a9adc8589f..cf4f09dbc4 100644
--- a/target/arm/tcg/sme_helper.c
+++ b/target/arm/tcg/sme_helper.c
@@ -1676,6 +1676,44 @@ void HELPER(sme2_fcvt_w)(void *vd, void *vs,
float_status *fpst, uint32_t desc)
}
}
+#define UNPK(NAME, SREG, TW, TN, HW, HN) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[SREG] __attribute__((uninitialized)); \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t n = oprsz / sizeof(TW); \
+ if ((vs - vd) < 2 * SREG * sizeof(ARMVectorReg)) { \
+ vs = memcpy(scratch, vs, sizeof(scratch)); \
+ } \
+ for (size_t r = 0; r < SREG; ++r) { \
+ TN *s = vs + r * sizeof(ARMVectorReg); \
+ for (size_t i = 0; i < 2; ++i) { \
+ TW *d = vd + (2 * r + i) * sizeof(ARMVectorReg); \
+ for (size_t e = 0; e < n; ++e) { \
+ d[HW(e)] = s[i * n + HN(e)]; \
+ } \
+ } \
+ } \
+}
+
+UNPK(sme2_sunpk2_bh, 1, int16_t, int8_t, H2, H1)
+UNPK(sme2_sunpk2_hs, 1, int32_t, int16_t, H4, H2)
+UNPK(sme2_sunpk2_sd, 1, int64_t, int32_t, H8, H4)
+
+UNPK(sme2_sunpk4_bh, 2, int16_t, int8_t, H2, H1)
+UNPK(sme2_sunpk4_hs, 2, int32_t, int16_t, H4, H2)
+UNPK(sme2_sunpk4_sd, 2, int64_t, int32_t, H8, H4)
+
+UNPK(sme2_uunpk2_bh, 1, uint16_t, uint8_t, H2, H1)
+UNPK(sme2_uunpk2_hs, 1, uint32_t, uint16_t, H4, H2)
+UNPK(sme2_uunpk2_sd, 1, uint64_t, uint32_t, H8, H4)
+
+UNPK(sme2_uunpk4_bh, 2, uint16_t, uint8_t, H2, H1)
+UNPK(sme2_uunpk4_hs, 2, uint32_t, uint16_t, H4, H2)
+UNPK(sme2_uunpk4_sd, 2, uint64_t, uint32_t, H8, H4)
+
+#undef UNPK
+
/* Deinterleave and convert. */
void HELPER(sme2_fcvtl)(void *vd, void *vs, float_status *fpst, uint32_t desc)
{
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index ec88a91cf1..9d7a10aa6b 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -1347,3 +1347,19 @@ TRANS_FEAT(SQCVTUN_sb, aa64_sme2, do_zz, a, 0,
gen_helper_sme2_sqcvtun_sb)
TRANS_FEAT(SQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtn_dh)
TRANS_FEAT(UQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvtn_dh)
TRANS_FEAT(SQCVTUN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtun_dh)
+
+TRANS_FEAT(SUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_bh)
+TRANS_FEAT(SUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_hs)
+TRANS_FEAT(SUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_sd)
+
+TRANS_FEAT(SUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_bh)
+TRANS_FEAT(SUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_hs)
+TRANS_FEAT(SUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_sd)
+
+TRANS_FEAT(UUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_bh)
+TRANS_FEAT(UUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_hs)
+TRANS_FEAT(UUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_sd)
+
+TRANS_FEAT(UUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_bh)
+TRANS_FEAT(UUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_hs)
+TRANS_FEAT(UUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_sd)
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index 491c6231ea..1da64c5258 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -733,6 +733,8 @@ FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ...
@azx_4x1_i1_o3
&zz_n n=2 zd=%zd_ax2 zn=%zn_ax2
@zz_4x4 ........ ... ..... ...... .... . ..... \
&zz_n n=4 zd=%zd_ax4 zn=%zn_ax4
+@zz_4x2_n1 ........ ... ..... ...... .... . ..... \
+ &zz_n n=1 zd=%zd_ax4 zn=%zn_ax2
BFCVT 11000001 011 00000 111000 ....0 ..... @zz_1x2
BFCVTN 11000001 011 00000 111000 ....1 ..... @zz_1x2
@@ -781,3 +783,19 @@ SQCVTUN_sb 11000001 011 10011 111000 ...10 .....
@zz_1x4
SQCVTN_dh 11000001 101 10011 111000 ...10 ..... @zz_1x4
UQCVTN_dh 11000001 101 10011 111000 ...11 ..... @zz_1x4
SQCVTUN_dh 11000001 111 10011 111000 ...10 ..... @zz_1x4
+
+SUNPK_2bh 11000001 011 00101 111000 ..... ....0 @zz_2x1
+SUNPK_2hs 11000001 101 00101 111000 ..... ....0 @zz_2x1
+SUNPK_2sd 11000001 111 00101 111000 ..... ....0 @zz_2x1
+
+UUNPK_2bh 11000001 011 00101 111000 ..... ....1 @zz_2x1
+UUNPK_2hs 11000001 101 00101 111000 ..... ....1 @zz_2x1
+UUNPK_2sd 11000001 111 00101 111000 ..... ....1 @zz_2x1
+
+SUNPK_4bh 11000001 011 10101 111000 ....0 ...00 @zz_4x2_n1
+SUNPK_4hs 11000001 101 10101 111000 ....0 ...00 @zz_4x2_n1
+SUNPK_4sd 11000001 111 10101 111000 ....0 ...00 @zz_4x2_n1
+
+UUNPK_4bh 11000001 011 10101 111000 ....0 ...01 @zz_4x2_n1
+UUNPK_4hs 11000001 101 10101 111000 ....0 ...01 @zz_4x2_n1
+UUNPK_4sd 11000001 111 10101 111000 ....0 ...01 @zz_4x2_n1
--
2.43.0
- [PATCH 30/61] target/arm: Pass ZA to helper_sve2_fmlal_zz[zx]w_s, (continued)
- [PATCH 30/61] target/arm: Pass ZA to helper_sve2_fmlal_zz[zx]w_s, Richard Henderson, 2025/02/06
- [PATCH 32/61] target/arm: Implement SME2 FDOT, Richard Henderson, 2025/02/06
- [PATCH 33/61] target/arm: Implement SME2 BFDOT, Richard Henderson, 2025/02/06
- [PATCH 36/61] target/arm: Remove helper_gvec_sudot_idx_4b, Richard Henderson, 2025/02/06
- [PATCH 18/61] target/arm: Split get_tile_rowcol argument tile_index, Richard Henderson, 2025/02/06
- [PATCH 41/61] target/arm: Rename gvec_fml[as]_[hs] with _nf_ infix, Richard Henderson, 2025/02/06
- [PATCH 42/61] target/arm: Implement SME2 FMLA, FMLS, Richard Henderson, 2025/02/06
- [PATCH 40/61] target/arm: Implement SME2 SMLALL, SMLSLL, UMLALL, UMLSLL, Richard Henderson, 2025/02/06
- [PATCH 43/61] target/arm: Implement SME2 BFMLA, BFMLS, Richard Henderson, 2025/02/06
- [PATCH 46/61] target/arm: Implement SME2 BFCVT, BFCVTN, FCVT, FCVTN, Richard Henderson, 2025/02/06
- [PATCH 54/61] target/arm: Implement SME2 SUNPK, UUNPK,
Richard Henderson <=
- [PATCH 48/61] target/arm: Implement SME2 FCVTZS, FCVTZU, Richard Henderson, 2025/02/06
- [PATCH 51/61] target/arm: Introduce do_[us]sat_[bhs] macros, Richard Henderson, 2025/02/06
- [PATCH 45/61] target/arm: Remove CPUARMState.vfp.scratch, Richard Henderson, 2025/02/06
- [PATCH 49/61] target/arm: Implement SME2 SCVTF, UCVTF, Richard Henderson, 2025/02/06
- [PATCH 50/61] target/arm: Implement SME2 FRINTN, FRINTP, FRINTM, FRINTA, Richard Henderson, 2025/02/06
- [PATCH 60/61] target/arm: Implement SME2 SEL, Richard Henderson, 2025/02/06
- [PATCH 61/61] target/arm: Enable FEAT_SME2, FEAT_SME_F16F16, FEAT_SVE_B16B16 on -cpu max, Richard Henderson, 2025/02/06
- [PATCH 55/61] target/arm: Implement SME2 ZIP, UZP (four registers), Richard Henderson, 2025/02/06
- [PATCH 44/61] target/arm: Implement SME2 FADD, FSUB, BFADD, BFSUB, Richard Henderson, 2025/02/06
- [PATCH 47/61] target/arm: Implement SME2 FCVT (widening), FCVTL, Richard Henderson, 2025/02/06