qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 34/42] i386: Implement VGATHER


From: Paul Brook
Subject: [PATCH v2 34/42] i386: Implement VGATHER
Date: Sun, 24 Apr 2022 23:01:56 +0100

These are scatter load instructions that need introduce a new "Vector SIB"
encoding.  Also a bit of hair to handle different index sizes and scaling
factors, but overall the combinatorial explosion doesn't end up too bad.

The other thing of note is probably that these also modify the mask operand.
Thankfully the operands may not overlap, and we do not have to make the whole
thing appear atomic.

Signed-off-by: Paul Brook <paul@nowt.org>
---
 target/i386/ops_sse.h        | 65 +++++++++++++++++++++++++++++++
 target/i386/ops_sse_header.h | 16 ++++++++
 target/i386/tcg/translate.c  | 74 ++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ffcba3d02c..14a2d1bf78 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -3288,6 +3288,71 @@ void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *v, Reg *s)
 #endif
 }
 
+#define VGATHER_HELPER(scale)                                       \
+void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (2 << SHIFT); i++) {                            \
+        if (v->L(i) >> 31) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int32_t)s->L(i) << scale);        \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->L(i) = 0;                                                \
+    }                                                               \
+}                                                                   \
+void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->Q(i) >> 63) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int32_t)s->L(i) << scale);        \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->Q(i) = 0;                                                \
+    }                                                               \
+}                                                                   \
+void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->L(i) >> 31) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int64_t)s->Q(i) << scale);        \
+            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->L(i) = 0;                                                \
+    }                                                               \
+    d->Q(SHIFT) = 0;                                                    \
+    v->Q(SHIFT) = 0;                                                    \
+    YMM_ONLY(                                                       \
+    d->Q(3) = 0;                                                    \
+    v->Q(3) = 0;                                                    \
+    )                                                               \
+}                                                                   \
+void glue(helper_vpgatherqq ## scale, SUFFIX)(CPUX86State *env,     \
+        Reg *d, Reg *v, Reg *s, target_ulong a0)                    \
+{                                                                   \
+    int i;                                                          \
+    for (i = 0; i < (1 << SHIFT); i++) {                            \
+        if (v->Q(i) >> 63) {                                        \
+            target_ulong addr = a0                                  \
+                + ((target_ulong)(int64_t)s->Q(i) << scale);        \
+            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());          \
+        }                                                           \
+        v->Q(i) = 0;                                                \
+    }                                                               \
+}
+
+VGATHER_HELPER(0)
+VGATHER_HELPER(1)
+VGATHER_HELPER(2)
+VGATHER_HELPER(3)
+
 #if SHIFT == 2
 void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index a7a6bf6b10..e5d8ea9bb7 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -433,6 +433,22 @@ DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, 
Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl)
 DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, Reg, Reg, Reg)
 DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg)
+DEF_HELPER_5(glue(vpgatherdd0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq0, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq1, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq2, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherdq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqd3, SUFFIX), void, env, Reg, Reg, Reg, tl)
+DEF_HELPER_5(glue(vpgatherqq3, SUFFIX), void, env, Reg, Reg, Reg, tl)
 #if SHIFT == 2
 DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_1(vzeroall, void, env)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e00195d301..fe1ab58d07 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3315,6 +3315,10 @@ static const struct SSEOpHelper_table6 
sse_op_table6[256] = {
     /* vpmaskmovd, vpmaskmovq */
     [0x8c] = BINARY_OP(vpmaskmovd, AVX, SSE_OPF_AVX2),
     [0x8e] = SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */
+    [0x90] = SPECIAL_OP(AVX), /* vpgatherdd, vpgatherdq */
+    [0x91] = SPECIAL_OP(AVX), /* vpgatherqd, vpgatherqq */
+    [0x92] = SPECIAL_OP(AVX), /* vgatherdpd, vgatherdps */
+    [0x93] = SPECIAL_OP(AVX), /* vgatherqpd, vgatherqps */
 #define gen_helper_aesimc_ymm NULL
     [0xdb] = UNARY_OP(aesimc, AES, 0),
     [0xdc] = BINARY_OP(aesenc, AES, 0),
@@ -3381,6 +3385,25 @@ static const SSEFunc_0_eppt sse_op_table9[2][2] = {
     SSE_OP(vpmaskmovd_st),
     SSE_OP(vpmaskmovq_st),
 };
+
+static const SSEFunc_0_epppt sse_op_table10[16][2] = {
+    SSE_OP(vpgatherdd0),
+    SSE_OP(vpgatherdq0),
+    SSE_OP(vpgatherqd0),
+    SSE_OP(vpgatherqq0),
+    SSE_OP(vpgatherdd1),
+    SSE_OP(vpgatherdq1),
+    SSE_OP(vpgatherqd1),
+    SSE_OP(vpgatherqq1),
+    SSE_OP(vpgatherdd2),
+    SSE_OP(vpgatherdq2),
+    SSE_OP(vpgatherqd2),
+    SSE_OP(vpgatherqq2),
+    SSE_OP(vpgatherdd3),
+    SSE_OP(vpgatherdq3),
+    SSE_OP(vpgatherqd3),
+    SSE_OP(vpgatherqq3),
+};
 #undef SSE_OP
 
 /* VEX prefix not allowed */
@@ -4350,6 +4373,57 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
                 }
                 op1_offset = ZMM_OFFSET(reg);
 
+                if ((b & 0xfc) == 0x90) { /* vgather */
+                    int scale, index, base;
+                    target_long disp = 0;
+                    CHECK_AVX2(s);
+                    if (mod == 3 || rm != 4) {
+                        goto illegal_op;
+                    }
+
+                    /* Vector SIB */
+                    val = x86_ldub_code(env, s);
+                    scale = (val >> 6) & 3;
+                    index = ((val >> 3) & 7) | REX_X(s);
+                    base = (val & 7) | REX_B(s);
+                    switch (mod) {
+                    case 0:
+                        if (base == 5) {
+                            base = -1;
+                            disp = (int32_t)x86_ldl_code(env, s);
+                        }
+                        break;
+                    case 1:
+                        disp = (int8_t)x86_ldub_code(env, s);
+                        break;
+                    default:
+                    case 2:
+                        disp = (int32_t)x86_ldl_code(env, s);
+                        break;
+                    }
+
+                    /* destination, index and mask registers must not overlap 
*/
+                    if (reg == index || reg == reg_v) {
+                        goto illegal_op;
+                    }
+
+                    tcg_gen_addi_tl(s->A0, cpu_regs[base], disp);
+                    gen_add_A0_ds_seg(s);
+                    op2_offset = ZMM_OFFSET(index);
+                    v_offset = ZMM_OFFSET(reg_v);
+                    tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+                    tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+                    tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset);
+                    b1 = REX_W(s) | ((b & 1) << 1) | (scale << 2);
+                    sse_op_table10[b1][s->vex_l](cpu_env,
+                            s->ptr0, s->ptr2, s->ptr1, s->A0);
+                    if (!s->vex_l) {
+                        gen_clear_ymmh(s, reg);
+                        gen_clear_ymmh(s, reg_v);
+                    }
+                    return;
+                }
+
                 if (op6.flags & SSE_OPF_MMX) {
                     CHECK_AVX2_256(s);
                 }
-- 
2.36.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]