[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 2/5] tcg/i386: Simplify immediate 8-bit logical vector shifts
|
From: |
Richard Henderson |
|
Subject: |
[PATCH 2/5] tcg/i386: Simplify immediate 8-bit logical vector shifts |
|
Date: |
Wed, 24 Apr 2024 10:09:05 -0700 |
The x86 ISA does not have this operation (an 8-bit logical vector shift by immediate), so we need an expansion.
Use the same algorithm that we use for expanding this vector
operation with integers: perform the shift with a wider type
and then mask the bits that must be zero.
This reduces the instruction count from 5 to 2.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.c.inc | 61 +++++++++------------------------------
1 file changed, 14 insertions(+), 47 deletions(-)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index c6ba498623..6837c519b0 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3769,49 +3769,20 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
-static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
+static void expand_vec_shi(TCGType type, unsigned vece, bool right,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
- TCGv_vec t1, t2;
+ uint8_t mask;
tcg_debug_assert(vece == MO_8);
-
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_temp_new_vec(type);
-
- /*
- * Unpack to W, shift, and repack. Tricky bits:
- * (1) Use punpck*bw x,x to produce DDCCBBAA,
- * i.e. duplicate in other half of the 16-bit lane.
- * (2) For right-shift, add 8 so that the high half of the lane
- * becomes zero. For left-shift, and left-rotate, we must
- * shift up and down again.
- * (3) Step 2 leaves high half zero such that PACKUSWB
- * (pack with unsigned saturation) does not modify
- * the quantity.
- */
- vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
- tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
- vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
- tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
-
- if (opc != INDEX_op_rotli_vec) {
- imm += 8;
- }
- if (opc == INDEX_op_shri_vec) {
- tcg_gen_shri_vec(MO_16, t1, t1, imm);
- tcg_gen_shri_vec(MO_16, t2, t2, imm);
+ if (right) {
+ mask = 0xff >> imm;
+ tcg_gen_shri_vec(MO_16, v0, v1, imm);
} else {
- tcg_gen_shli_vec(MO_16, t1, t1, imm);
- tcg_gen_shli_vec(MO_16, t2, t2, imm);
- tcg_gen_shri_vec(MO_16, t1, t1, 8);
- tcg_gen_shri_vec(MO_16, t2, t2, 8);
+ mask = 0xff << imm;
+ tcg_gen_shli_vec(MO_16, v0, v1, imm);
}
-
- vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
- tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
+ tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}
static void expand_vec_sari(TCGType type, unsigned vece,
@@ -3821,7 +3792,7 @@ static void expand_vec_sari(TCGType type, unsigned vece,
switch (vece) {
case MO_8:
- /* Unpack to W, shift, and repack, as in expand_vec_shi. */
+ /* Unpack to 16-bit, shift, and repack. */
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
@@ -3874,12 +3845,7 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
{
TCGv_vec t;
- if (vece == MO_8) {
- expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
- return;
- }
-
- if (have_avx512vbmi2) {
+ if (vece != MO_8 && have_avx512vbmi2) {
vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
return;
@@ -4155,10 +4121,11 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
switch (opc) {
case INDEX_op_shli_vec:
- case INDEX_op_shri_vec:
- expand_vec_shi(type, vece, opc, v0, v1, a2);
+ expand_vec_shi(type, vece, false, v0, v1, a2);
+ break;
+ case INDEX_op_shri_vec:
+ expand_vec_shi(type, vece, true, v0, v1, a2);
break;
-
case INDEX_op_sari_vec:
expand_vec_sari(type, vece, v0, v1, a2);
break;
--
2.34.1
- [PATCH 0/5] tcg: Misc improvements, Richard Henderson, 2024/04/24
- [PATCH] target/arm: Restrict translation disabled alignment check to VMSA, Richard Henderson, 2024/04/24
- [PATCH 1/5] tcg: Add write_aofs to GVecGen3i, Richard Henderson, 2024/04/24
- [PATCH 2/5] tcg/i386: Simplify immediate 8-bit logical vector shifts,
Richard Henderson <=
- [PATCH 3/5] tcg/i386: Optimize setcond of TST{EQ,NE} with 0xffffffff, Richard Henderson, 2024/04/24
- [PATCH 5/5] accel/tcg: Introduce CF_BP_PAGE, Richard Henderson, 2024/04/24
- [PATCH 4/5] tcg/optimize: Optimize setcond with zmask, Richard Henderson, 2024/04/24