[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH v9 09/26] tcg: Add generic vector ops for extension
From: Richard Henderson
Subject: [Qemu-devel] [PATCH v9 09/26] tcg: Add generic vector ops for extension
Date: Mon, 15 Jan 2018 19:33:47 -0800
Signed-off-by: Richard Henderson <address@hidden>
---
accel/tcg/tcg-runtime.h | 8 +++
tcg/tcg-op-gvec.h | 9 +++
tcg/tcg-op.h | 5 ++
tcg/tcg-opc.h | 5 ++
tcg/tcg.h | 2 +
accel/tcg/tcg-runtime-gvec.c | 26 +++++++++
tcg/tcg-op-gvec.c | 130 +++++++++++++++++++++++++++++++++++++++++++
tcg/tcg-op-vec.c | 39 +++++++++++++
tcg/tcg.c | 6 ++
tcg/README | 13 +++++
10 files changed, 243 insertions(+)
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index c4a2e6b215..d1b3542946 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -199,6 +199,14 @@ DEF_HELPER_FLAGS_4(gvec_trn16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_trn32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_trn64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_extu8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_extu16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_extu32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_exts8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_exts16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_exts32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 28ec0f260c..f716c53be0 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -222,6 +222,15 @@ void tcg_gen_gvec_trne(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_trno(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_extul(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_extuh(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_extsl(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_extsh(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz);
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f967790cd9..28a5cbe47a 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -940,6 +940,11 @@ void tcg_gen_uzpo_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_trne_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_trno_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_extul_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_extuh_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_extsl_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_extsh_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+
void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
TCGv_vec a, TCGv_vec b);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index b21a30273c..3dfd872a0f 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -249,6 +249,11 @@ DEF(uzpo_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_uzp_vec))
DEF(trne_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_trn_vec))
DEF(trno_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_trn_vec))
+DEF(extul_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_extl_vec))
+DEF(extuh_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_exth_vec))
+DEF(extsl_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_extl_vec))
+DEF(extsh_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_exth_vec))
+
DEF(cmp_vec, 1, 2, 1, IMPLVEC)
DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 9ae7465d1e..f870a3f582 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -186,6 +186,8 @@ typedef uint64_t TCGRegSet;
#define TCG_TARGET_HAS_trn_vec 0
#define TCG_TARGET_HAS_cmp_vec 0
#define TCG_TARGET_HAS_mul_vec 0
+#define TCG_TARGET_HAS_extl_vec 0
+#define TCG_TARGET_HAS_exth_vec 0
#else
#define TCG_TARGET_MAYBE_vec 1
#endif
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 9406ccd769..ff26be0744 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -588,3 +588,29 @@ DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)
+
+#define DO_EXT(NAME, TYPE1, TYPE2) \
+void HELPER(NAME)(void *d, void *a, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ intptr_t oprsz_2 = oprsz / 2; /* bytes of input consumed */ \
+ intptr_t i; \
+ /* Each TYPE1 read from a is stored as a TYPE2 in d, so d is written \
+ twice as fast as a is read; an overlapping input may be clobbered. */ \
+ if (unlikely((a - d) < (uintptr_t)oprsz)) { /* a in [d, d + oprsz) */ \
+ void *a_new = alloca(oprsz_2); /* private copy of the input */ \
+ memcpy(a_new, a, oprsz_2); \
+ a = a_new; \
+ } /* NOTE(review): overlap with a below d is not detected here. */ \
+ for (i = 0; i < oprsz_2; i += sizeof(TYPE1)) { \
+ *(TYPE2 *)(d + 2 * i) = *(TYPE1 *)(a + i); /* widen by conversion */ \
+ } \
+ clear_high(d, oprsz, desc); \
+}
+
+DO_EXT(gvec_extu8, uint8_t, uint16_t) /* zero-extend 8 -> 16 bits */
+DO_EXT(gvec_extu16, uint16_t, uint32_t) /* zero-extend 16 -> 32 bits */
+DO_EXT(gvec_extu32, uint32_t, uint64_t) /* zero-extend 32 -> 64 bits */
+DO_EXT(gvec_exts8, int8_t, int16_t) /* sign-extend 8 -> 16 bits */
+DO_EXT(gvec_exts16, int16_t, int32_t) /* sign-extend 16 -> 32 bits */
+DO_EXT(gvec_exts32, int32_t, int64_t) /* sign-extend 32 -> 64 bits */
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 3695847e16..2c117a35f1 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2038,3 +2038,133 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
expand_clr(dofs + oprsz, maxsz - oprsz);
}
}
+
+static void do_ext(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, bool high, bool is_sign)
+{ /* Widen the low or high half of the oprsz bytes at aofs into dofs. */
+ static gen_helper_gvec_2 * const extu_fn[3] = { /* indexed by vece */
+ gen_helper_gvec_extu8, gen_helper_gvec_extu16, gen_helper_gvec_extu32
+ };
+ static gen_helper_gvec_2 * const exts_fn[3] = { /* indexed by vece */
+ gen_helper_gvec_exts8, gen_helper_gvec_exts16, gen_helper_gvec_exts32
+ };
+
+ TCGType type;
+ uint32_t step, i, n; /* step: bytes per host vector; n: vector count */
+ TCGOpcode opc;
+
+ check_size_align(oprsz, maxsz, dofs | aofs);
+ check_overlap_2(dofs, aofs, oprsz);
+ tcg_debug_assert(vece < MO_64); /* tables above stop at 32-bit input */
+
+ opc = is_sign ? (high ? INDEX_op_extsh_vec : INDEX_op_extsl_vec)
+ : (high ? INDEX_op_extuh_vec : INDEX_op_extul_vec);
+
+ /* Output elements do not sit at the offsets of their input elements,
+ so every input vector is loaded before any output is stored. */
+ if (TCG_TARGET_HAS_v256 && oprsz % 32 == 0 && oprsz / 32 <= 8
+ && tcg_can_emit_vec_op(opc, TCG_TYPE_V256, vece)) {
+ type = TCG_TYPE_V256;
+ step = 32;
+ n = oprsz / 32;
+ } else if (TCG_TARGET_HAS_v128 && oprsz % 16 == 0 && oprsz / 16 <= 8
+ && tcg_can_emit_vec_op(opc, TCG_TYPE_V128, vece)) {
+ type = TCG_TYPE_V128;
+ step = 16;
+ n = oprsz / 16;
+ } else if (TCG_TARGET_HAS_v64 && oprsz % 8 == 0 && oprsz / 8 <= 8
+ && tcg_can_emit_vec_op(opc, TCG_TYPE_V64, vece)) {
+ type = TCG_TYPE_V64;
+ step = 8;
+ n = oprsz / 8;
+ } else { /* no suitable host vector op: use the runtime helper */
+ if (high) {
+ aofs += oprsz / 2; /* point the helper at the upper half */
+ }
+ tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, 0,
+ is_sign ? exts_fn[vece] : extu_fn[vece]);
+ return;
+ }
+ /* NOTE(review): only opc was probed, yet n > 1 emits both l and h ops. */
+ if (n == 1) { /* one vector holds both halves of the input */
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+
+ tcg_gen_ld_vec(t1, cpu_env, aofs);
+ if (high) {
+ if (is_sign) {
+ tcg_gen_extsh_vec(vece, t1, t1);
+ } else {
+ tcg_gen_extuh_vec(vece, t1, t1);
+ }
+ } else {
+ if (is_sign) {
+ tcg_gen_extsl_vec(vece, t1, t1);
+ } else {
+ tcg_gen_extul_vec(vece, t1, t1);
+ }
+ }
+ tcg_gen_st_vec(t1, cpu_env, dofs);
+ tcg_temp_free_vec(t1);
+ } else {
+ TCGv_vec ta[4], tmp; /* n <= 8, so at most 4 input vectors */
+
+ if (high) {
+ aofs += oprsz / 2; /* inputs are the upper oprsz / 2 bytes */
+ }
+ /* NOTE(review): high with odd n loads step / 2 bytes past oprsz. */
+ for (i = 0; i < (n / 2 + n % 2); ++i) { /* ceil(n / 2) vectors */
+ ta[i] = tcg_temp_new_vec(type);
+ tcg_gen_ld_vec(ta[i], cpu_env, aofs + i * step);
+ }
+
+ tmp = tcg_temp_new_vec(type);
+ for (i = 0; i < n; ++i) {
+ if (i & 1) { /* odd output vector: high half of ta[i / 2] */
+ if (is_sign) {
+ tcg_gen_extsh_vec(vece, tmp, ta[i / 2]);
+ } else {
+ tcg_gen_extuh_vec(vece, tmp, ta[i / 2]);
+ }
+ } else { /* even output vector: low half of ta[i / 2] */
+ if (is_sign) {
+ tcg_gen_extsl_vec(vece, tmp, ta[i / 2]);
+ } else {
+ tcg_gen_extul_vec(vece, tmp, ta[i / 2]);
+ }
+ }
+ tcg_gen_st_vec(tmp, cpu_env, dofs + i * step);
+ }
+ tcg_temp_free_vec(tmp);
+
+ for (i = 0; i < (n / 2 + n % 2); ++i) {
+ tcg_temp_free_vec(ta[i]);
+ }
+ }
+ if (oprsz < maxsz) { /* expand_clr covers [oprsz, maxsz) of dofs */
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+void tcg_gen_gvec_extul(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ do_ext(vece, dofs, aofs, oprsz, maxsz, false, false); /* low, unsigned */
+}
+
+void tcg_gen_gvec_extuh(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ do_ext(vece, dofs, aofs, oprsz, maxsz, true, false); /* high, unsigned */
+}
+
+void tcg_gen_gvec_extsl(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ do_ext(vece, dofs, aofs, oprsz, maxsz, false, true); /* low, signed */
+}
+
+void tcg_gen_gvec_extsh(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ do_ext(vece, dofs, aofs, oprsz, maxsz, true, true); /* high, signed */
+}
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index 9038cc6c84..a73d094ddb 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -525,3 +525,42 @@ void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi);
}
}
+
+static void do_ext(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
+{ /* Emit the one-input vector op opc, computing r from a. */
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGType type = rt->base_type;
+ int can;
+
+ tcg_debug_assert(at->base_type == type); /* same vector type for both */
+ can = tcg_can_emit_vec_op(opc, type, vece);
+ if (can > 0) { /* positive: emit opc as is */
+ vec_gen_2(opc, type, vece, ri, ai);
+ } else { /* negative: go through tcg_expand_vec_op */
+ tcg_debug_assert(can < 0); /* zero is not allowed to reach here */
+ tcg_expand_vec_op(opc, type, vece, ri, ai);
+ }
+}
+
+void tcg_gen_extul_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ do_ext(INDEX_op_extul_vec, vece, r, a); /* low half, unsigned */
+}
+
+void tcg_gen_extuh_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ do_ext(INDEX_op_extuh_vec, vece, r, a); /* high half, unsigned */
+}
+
+void tcg_gen_extsl_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ do_ext(INDEX_op_extsl_vec, vece, r, a); /* low half, signed */
+}
+
+void tcg_gen_extsh_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ do_ext(INDEX_op_extsh_vec, vece, r, a); /* high half, signed */
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 5608391dca..8c0ee0a9db 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1427,6 +1427,12 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_trne_vec:
case INDEX_op_trno_vec:
return have_vec && TCG_TARGET_HAS_trn_vec;
+ case INDEX_op_extul_vec:
+ case INDEX_op_extsl_vec:
+ return have_vec && TCG_TARGET_HAS_extl_vec;
+ case INDEX_op_extuh_vec:
+ case INDEX_op_extsh_vec:
+ return have_vec && TCG_TARGET_HAS_exth_vec;
default:
tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
diff --git a/tcg/README b/tcg/README
index 17695ff7f6..56c70764bc 100644
--- a/tcg/README
+++ b/tcg/README
@@ -634,6 +634,19 @@ E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.
v0[2i + 1] = v2[2i + part];
}
+* extul_vec v0, v1
+
+ Zero-extend the low VECL/VECE/2 elements of v1 to twice their width in v0.
+
+* extuh_vec v0, v1
+
+ Similarly for the high VECL/VECE/2 elements.
+
+* extsl_vec v0, v1
+* extsh_vec v0, v1
+
+ Similarly, but with sign extension.
+
* cmp_vec v0, v1, v2, cond
Compare vectors by element, storing -1 for true and 0 for false.
--
2.14.3
- [Qemu-devel] [PATCH v9 00/26] tcg: generic vector operations, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 01/26] tcg: Allow multiple word entries into the constant pool, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 03/26] tcg: Standardize integral arguments to expanders, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 02/26] tcg: Add types and basic operations for host vectors, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 05/26] tcg: Add generic vector ops for interleave, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 04/26] tcg: Add generic vector expanders, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 06/26] tcg: Add generic vector ops for constant shifts, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 08/26] tcg: Add generic vector ops for multiplication, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 09/26] tcg: Add generic vector ops for extension, Richard Henderson <=
- [Qemu-devel] [PATCH v9 07/26] tcg: Add generic vector ops for comparisons, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 10/26] tcg: Add generic helpers for saturating arithmetic, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 11/26] tcg: Loosen vec_gen_op* typecheck rules, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 13/26] tcg: Add generic vector helpers with a scalar variable operand, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 12/26] tcg: Add generic vector helpers with a scalar immediate operand, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 14/26] tcg/optimize: Handle vector opcodes during optimize, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 15/26] target/arm: Align vector registers, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 16/26] target/arm: Use vector infrastructure for aa64 add/sub/logic, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 17/26] target/arm: Use vector infrastructure for aa64 mov/not/neg, Richard Henderson, 2018/01/15
- [Qemu-devel] [PATCH v9 18/26] target/arm: Use vector infrastructure for aa64 dup/movi, Richard Henderson, 2018/01/15