[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PULL 25/45] target/arm: Promote consecutive memory ops for
From: |
Peter Maydell |
Subject: |
[Qemu-devel] [PULL 25/45] target/arm: Promote consecutive memory ops for aa64 |
Date: |
Fri, 19 Oct 2018 17:57:15 +0100 |
From: Richard Henderson <address@hidden>
For a sequence of loads or stores from a single register,
little-endian operations can be promoted to an 8-byte op.
This can reduce the number of operations by a factor of 8.
Signed-off-by: Richard Henderson <address@hidden>
Message-id: address@hidden
Reviewed-by: Peter Maydell <address@hidden>
Signed-off-by: Peter Maydell <address@hidden>
---
target/arm/translate-a64.c | 66 +++++++++++++++++++++++---------------
1 file changed, 40 insertions(+), 26 deletions(-)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 39ac45c0080..f1bd9d7633a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1200,25 +1200,23 @@ static void write_vec_element_i32(DisasContext *s,
TCGv_i32 tcg_src,
/* Store from vector register to memory */
static void do_vec_st(DisasContext *s, int srcidx, int element,
- TCGv_i64 tcg_addr, int size)
+ TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
- TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
read_vec_element(s, tcg_tmp, srcidx, element, size);
- tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+ tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
tcg_temp_free_i64(tcg_tmp);
}
/* Load from memory to vector register */
static void do_vec_ld(DisasContext *s, int destidx, int element,
- TCGv_i64 tcg_addr, int size)
+ TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
- TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+ tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
write_vec_element(s, tcg_tmp, destidx, element, size);
tcg_temp_free_i64(tcg_tmp);
@@ -3013,9 +3011,10 @@ static void disas_ldst_multiple_struct(DisasContext *s,
uint32_t insn)
bool is_postidx = extract32(insn, 23, 1);
bool is_q = extract32(insn, 30, 1);
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+ TCGMemOp endian = s->be_data;
- int ebytes = 1 << size;
- int elements = (is_q ? 128 : 64) / (8 << size);
+ int ebytes; /* bytes per element */
+ int elements; /* elements per vector */
int rpt; /* num iterations */
int selem; /* structure elements */
int r;
@@ -3074,6 +3073,20 @@ static void disas_ldst_multiple_struct(DisasContext *s,
uint32_t insn)
gen_check_sp_alignment(s);
}
+ /* For our purposes, bytes are always little-endian. */
+ if (size == 0) {
+ endian = MO_LE;
+ }
+
+ /* Consecutive little-endian elements from a single register
+ * can be promoted to a larger little-endian operation.
+ */
+ if (selem == 1 && endian == MO_LE) {
+ size = 3;
+ }
+ ebytes = 1 << size;
+ elements = (is_q ? 16 : 8) / ebytes;
+
tcg_rn = cpu_reg_sp(s, rn);
tcg_addr = tcg_temp_new_i64();
tcg_gen_mov_i64(tcg_addr, tcg_rn);
@@ -3082,32 +3095,33 @@ static void disas_ldst_multiple_struct(DisasContext *s,
uint32_t insn)
for (r = 0; r < rpt; r++) {
int e;
for (e = 0; e < elements; e++) {
- int tt = (rt + r) % 32;
int xs;
for (xs = 0; xs < selem; xs++) {
+ int tt = (rt + r + xs) % 32;
if (is_store) {
- do_vec_st(s, tt, e, tcg_addr, size);
+ do_vec_st(s, tt, e, tcg_addr, size, endian);
} else {
- do_vec_ld(s, tt, e, tcg_addr, size);
-
- /* For non-quad operations, setting a slice of the low
- * 64 bits of the register clears the high 64 bits (in
- * the ARM ARM pseudocode this is implicit in the fact
- * that 'rval' is a 64 bit wide variable).
- * For quad operations, we might still need to zero the
- * high bits of SVE. We optimize by noticing that we only
- * need to do this the first time we touch a register.
- */
- if (e == 0 && (r == 0 || xs == selem - 1)) {
- clear_vec_high(s, is_q, tt);
- }
+ do_vec_ld(s, tt, e, tcg_addr, size, endian);
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
- tt = (tt + 1) % 32;
}
}
}
+ if (!is_store) {
+ /* For non-quad operations, setting a slice of the low
+ * 64 bits of the register clears the high 64 bits (in
+ * the ARM ARM pseudocode this is implicit in the fact
+ * that 'rval' is a 64 bit wide variable).
+ * For quad operations, we might still need to zero the
+ * high bits of SVE.
+ */
+ for (r = 0; r < rpt * selem; r++) {
+ int tt = (rt + r) % 32;
+ clear_vec_high(s, is_q, tt);
+ }
+ }
+
if (is_postidx) {
int rm = extract32(insn, 16, 5);
if (rm == 31) {
@@ -3228,9 +3242,9 @@ static void disas_ldst_single_struct(DisasContext *s,
uint32_t insn)
} else {
/* Load/store one element per register */
if (is_load) {
- do_vec_ld(s, rt, index, tcg_addr, scale);
+ do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
} else {
- do_vec_st(s, rt, index, tcg_addr, scale);
+ do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
}
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
--
2.19.1
- [Qemu-devel] [PULL 37/45] target/arm: Use gvec for NEON_3R_VTST_VCEQ, NEON_3R_VCGT, NEON_3R_VCGE, (continued)
- [Qemu-devel] [PULL 37/45] target/arm: Use gvec for NEON_3R_VTST_VCEQ, NEON_3R_VCGT, NEON_3R_VCGE, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 34/45] target/arm: Use gvec for VSRA, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 33/45] target/arm: Use gvec for VSHR, VSHL, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 32/45] target/arm: Use gvec for NEON_3R_VMUL, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 31/45] target/arm: Use gvec for NEON_2RM_VMN, NEON_2RM_VNEG, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 30/45] target/arm: Use gvec for NEON_3R_VADD_VSUB insns, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 29/45] target/arm: Use gvec for NEON_3R_LOGIC insns, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 27/45] target/arm: Use gvec for NEON VDUP, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 28/45] target/arm: Use gvec for NEON VMOV, VMVN, VBIC & VORR (immediate), Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 26/45] target/arm: Mark some arrays const, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 25/45] target/arm: Promote consecutive memory ops for aa64,
Peter Maydell <=
- [Qemu-devel] [PULL 24/45] target/arm: Use tcg_gen_gvec_dup_i64 for LD[1-4]R, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 23/45] target/arm: Don't call tcg_clear_temp_count, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 19/45] target/arm: Get IL bit correct for v7 syndrome values, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 18/45] target/arm: New utility function to extract EC from syndrome, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 17/45] target/arm: Implement HCR.PTW, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 16/45] target/arm: Implement HCR.VI and VF, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 15/45] target/arm: ISR_EL1 bits track virtual interrupts if IMO/FMO set, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 14/45] target/arm: Implement HCR.DC, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 12/45] target/arm: Make switch_mode() file-local, Peter Maydell, 2018/10/19
- [Qemu-devel] [PULL 13/45] target/arm: Implement HCR.FB, Peter Maydell, 2018/10/19