Re: [Qemu-devel] [PATCH 04/13] tcg/aarch64: enable dynamic TLB sizing
From: Alex Bennée
Subject: Re: [Qemu-devel] [PATCH 04/13] tcg/aarch64: enable dynamic TLB sizing
Date: Fri, 25 Jan 2019 19:12:12 +0000
User-agent: mu4e 1.1.0; emacs 26.1.91
Richard Henderson <address@hidden> writes:
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> tcg/aarch64/tcg-target.h | 2 +-
> tcg/aarch64/tcg-target.inc.c | 100 +++++++++++++++++++++--------------
> 2 files changed, 60 insertions(+), 42 deletions(-)
>
> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
> index 68868a27eb..5085a81060 100644
> --- a/tcg/aarch64/tcg-target.h
> +++ b/tcg/aarch64/tcg-target.h
> @@ -15,7 +15,7 @@
>
> #define TCG_TARGET_INSN_UNIT_SIZE 4
> #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
> -#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
> +#define TCG_TARGET_IMPLEMENTS_DYN_TLB 1
> #undef TCG_TARGET_STACK_GROWSUP
>
> typedef enum {
> diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
> index ee0d5819af..d57f9e500f 100644
> --- a/tcg/aarch64/tcg-target.inc.c
> +++ b/tcg/aarch64/tcg-target.inc.c
> @@ -498,6 +498,9 @@ typedef enum {
> I3510_EON = 0x4a200000,
> I3510_ANDS = 0x6a000000,
>
> + /* Logical shifted register instructions (with a shift). */
> + I3502S_AND_LSR = I3510_AND | (1 << 22),
> +
> /* AdvSIMD copy */
> I3605_DUP = 0x0e000400,
> I3605_INS = 0x4e001c00,
> @@ -1448,6 +1451,14 @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
> label->label_ptr[0] = label_ptr;
> }
>
> +/* We expect tlb_mask to be before tlb_table. */
> +QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table) <
> + offsetof(CPUArchState, tlb_mask));
> +
> +/* We expect to use a 24-bit unsigned offset from ENV. */
> +QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1])
> + > 0xffffff);
> +
> /* Load and compare a TLB entry, emitting the conditional jump to the
> slow path for the failure case, which will be patched later when finalizing
> the slow path. Generated code returns the host addend in X1,
> @@ -1456,15 +1467,55 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
> tcg_insn_unit **label_ptr, int mem_index,
> bool is_read)
> {
> - int tlb_offset = is_read ?
> - offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
> - : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
> + int mask_ofs = offsetof(CPUArchState, tlb_mask[mem_index]);
> + int table_ofs = offsetof(CPUArchState, tlb_table[mem_index]);
> unsigned a_bits = get_alignment_bits(opc);
> unsigned s_bits = opc & MO_SIZE;
> unsigned a_mask = (1u << a_bits) - 1;
> unsigned s_mask = (1u << s_bits) - 1;
> - TCGReg base = TCG_AREG0, x3;
> - uint64_t tlb_mask;
> + TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0, x3;
> + TCGType mask_type;
> + uint64_t compare_mask;
> +
> + if (table_ofs > 0xfff) {
> + int table_hi = table_ofs & ~0xfff;
> + int mask_hi = mask_ofs & ~0xfff;
Isn't there a #define for this number here?
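(For context, and this is my reading rather than anything spelled out in the
patch: the 0xfff looks like the 12-bit unsigned immediate of the A64 ADDI
instruction. Given the build assert above bounding the offset to 0xffffff,
the split is roughly

    int hi = table_ofs & ~0xfff;  /* multiple of 0x1000, fits ADDI imm12, LSL #12 */
    int lo = table_ofs - hi;      /* <= 0xfff, left in the load's immediate offset */

so a #define here would mostly just be naming ADDI's imm12 field width.)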
> +
> + table_base = TCG_REG_X1;
> + if (mask_hi == table_hi) {
> + mask_base = table_base;
> + } else if (mask_hi) {
> + mask_base = TCG_REG_X0;
> + tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64,
> + mask_base, TCG_AREG0, mask_hi);
> + }
> + tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64,
> + table_base, TCG_AREG0, table_hi);
> + mask_ofs -= mask_hi;
> + table_ofs -= table_hi;
> + }
> +
> + mask_type = (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32
> + ? TCG_TYPE_I64 : TCG_TYPE_I32);
> +
> + /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */
> + tcg_out_ld(s, mask_type, TCG_REG_X0, mask_base, mask_ofs);
> + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, table_base, table_ofs);
> +
> + /* Extract the TLB index from the address into X0. */
> + tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
> + TCG_REG_X0, TCG_REG_X0, addr_reg,
> + TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
> +
> + /* Add the tlb_table pointer, creating the CPUTLBEntry address into X1. */
> + tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0);
> +
> + /* Load the tlb comparator into X0, and the fast path addend into X1. */
> + tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_X0, TCG_REG_X1, is_read
> + ? offsetof(CPUTLBEntry, addr_read)
> + : offsetof(CPUTLBEntry, addr_write));
> + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1,
> + offsetof(CPUTLBEntry, addend));
>
> /* For aligned accesses, we check the first byte and include the alignment
> bits within the address. For unaligned access, we check that we don't
> @@ -1476,47 +1527,14 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
> TCG_REG_X3, addr_reg, s_mask - a_mask);
> x3 = TCG_REG_X3;
> }
> - tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
> -
> - /* Extract the TLB index from the address into X0.
> - X0<CPU_TLB_BITS:0> =
> - addr_reg<TARGET_PAGE_BITS+CPU_TLB_BITS:TARGET_PAGE_BITS> */
> - tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
> - TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
> + compare_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
>
> /* Store the page mask part of the address into X3. */
> tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
> - TCG_REG_X3, x3, tlb_mask);
> -
> - /* Add any "high bits" from the tlb offset to the env address into X2,
> - to take advantage of the LSL12 form of the ADDI instruction.
> - X2 = env + (tlb_offset & 0xfff000) */
> - if (tlb_offset & 0xfff000) {
> - tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_X2, base,
> - tlb_offset & 0xfff000);
> - base = TCG_REG_X2;
> - }
> -
> - /* Merge the tlb index contribution into X2.
> - X2 = X2 + (X0 << CPU_TLB_ENTRY_BITS) */
> - tcg_out_insn(s, 3502S, ADD_LSL, TCG_TYPE_I64, TCG_REG_X2, base,
> - TCG_REG_X0, CPU_TLB_ENTRY_BITS);
> -
> - /* Merge "low bits" from tlb offset, load the tlb comparator into X0.
> - X0 = load [X2 + (tlb_offset & 0x000fff)] */
> - tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
> - TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
> - TARGET_LONG_BITS == 32 ? 2 : 3);
> -
> - /* Load the tlb addend. Do that early to avoid stalling.
> - X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
> - tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
> - (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
> - (is_read ? offsetof(CPUTLBEntry, addr_read)
> - : offsetof(CPUTLBEntry, addr_write)), 3);
> + TCG_REG_X3, x3, compare_mask);
>
> /* Perform the address comparison. */
> - tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
> + tcg_out_cmp(s, TARGET_LONG_BITS == 64, TCG_REG_X0, TCG_REG_X3, 0);
>
> /* If not equal, we jump to the slow path. */
> *label_ptr = s->code_ptr;
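
Just to double-check my own reading, the generated fast path now amounts to
roughly the C below (my paraphrase, not code from the patch; it shows the
aligned case, where the compared value is addr itself rather than
addr + (s_mask - a_mask)):

    /* Returns the matching CPUTLBEntry, or NULL to take the slow path. */
    static CPUTLBEntry *tlb_lookup_sketch(CPUArchState *env, target_ulong addr,
                                          int mem_index, bool is_read,
                                          unsigned a_mask)
    {
        uintptr_t mask = env->tlb_mask[mem_index];               /* -> X0 */
        uintptr_t table = (uintptr_t)env->tlb_table[mem_index];  /* -> X1 */

        /* AND_LSR: mask is (n_entries - 1) << CPU_TLB_ENTRY_BITS, so this is
           the byte offset of this address's CPUTLBEntry within the table.  */
        uintptr_t ofs = (addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)) & mask;
        CPUTLBEntry *entry = (CPUTLBEntry *)(table + ofs);       /* -> X1 */

        target_ulong cmp = is_read ? entry->addr_read            /* -> X0 */
                                   : entry->addr_write;

        /* compare_mask keeps the page bits plus any required alignment bits */
        if ((addr & ((target_ulong)TARGET_PAGE_MASK | a_mask)) != cmp) {
            return NULL;                     /* conditional jump to slow path */
        }
        return entry;    /* caller adds entry->addend (left in X1) to addr */
    }

i.e. the same lookup as before, just indexed through the per-mmu-idx mask and
table pointer instead of the fixed-size embedded table.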
Anyway:
Reviewed-by: Alex Bennée <address@hidden>
Tested-by: Alex Bennée <address@hidden>
(running a very slow MTTCG s390x on my SynQuacer)
--
Alex Bennée