From: Alex Bennée
Subject: Re: [PATCH v2 16/22] tcg/aarch64: Reorg goto_tb implementation
Date: Tue, 17 Jan 2023 18:26:29 +0000
User-agent: mu4e 1.9.15; emacs 29.0.60

Richard Henderson <richard.henderson@linaro.org> writes:

> The old implementation replaces two insns, swapping between
>
>       b       <dest>
>       nop
>       br      x30
> and
>       adrp    x30, <dest>
>       addi    x30, x30, lo12:<dest>
>       br      x30
>
> There is a race condition in which a thread could be stopped at
> the PC of the second insn, and when restarted does not see the
> complete address computation and branches to nowhere.
>
> The new implementation replaces only one insn, swapping between
>
>       b       <dest>
>       br      tmp
> and
>       ldr     tmp, <jmp_addr>
>       br      tmp
>
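For reference, if I'm reading the rest of the patch right, the patching
side then only needs a 4-byte atomic store instead of the old 8-byte
pair (a sketch, following the qatomic_set/flush_idcache_range pattern
from the code being removed below):

    /* Retarget the jump by rewriting exactly one insn: a thread
     * stopped at this PC sees either the old or the new insn in
     * full, never a torn address computation. */
    qatomic_set((uint32_t *)jmp_rw, insn);
    flush_idcache_range(jmp_rx, jmp_rw, 4);

With only one insn there is no second insn to restart into, which
closes the window the commit message describes.
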
> Reported-by: hev <r@hev.cc>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/aarch64/tcg-target.h     |  3 +-
>  tcg/aarch64/tcg-target.c.inc | 64 +++++++++++++++---------------------
>  2 files changed, 28 insertions(+), 39 deletions(-)
>
> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
> index 6067446b03..0ba2298ea6 100644
> --- a/tcg/aarch64/tcg-target.h
> +++ b/tcg/aarch64/tcg-target.h
> @@ -15,7 +15,8 @@
>  
>  #define TCG_TARGET_INSN_UNIT_SIZE  4
>  #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
> -#define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
> +#define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
> +#undef TCG_TARGET_STACK_GROWSUP
>  
>  typedef enum {
>      TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
> diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
> index 0b65f2cac1..1d0ebf01a5 100644
> --- a/tcg/aarch64/tcg-target.c.inc
> +++ b/tcg/aarch64/tcg-target.c.inc
> @@ -1353,33 +1353,6 @@ static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
>      tcg_out_call_int(s, target);
>  }
>  
> -void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> -                              uintptr_t jmp_rx, uintptr_t jmp_rw)
> -{
> -    uintptr_t addr = tb->jmp_target_addr[n];
> -    tcg_insn_unit i1, i2;
> -    TCGType rt = TCG_TYPE_I64;
> -    TCGReg  rd = TCG_REG_TMP;
> -    uint64_t pair;
> -
> -    ptrdiff_t offset = addr - jmp_rx;
> -
> -    if (offset == sextract64(offset, 0, 26)) {
> -        i1 = I3206_B | ((offset >> 2) & 0x3ffffff);
> -        i2 = NOP;
> -    } else {
> -        offset = (addr >> 12) - (jmp_rx >> 12);
> -
> -        /* patch ADRP */
> -    i1 = I3406_ADRP | (offset & 3) << 29 | (offset & 0x1ffffc) << (5 - 2) | rd;
> -        /* patch ADDI */
> -        i2 = I3401_ADDI | rt << 31 | (addr & 0xfff) << 10 | rd << 5 | rd;
> -    }
> -    pair = (uint64_t)i2 << 32 | i1;
> -    qatomic_set((uint64_t *)jmp_rw, pair);
> -    flush_idcache_range(jmp_rx, jmp_rw, 8);
> -}
> -
>  static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
>  {
>      if (!l->has_value) {
> @@ -1902,23 +1875,38 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
>  static void tcg_out_goto_tb(TCGContext *s, int which)
>  {
>      /*
> -     * Ensure that ADRP+ADD are 8-byte aligned so that an atomic
> -     * write can be used to patch the target address.
> +     * Direct branch, or indirect address load, will be patched
> +     * by tb_target_set_jmp_target.  Assert indirect load offset
> +     * in range early, regardless of direct branch distance.
>       */
> -    if ((uintptr_t)s->code_ptr & 7) {
> -        tcg_out32(s, NOP);
> -    }
> +    intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
> +    tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
> +
>      set_jmp_insn_offset(s, which);
> -    /*
> -     * actual branch destination will be patched by
> -     * tb_target_set_jmp_target later
> -     */
> -    tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
> -    tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
> +    tcg_out32(s, I3206_B);
>      tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
>      set_jmp_reset_offset(s, which);
>  }
>  
> +void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> +                              uintptr_t jmp_rx, uintptr_t jmp_rw)
> +{
> +    uintptr_t d_addr = tb->jmp_target_addr[n];
> +    uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
> +    ptrdiff_t d_offset = d_addr - jmp_rx;
> +    ptrdiff_t i_offset = i_addr - jmp_rx;
> +    tcg_insn_unit insn;
> +
> +    /* Either directly branch, or indirect branch load. */
> +    if (d_offset == sextract64(d_offset, 0, 26)) {
> +        insn = I3206_B | ((d_offset >> 2) & 0x3ffffff);
> +    } else {
> +        insn = I3305_LDR | TCG_REG_TMP | (((i_offset >> 2) & 0x7ffff) << 5);
> +    }

Could we use the deposit32() helpers to build the instructions here?
Also, the mask doesn't match the 24 bits you have left; is that a bug
carried over from the old code?
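Something like this is what I have in mind (an untested sketch,
assuming the deposit32() helper from include/qemu/bitops.h):

    /* B: imm26 at bits [25:0], encoding the offset in 4-byte words. */
    insn = deposit32(I3206_B, 0, 26, d_offset >> 2);

    /* LDR (literal): Rt at bits [4:0], imm19 at bits [23:5]. */
    insn = deposit32(I3305_LDR | TCG_REG_TMP, 5, 19, i_offset >> 2);

which at least keeps the field widths next to the encodings instead of
hand-written masks.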

Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro


