qemu-devel

From: Peter Maydell
Subject: Re: [PATCH v4 5/5] tcg/sparc: Support unaligned access for user-only
Date: Fri, 4 Feb 2022 19:07:56 +0000

On Fri, 4 Feb 2022 at 08:16, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is kinda sorta the opposite of the other tcg hosts, where
> we get (normal) alignment checks for free with host SIGBUS and
> need to add code to support unaligned accesses.
>
> This inline code expansion is somewhat large, but it takes quite
> a few instructions to make a function call to a helper anyway.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/sparc/tcg-target.c.inc | 308 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 299 insertions(+), 9 deletions(-)
>
> diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
> index 82a9d88816..a0d206380c 100644
> --- a/tcg/sparc/tcg-target.c.inc
> +++ b/tcg/sparc/tcg-target.c.inc
> @@ -211,6 +211,7 @@ static const int tcg_target_call_oarg_regs[] = {
>  #define ARITH_ADD  (INSN_OP(2) | INSN_OP3(0x00))
>  #define ARITH_ADDCC (INSN_OP(2) | INSN_OP3(0x10))
>  #define ARITH_AND  (INSN_OP(2) | INSN_OP3(0x01))
> +#define ARITH_ANDCC (INSN_OP(2) | INSN_OP3(0x11))
>  #define ARITH_ANDN (INSN_OP(2) | INSN_OP3(0x05))
>  #define ARITH_OR   (INSN_OP(2) | INSN_OP3(0x02))
>  #define ARITH_ORCC (INSN_OP(2) | INSN_OP3(0x12))
> @@ -997,7 +998,7 @@ static void build_trampolines(TCGContext *s)
>              /* Skip the oi argument.  */
>              ra += 1;
>          }
> -
> +
>          /* Set the retaddr operand.  */
>          if (ra >= TCG_REG_O6) {
>              tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_O7, TCG_REG_CALL_STACK,

Stray whitespace change.

> @@ -1012,6 +1013,38 @@ static void build_trampolines(TCGContext *s)
>          tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
>      }
>  }
> +#else
> +static const tcg_insn_unit *qemu_unalign_ld_trampoline;
> +static const tcg_insn_unit *qemu_unalign_st_trampoline;
> +
> +static void build_trampolines(TCGContext *s)
> +{
> +    for (int ld = 0; ld < 2; ++ld) {
> +        void *helper;
> +
> +        while ((uintptr_t)s->code_ptr & 15) {
> +            tcg_out_nop(s);
> +        }
> +
> +        if (ld) {
> +            helper = helper_unaligned_ld;
> +            qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr);
> +        } else {
> +            helper = helper_unaligned_st;
> +            qemu_unalign_st_trampoline = tcg_splitwx_to_rx(s->code_ptr);
> +        }
> +
> +        if (!SPARC64 && TARGET_LONG_BITS == 64) {
> +            /* Install the high part of the address.  */
> +            tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O2, 32, SHIFT_SRLX);
> +        }
> +
> +        /* Tail call.  */
> +        tcg_out_jmpl_const(s, helper, true, true);
> +        /* delay slot -- set the env argument */
> +        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
> +    }
> +}
>  #endif
>
>  /* Generate global QEMU prologue and epilogue code */
> @@ -1062,9 +1095,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>      /* delay slot */
>      tcg_out_movi_imm13(s, TCG_REG_O0, 0);
>
> -#ifdef CONFIG_SOFTMMU
>      build_trampolines(s);
> -#endif
>  }
>
>  static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
> @@ -1149,18 +1180,22 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
>  static const int qemu_ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = {
>      [MO_UB]   = LDUB,
>      [MO_SB]   = LDSB,
> +    [MO_UB | MO_LE] = LDUB,
> +    [MO_SB | MO_LE] = LDSB,
>
>      [MO_BEUW] = LDUH,
>      [MO_BESW] = LDSH,
>      [MO_BEUL] = LDUW,
>      [MO_BESL] = LDSW,
>      [MO_BEUQ] = LDX,
> +    [MO_BESQ] = LDX,
>
>      [MO_LEUW] = LDUH_LE,
>      [MO_LESW] = LDSH_LE,
>      [MO_LEUL] = LDUW_LE,
>      [MO_LESL] = LDSW_LE,
>      [MO_LEUQ] = LDX_LE,
> +    [MO_LESQ] = LDX_LE,
>  };
>
>  static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = {
> @@ -1179,11 +1214,12 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
>                              MemOpIdx oi, bool is_64)
>  {
>      MemOp memop = get_memop(oi);
> +    tcg_insn_unit *label_ptr;
> +
>  #ifdef CONFIG_SOFTMMU
>      unsigned memi = get_mmuidx(oi);
>      TCGReg addrz, param;
>      const tcg_insn_unit *func;
> -    tcg_insn_unit *label_ptr;
>
>      addrz = tcg_out_tlb_load(s, addr, memi, memop,
>                               offsetof(CPUTLBEntry, addr_read));
> @@ -1247,13 +1283,190 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
>
>      *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
>  #else
> +    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
> +    unsigned a_bits = get_alignment_bits(memop);
> +    unsigned s_bits = memop & MO_SIZE;
> +    unsigned t_bits;
> +    TCGReg orig_addr = addr;
> +
>      if (SPARC64 && TARGET_LONG_BITS == 32) {
>          tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
>          addr = TCG_REG_T1;
>      }
> -    tcg_out_ldst_rr(s, data, addr,
> -                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
> +
> +    /*
> +     * Normal case: alignment equal to access size.
> +     */
> +    if (a_bits == s_bits) {
> +        tcg_out_ldst_rr(s, data, addr, index,
> +                        qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
> +        return;
> +    }
> +
> +    /*
> +     * Test for at least natural alignment, and assume most accesses
> +     * will be aligned -- perform a straight load in the delay slot.
> +     * This is required to preserve atomicity for aligned accesses.
> +     */
> +    t_bits = MAX(a_bits, s_bits);
> +    tcg_debug_assert(t_bits < 13);
> +    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
> +
> +    /* beq,a,pt %icc, label */
> +    label_ptr = s->code_ptr;
> +    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
> +    /* delay slot */
> +    tcg_out_ldst_rr(s, data, addr, index,
>                      qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
> +
> +    /*
> +     * Overalignment: When we're asking for really large alignment,
> +     * the actual access is always done above and all we need to do
> +     * here is invoke the handler for SIGBUS.
> +     */

I thought the access was in an annulled delay slot, and so won't
be "done above"?

> +    if (a_bits >= s_bits) {
> +        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
> +        tcg_out_call_nodelay(s, qemu_unalign_ld_trampoline, false);
> +        /* delay slot -- move to low part of argument reg */
> +        tcg_out_mov_delay(s, arg_low, addr);
> +        goto done;
> +    }
> +
> +    /*
> +     * Underalignment: use multiple loads to perform the operation.
> +     *
> +     * Force full address into T1 early; avoids problems with
> +     * overlap between @addr and @data.
> +     *
> +     * Note that the little-endian instructions use the immediate
> +     * asi form, and must use tcg_out_ldst_rr.
> +     */
> +    tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
> +
> +    switch ((unsigned)memop) {
> +    case MO_BEUW | MO_UNALN:
> +    case MO_BESW | MO_UNALN:
> +    case MO_BEUL | MO_ALIGN_2:
> +    case MO_BESL | MO_ALIGN_2:
> +    case MO_BEUQ | MO_ALIGN_4:
> +        /* Two loads: shift and combine. */
> +        tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, 0,
> +                        qemu_ld_opc[a_bits | MO_BE | (memop & MO_SIGN)]);
> +        tcg_out_ldst(s, data, TCG_REG_T1, 1 << a_bits,
> +                        qemu_ld_opc[a_bits | MO_BE]);
> +        tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, 8 << a_bits, SHIFT_SLLX);

Why are we calculating the offset in memory of the second half of
the data and the amount to shift it by using the alignment-bits
rather than the size-bits? Because of the specific cases handled
here we know that a_bits == s_bits - 1, but I think it would be
clearer to work in terms of the size.
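
Something like this (completely untested) is what I have in mind,
working from the size instead:

        /*
         * Two loads of half the access size: shift and combine.
         * For these cases a_bits == s_bits - 1, so both halves
         * are sufficiently aligned.
         */
        tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, 0,
                     qemu_ld_opc[(s_bits - 1) | MO_BE | (memop & MO_SIGN)]);
        tcg_out_ldst(s, data, TCG_REG_T1, 1 << (s_bits - 1),
                     qemu_ld_opc[(s_bits - 1) | MO_BE]);
        tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, 8 << (s_bits - 1), SHIFT_SLLX);
        tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);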

> +        tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
> +        break;
> +
> +    case MO_LEUW | MO_UNALN:
> +    case MO_LESW | MO_UNALN:
> +    case MO_LEUL | MO_ALIGN_2:
> +    case MO_LESL | MO_ALIGN_2:
> +    case MO_LEUQ | MO_ALIGN_4:
> +        /* Similarly, with shifts adjusted for little-endian. */
> +        tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0,
> +                        qemu_ld_opc[a_bits | MO_LE]);
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 1 << a_bits, ARITH_ADD);
> +        tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0,
> +                        qemu_ld_opc[a_bits | MO_LE | (memop & MO_SIGN)]);
> +        tcg_out_arithi(s, data, data, 8 << a_bits, SHIFT_SLLX);
> +        tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
> +        break;
> +
> +    case MO_BEUL | MO_UNALN:
> +    case MO_BESL | MO_UNALN:
> +        /*
> +         * Naively, this would require 4 loads, 3 shifts, 3 ors.
> +         * Use two 32-bit aligned loads, combine, and extract.
> +         */

Accessing off the end of a buffer just because we know it can't
segfault (it's not going to cross a page boundary) will make
Valgrind unhappy :-)

At a minimum, the comment should mention that this loads data from
addresses the guest didn't ask for, but that we know it never
touches a page the guest's access wouldn't have touched.
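
Maybe something along these lines:

        /*
         * Naively, this would require 4 loads, 3 shifts, 3 ors.
         * Instead, round the address down and use two aligned 32-bit
         * loads that bracket it, then combine and extract.  This reads
         * bytes the guest did not ask for, but never from a page that
         * the original access would not have touched.
         */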

> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 3, ARITH_ANDN);
> +        tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, 0, LDUW);

Doesn't this give the wrong fault-address value to the guest
(ie not the address it used for the load, but a rounded-down one)
if we take a SIGSEGV? Or do we fix that up elsewhere?

> +        tcg_out_ldst(s, TCG_REG_T1, TCG_REG_T1, 4, LDUW);
> +        tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, 32, SHIFT_SLLX);
> +        tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T2, ARITH_OR);
> +        tcg_out_arithi(s, TCG_REG_T2, orig_addr, 3, ARITH_AND);
> +        tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, 3, SHIFT_SLL);
> +        tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T2, SHIFT_SLLX);
> +        tcg_out_arithi(s, data, TCG_REG_T1, 32,
> +                       memop & MO_SIGN ? SHIFT_SRAX : SHIFT_SRLX);
> +        break;
> +
> +    case MO_LEUL | MO_UNALN:
> +    case MO_LESL | MO_UNALN:
> +        /* Similarly, with shifts adjusted for little-endian. */
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 3, ARITH_ANDN);
> +        tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, LDUW_LE);
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 4, ARITH_ADD);
> +        tcg_out_ldst_rr(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_G0, LDUW_LE);
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 32, SHIFT_SLLX);
> +        tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T2, ARITH_OR);
> +        tcg_out_arithi(s, TCG_REG_T2, orig_addr, 3, ARITH_AND);
> +        tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, 3, SHIFT_SLL);
> +        tcg_out_arith(s, data, TCG_REG_T1, TCG_REG_T2, SHIFT_SRLX);
> +        if (is_64) {
> +            tcg_out_arithi(s, data, data, 0,
> +                           memop & MO_SIGN ? SHIFT_SRA : SHIFT_SRL);
> +        }
> +        break;
> +
> +    case MO_BEUQ | MO_UNALN:
> +        /* Similarly for 64-bit. */
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 7, ARITH_ANDN);
> +        tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, 0, LDX);
> +        tcg_out_ldst(s, TCG_REG_T1, TCG_REG_T1, 8, LDX);
> +        tcg_out_arithi(s, data, orig_addr, 7, ARITH_AND);
> +        tcg_out_arithi(s, data, data, 3, SHIFT_SLL);
> +        tcg_out_arith(s, TCG_REG_T2, TCG_REG_T2, data, SHIFT_SLLX);
> +        tcg_out_arithi(s, data, data, 64, ARITH_SUB);
> +        tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, data, SHIFT_SRLX);
> +        tcg_out_arith(s, data, TCG_REG_T1, TCG_REG_T2, ARITH_OR);
> +        break;
> +
> +    case MO_LEUQ | MO_UNALN:
> +        /* Similarly for little-endian. */
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 7, ARITH_ANDN);
> +        tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, LDX_LE);
> +        tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 8, ARITH_ADD);
> +        tcg_out_ldst_rr(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_G0, LDX_LE);
> +        tcg_out_arithi(s, data, orig_addr, 7, ARITH_AND);
> +        tcg_out_arithi(s, data, data, 3, SHIFT_SLL);
> +        tcg_out_arith(s, TCG_REG_T2, TCG_REG_T2, data, SHIFT_SRLX);
> +        tcg_out_arithi(s, data, data, 64, ARITH_SUB);
> +        tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, data, SHIFT_SLLX);
> +        tcg_out_arith(s, data, TCG_REG_T1, TCG_REG_T2, ARITH_OR);
> +        break;
> +
> +    case MO_BEUQ | MO_ALIGN_2:
> +        /*
> +         * An extra test to verify alignment 2 is 5 insns, which
> +         * is more than we would save by using the slightly smaller
> +         * unaligned sequence above.
> +         */
> +        tcg_out_ldst(s, data, TCG_REG_T1, 0, LDUH);
> +        for (int i = 2; i < 8; i += 2) {
> +            tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, LDUW);

Isn't this loading 2 + 3 * 4 == 14 bytes?
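
I'm assuming LDUH was intended for the loads inside the loop, i.e.
something like (untested):

        tcg_out_ldst(s, data, TCG_REG_T1, 0, LDUH);
        for (int i = 2; i < 8; i += 2) {
            tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, LDUH);
            tcg_out_arithi(s, data, data, 16, SHIFT_SLLX);
            tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
        }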

> +            tcg_out_arithi(s, data, data, 16, SHIFT_SLLX);
> +            tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
> +        }
> +        break;
> +
> +    case MO_LEUQ | MO_ALIGN_2:
> +        /* Similarly for little-endian. */
> +        tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, LDUH_LE);
> +        for (int i = 2; i < 8; i += 2) {
> +            tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, 2, ARITH_ADD);
> +            tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, LDUW_LE);

Ditto.

> +            tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
> +            tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
> +        }
> +        break;
> +
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> + done:
> +    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
>  #endif /* CONFIG_SOFTMMU */
>  }
>
> @@ -1261,11 +1474,12 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
>                              MemOpIdx oi)
>  {
>      MemOp memop = get_memop(oi);
> +    tcg_insn_unit *label_ptr;
> +
>  #ifdef CONFIG_SOFTMMU
>      unsigned memi = get_mmuidx(oi);
>      TCGReg addrz, param;
>      const tcg_insn_unit *func;
> -    tcg_insn_unit *label_ptr;
>
>      addrz = tcg_out_tlb_load(s, addr, memi, memop,
>                               offsetof(CPUTLBEntry, addr_write));
> @@ -1302,13 +1516,89 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
>
>      *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
>  #else
> +    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
> +    unsigned a_bits = get_alignment_bits(memop);
> +    unsigned s_bits = memop & MO_SIZE;
> +    unsigned t_bits;
> +
>      if (SPARC64 && TARGET_LONG_BITS == 32) {
>          tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
>          addr = TCG_REG_T1;
>      }
> -    tcg_out_ldst_rr(s, data, addr,
> -                    (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0),
> +
> +    /*
> +     * Normal case: alignment equal to access size.
> +     */
> +    if (a_bits == s_bits) {
> +        tcg_out_ldst_rr(s, data, addr, index,
> +                        qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
> +        return;
> +    }
> +
> +    /*
> +     * Test for at least natural alignment, and assume most accesses
> +     * will be aligned -- perform a straight store in the delay slot.
> +     * This is required to preserve atomicity for aligned accesses.
> +     */
> +    t_bits = MAX(a_bits, s_bits);
> +    tcg_debug_assert(t_bits < 13);
> +    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);
> +
> +    /* beq,a,pt %icc, label */
> +    label_ptr = s->code_ptr;
> +    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
> +    /* delay slot */
> +    tcg_out_ldst_rr(s, data, addr, index,
>                      qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
> +
> +    if (a_bits >= s_bits) {
> +        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
> +        /* Overalignment: only need to call helper for SIGBUS. */
> +        tcg_out_call_nodelay(s, qemu_unalign_st_trampoline, false);
> +        /* delay slot -- move to low part of argument reg */
> +        tcg_out_mov_delay(s, arg_low, addr);
> +    } else {
> +        /* Underalignment: store by pieces of minimum alignment. */
> +        int st_opc, a_size, s_size, i;
> +
> +        /*
> +         * Force full address into T1 early; avoids problems with
> +         * overlap between @addr and @data.
> +         */
> +        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);
> +
> +        a_size = 1 << a_bits;
> +        s_size = 1 << (memop & MO_SIZE);

Isn't this "1 << s_bits" ?

> +        if ((memop & MO_BSWAP) == MO_BE) {
> +            st_opc = qemu_st_opc[a_bits + MO_BE];
> +            for (i = 0; i < s_size; i += a_size) {
> +                TCGReg d = data;
> +                int shift = (s_size - a_size - i) * 8;
> +                if (shift) {
> +                    d = TCG_REG_T2;
> +                    tcg_out_arithi(s, d, data, shift, SHIFT_SRLX);
> +                }
> +                tcg_out_ldst(s, d, TCG_REG_T1, i, st_opc);
> +            }
> +        } else if (a_bits == 0) {
> +            tcg_out_ldst(s, data, TCG_REG_T1, 0, STB);
> +            for (i = 1; i < s_size; i++) {
> +                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
> +                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, STB);
> +            }
> +        } else {
> +            /* Note that ST*A with immediate asi must use indexed address. */
> +            st_opc = qemu_st_opc[a_bits + MO_LE];
> +            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, st_opc);
> +            for (i = a_size; i < s_size; i += a_size) {
> +                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
> +                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
> +                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, st_opc);
> +            }
> +        }
> +    }
> +
> +    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
>  #endif /* CONFIG_SOFTMMU */

thanks
-- PMM


