Re: [Qemu-devel] [PATCH 2/4] tcg-hppa: Finish the port.


From: Aurelien Jarno
Subject: Re: [Qemu-devel] [PATCH 2/4] tcg-hppa: Finish the port.
Date: Thu, 8 Apr 2010 11:56:13 +0200
User-agent: Mutt/1.5.20 (2009-06-14)

On Wed, Apr 07, 2010 at 04:56:43AM -0700, Richard Henderson wrote:
> Delete inline functions from tcg-target.h that don't need to be there,
> move the others to tcg-target.c.  Add 'Z', 'I', 'J' constraints for
> 0, signed 11-bit, and signed 5-bit respectively.  Add GUEST_BASE support
> similar to ppc64, with the value stored in a register.  Add missing

Doing so actually doesn't work in a lot of cases. See below for more
details.

> registers to reg_alloc_order.  Add support for 12-bit branch relocations.
> Add functions for synthetic operations: addi, mtctl, dep, shd, vshd, ori,
> andi, shifts, rotates, multiply, branches, setcond.  Split out TLB reads
> from qemu_ld and qemu_st; fix argument loading for tlb external calls.
> Generate the prologue.

I have applied the patch. I have some comments though; it would be nice
if you could address them with additional patches.

> Signed-off-by: Richard Henderson <address@hidden>
> ---
>  configure             |    5 +-
>  tcg/hppa/tcg-target.c | 1758 ++++++++++++++++++++++++++++++++++---------------
>  tcg/hppa/tcg-target.h |  142 +----
>  3 files changed, 1258 insertions(+), 647 deletions(-)
> 
> diff --git a/configure b/configure
> index 1d5fb17..966cd7d 100755
> --- a/configure
> +++ b/configure
> @@ -722,6 +722,9 @@ case "$cpu" in
>      ia64*)
>             host_guest_base="yes"
>             ;;
> +    hppa*)
> +           host_guest_base="yes"
> +           ;;
>  esac
>  
>  [ -z "$guest_base" ] && guest_base="$host_guest_base"
> @@ -2744,7 +2747,7 @@ if test "$target_linux_user" = "yes" -o "$target_bsd_user" = "yes" ; then
>      # -static is used to avoid g1/g3 usage by the dynamic linker
>      ldflags="$linker_script -static $ldflags"
>      ;;
> -  i386|x86_64|ppc|ppc64|s390|sparc64|alpha|arm|m68k|mips|mips64|ia64)
> +  *)
>      ldflags="$linker_script $ldflags"
>      ;;
>    esac
> diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
> index f9ae898..4e15256 100644
> --- a/tcg/hppa/tcg-target.c
> +++ b/tcg/hppa/tcg-target.c
> @@ -24,41 +24,26 @@
>  
>  #ifndef NDEBUG
>  static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
> -    "%r0",
> -    "%r1",
> -    "%rp",
> -    "%r3",
> -    "%r4",
> -    "%r5",
> -    "%r6",
> -    "%r7",
> -    "%r8",
> -    "%r9",
> -    "%r10",
> -    "%r11",
> -    "%r12",
> -    "%r13",
> -    "%r14",
> -    "%r15",
> -    "%r16",
> -    "%r17",
> -    "%r18",
> -    "%r19",
> -    "%r20",
> -    "%r21",
> -    "%r22",
> -    "%r23",
> -    "%r24",
> -    "%r25",
> -    "%r26",
> -    "%dp",
> -    "%ret0",
> -    "%ret1",
> -    "%sp",
> -    "%r31",
> +    "%r0", "%r1", "%rp", "%r3", "%r4", "%r5", "%r6", "%r7",
> +    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
> +    "%r16", "%r17", "%r18", "%r19", "%r20", "%r21", "%r22", "%r23",
> +    "%r24", "%r25", "%r26", "%dp", "%ret0", "%ret1", "%sp", "%r31",
>  };
>  #endif
>  
> +/* This is an 8 byte temp slot in the stack frame.  */
> +#define STACK_TEMP_OFS -16
> +
> +#ifndef GUEST_BASE
> +#define GUEST_BASE 0
> +#endif
> +
> +#ifdef CONFIG_USE_GUEST_BASE
> +#define TCG_GUEST_BASE_REG TCG_REG_R16
> +#else
> +#define TCG_GUEST_BASE_REG TCG_REG_R0
> +#endif
> +
>  static const int tcg_target_reg_alloc_order[] = {
>      TCG_REG_R4,
>      TCG_REG_R5,
> @@ -75,6 +60,14 @@ static const int tcg_target_reg_alloc_order[] = {
>      TCG_REG_R14,
>      TCG_REG_R15,
>      TCG_REG_R16,
> +
> +    TCG_REG_R26,
> +    TCG_REG_R25,
> +    TCG_REG_R24,
> +    TCG_REG_R23,
> +
> +    TCG_REG_RET0,
> +    TCG_REG_RET1,
>  };
>  
>  static const int tcg_target_call_iarg_regs[4] = {
> @@ -89,16 +82,98 @@ static const int tcg_target_call_oarg_regs[2] = {
>      TCG_REG_RET1,
>  };
>  
> +/* True iff val fits a signed field of width BITS.  */
> +static inline int check_fit_tl(tcg_target_long val, unsigned int bits)
> +{
> +    return (val << ((sizeof(tcg_target_long) * 8 - bits))
> +            >> (sizeof(tcg_target_long) * 8 - bits)) == val;
> +}
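
For reference, the double shift sign-extends VAL from bit BITS-1 and
compares with the original, so this accepts exactly the signed BITS-bit
range. It relies on arithmetic right shift of signed values
(implementation-defined in C, but what GCC does). A standalone sanity
check of that reading, not part of the patch:

    #include <assert.h>

    static int fits_signed(long val, unsigned bits)
    {
        return (val << (sizeof(long) * 8 - bits)
                >> (sizeof(long) * 8 - bits)) == val;
    }

    int main(void)
    {
        assert(fits_signed(1023, 11) && fits_signed(-1024, 11));
        assert(!fits_signed(1024, 11) && !fits_signed(-1025, 11));
        assert(fits_signed(15, 5) && !fits_signed(16, 5));
        return 0;
    }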
> +
> +/* True iff depi can be used to compute (reg | MASK).
> +   Accept a bit pattern like:
> +      0....01....1
> +      1....10....0
> +      0..01..10..0
> +   Copied from gcc sources.  */
> +static inline int or_mask_p(tcg_target_ulong mask)
> +{
> +    mask += mask & -mask;
> +    return (mask & (mask - 1)) == 0;
> +}
> +
> +/* True iff depi or extru can be used to compute (reg & mask).
> +   Accept a bit pattern like these:
> +      0....01....1
> +      1....10....0
> +      1..10..01..1 
> +   Copied from gcc sources.  */
> +static inline int and_mask_p(tcg_target_ulong mask)
> +{
> +    return or_mask_p(~mask);
> +}
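
The trick here is neat: adding the lowest set bit to a value whose set
bits form one contiguous run makes the run carry away into a single bit
(or into nothing, if the run reaches bit 31), after which the usual
power-of-two test applies. A quick standalone check of my reading:

    #include <assert.h>

    static int or_ok(unsigned mask)
    {
        mask += mask & -mask;            /* carry a 0..01..1 run away */
        return (mask & (mask - 1)) == 0; /* zero or a single bit left */
    }

    int main(void)
    {
        assert(or_ok(0x000000ff));   /* 0....01....1 */
        assert(or_ok(0xfff00000));   /* 1....10....0 */
        assert(or_ok(0x00ffff00));   /* 0..01..10..0 */
        assert(!or_ok(0x00ff00f0));  /* two runs: rejected */
        return 0;
    }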
> +
> +static int low_sign_ext(int val, int len)
> +{
> +    return (((val << 1) & ~(-1u << len)) | ((val >> (len - 1)) & 1));
> +}
> +
> +static int reassemble_12(int as12)
> +{
> +    return (((as12 & 0x800) >> 11) |
> +            ((as12 & 0x400) >> 8) |
> +            ((as12 & 0x3ff) << 3));
> +}
> +
> +static int reassemble_17(int as17)
> +{
> +    return (((as17 & 0x10000) >> 16) |
> +            ((as17 & 0x0f800) << 5) |
> +            ((as17 & 0x00400) >> 8) |
> +            ((as17 & 0x003ff) << 3));
> +}
> +
> +static int reassemble_21(int as21)
> +{
> +    return (((as21 & 0x100000) >> 20) |
> +            ((as21 & 0x0ffe00) >> 8) |
> +            ((as21 & 0x000180) << 7) |
> +            ((as21 & 0x00007c) << 14) |
> +            ((as21 & 0x000003) << 12));
> +}
> +
> +/* ??? Bizarrely, there is no PCREL12F relocation type.  I guess all
> +   such relocations are simply fully handled by the assembler.  */
> +#define R_PARISC_PCREL12F  R_PARISC_NONE
> +
>  static void patch_reloc(uint8_t *code_ptr, int type,
>                          tcg_target_long value, tcg_target_long addend)
>  {
> +    uint32_t *insn_ptr = (uint32_t *)code_ptr;
> +    uint32_t insn = *insn_ptr;
> +    tcg_target_long pcrel;
> +
> +    value += addend;
> +    pcrel = (value - ((tcg_target_long)code_ptr + 8)) >> 2;
> +
>      switch (type) {
> +    case R_PARISC_PCREL12F:
> +        assert(check_fit_tl(pcrel, 12));
> +        /* ??? We assume all patches are forward.  See tcg_out_brcond
> +           re setting the NUL bit on the branch and eliding the nop.  */
> +        assert(pcrel >= 0);
> +        insn &= ~0x1ffdu;
> +        insn |= reassemble_12(pcrel);
> +        break;
>      case R_PARISC_PCREL17F:
> -        hppa_patch17f((uint32_t *)code_ptr, value, addend);
> +        assert(check_fit_tl(pcrel, 17));
> +        insn &= ~0x1f1ffdu;
> +        insn |= reassemble_17(pcrel);
>          break;
>      default:
>          tcg_abort();
>      }
> +
> +    *insn_ptr = insn;
>  }
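
For reference, the displacement math matches the architecture: PA-RISC
branch offsets are relative to the address of the branch plus 8, in
units of 4-byte words, hence the "+ 8" and ">> 2" above. Standalone:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uintptr_t branch = 0x1000, target = 0x1020;
        intptr_t pcrel = ((intptr_t)target - (intptr_t)(branch + 8)) >> 2;
        assert(pcrel == 6);    /* (0x1020 - 0x1008) / 4 */
        return 0;
    }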
>  
>  /* maximum number of register used for input function arguments */
> @@ -126,6 +201,15 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R24);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R23);
>          break;
> +    case 'Z':
> +        ct->ct |= TCG_CT_CONST_0;
> +        break;
> +    case 'I':
> +        ct->ct |= TCG_CT_CONST_S11;
> +        break;
> +    case 'J':
> +        ct->ct |= TCG_CT_CONST_S5;
> +        break;
>      default:
>          return -1;
>      }
> @@ -135,15 +219,19 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>  }
>  
>  /* test if a constant matches the constraint */
> -static inline int tcg_target_const_match(tcg_target_long val,
> -                                         const TCGArgConstraint *arg_ct)
> +static int tcg_target_const_match(tcg_target_long val,
> +                                  const TCGArgConstraint *arg_ct)
>  {
> -    int ct;
> -
> -    ct = arg_ct->ct;
> -
> -    /* TODO */
> -
> +    int ct = arg_ct->ct;
> +    if (ct & TCG_CT_CONST) {
> +        return 1;
> +    } else if (ct & TCG_CT_CONST_0) {
> +        return val == 0;
> +    } else if (ct & TCG_CT_CONST_S5) {
> +        return check_fit_tl(val, 5);
> +    } else if (ct & TCG_CT_CONST_S11) {
> +        return check_fit_tl(val, 11);
> +    }
>      return 0;
>  }
>  
> @@ -163,191 +251,588 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define INSN_SHDEP_CP(x) ((31 - (x)) << 5)
>  #define INSN_SHDEP_P(x)  ((x) << 5)
>  #define INSN_COND(x)     ((x) << 13)
> +#define INSN_IM11(x)     low_sign_ext(x, 11)
> +#define INSN_IM14(x)     low_sign_ext(x, 14)
> +#define INSN_IM5(x)      (low_sign_ext(x, 5) << 16)
> +
> +#define COND_NEVER   0
> +#define COND_EQ      1
> +#define COND_LT      2
> +#define COND_LE      3
> +#define COND_LTU     4
> +#define COND_LEU     5
> +#define COND_SV      6
> +#define COND_OD      7
> +#define COND_FALSE   8
> +
> +#define INSN_ADD     (INSN_OP(0x02) | INSN_EXT6(0x18))
> +#define INSN_ADDC    (INSN_OP(0x02) | INSN_EXT6(0x1c))
> +#define INSN_ADDI    (INSN_OP(0x2d))
> +#define INSN_ADDIL   (INSN_OP(0x0a))
> +#define INSN_ADDL    (INSN_OP(0x02) | INSN_EXT6(0x28))
> +#define INSN_AND     (INSN_OP(0x02) | INSN_EXT6(0x08))
> +#define INSN_ANDCM   (INSN_OP(0x02) | INSN_EXT6(0x00))
> +#define INSN_COMCLR  (INSN_OP(0x02) | INSN_EXT6(0x22))
> +#define INSN_COMICLR (INSN_OP(0x24))
> +#define INSN_DEP     (INSN_OP(0x35) | INSN_EXT3SH(3))
> +#define INSN_DEPI    (INSN_OP(0x35) | INSN_EXT3SH(7))
> +#define INSN_EXTRS   (INSN_OP(0x34) | INSN_EXT3SH(7))
> +#define INSN_EXTRU   (INSN_OP(0x34) | INSN_EXT3SH(6))
> +#define INSN_LDIL    (INSN_OP(0x08))
> +#define INSN_LDO     (INSN_OP(0x0d))
> +#define INSN_MTCTL   (INSN_OP(0x00) | INSN_EXT8B(0xc2))
> +#define INSN_OR      (INSN_OP(0x02) | INSN_EXT6(0x09))
> +#define INSN_SHD     (INSN_OP(0x34) | INSN_EXT3SH(2))
> +#define INSN_SUB     (INSN_OP(0x02) | INSN_EXT6(0x10))
> +#define INSN_SUBB    (INSN_OP(0x02) | INSN_EXT6(0x14))
> +#define INSN_SUBI    (INSN_OP(0x25))
> +#define INSN_VEXTRS  (INSN_OP(0x34) | INSN_EXT3SH(5))
> +#define INSN_VEXTRU  (INSN_OP(0x34) | INSN_EXT3SH(4))
> +#define INSN_VSHD    (INSN_OP(0x34) | INSN_EXT3SH(0))
> +#define INSN_XOR     (INSN_OP(0x02) | INSN_EXT6(0x0a))
> +#define INSN_ZDEP    (INSN_OP(0x35) | INSN_EXT3SH(2))
> +#define INSN_ZVDEP   (INSN_OP(0x35) | INSN_EXT3SH(0))
> +
> +#define INSN_BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
> +#define INSN_BL_N       (INSN_OP(0x3a) | INSN_EXT3BR(0) | 2)
> +#define INSN_BLR        (INSN_OP(0x3a) | INSN_EXT3BR(2))
> +#define INSN_BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
> +#define INSN_BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
> +#define INSN_BLE_SR4    (INSN_OP(0x39) | (1 << 13))
> +
> +#define INSN_LDB        (INSN_OP(0x10))
> +#define INSN_LDH        (INSN_OP(0x11))
> +#define INSN_LDW        (INSN_OP(0x12))
> +#define INSN_LDWM       (INSN_OP(0x13))
> +#define INSN_FLDDS      (INSN_OP(0x0b) | INSN_EXT4(0) | (1 << 12))
> +
> +#define INSN_LDBX    (INSN_OP(0x03) | INSN_EXT4(0))
> +#define INSN_LDHX    (INSN_OP(0x03) | INSN_EXT4(1))
> +#define INSN_LDWX       (INSN_OP(0x03) | INSN_EXT4(2))
> +
> +#define INSN_STB        (INSN_OP(0x18))
> +#define INSN_STH        (INSN_OP(0x19))
> +#define INSN_STW        (INSN_OP(0x1a))
> +#define INSN_STWM       (INSN_OP(0x1b))
> +#define INSN_FSTDS      (INSN_OP(0x0b) | INSN_EXT4(8) | (1 << 12))
> +
> +#define INSN_COMBT      (INSN_OP(0x20))
> +#define INSN_COMBF      (INSN_OP(0x22))
> +#define INSN_COMIBT     (INSN_OP(0x21))
> +#define INSN_COMIBF     (INSN_OP(0x23))
> +
> +/* supplied by libgcc */
> +extern void *__canonicalize_funcptr_for_compare(void *);
> +
> +static void tcg_out_mov(TCGContext *s, int ret, int arg)
> +{
> +    /* PA1.1 defines COPY as OR r,0,t; PA2.0 defines COPY as LDO 0(r),t
> +       but hppa-dis.c is unaware of this definition */
> +    if (ret != arg) {
> +        tcg_out32(s, INSN_OR | INSN_T(ret) | INSN_R1(arg)
> +                  | INSN_R2(TCG_REG_R0));
> +    }
> +}
>  
> -#define COND_NEVER 0
> -#define COND_EQUAL 1
> -#define COND_LT    2
> -#define COND_LTEQ  3
> -#define COND_LTU   4
> -#define COND_LTUEQ 5
> -#define COND_SV    6
> -#define COND_OD    7
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> +                         int ret, tcg_target_long arg)
> +{
> +    if (check_fit_tl(arg, 14)) {
> +        tcg_out32(s, INSN_LDO | INSN_R1(ret)
> +                  | INSN_R2(TCG_REG_R0) | INSN_IM14(arg));
> +    } else {
> +        uint32_t hi, lo;
> +        hi = arg >> 11;
> +        lo = arg & 0x7ff;
> +
> +        tcg_out32(s, INSN_LDIL | INSN_R2(ret) | reassemble_21(hi));
> +        if (lo) {
> +            tcg_out32(s, INSN_LDO | INSN_R1(ret)
> +                      | INSN_R2(ret) | INSN_IM14(lo));
> +        }
> +    }
> +}
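
The LDIL/LDO split above is the usual %hi/%lo decomposition with an
11-bit low part; since LO is always in [0, 0x7ff], the sign-extending
14-bit LDO immediate cannot corrupt the high part. A quick standalone
check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t arg = 0x12345678;
        uint32_t hi = arg >> 11;     /* 21 bits, goes into LDIL */
        uint32_t lo = arg & 0x7ff;   /* 11 bits, goes into LDO */
        assert(((hi << 11) | lo) == arg);
        return 0;
    }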
>  
> +static void tcg_out_ldst(TCGContext *s, int ret, int addr,
> +                         tcg_target_long offset, int op)
> +{
> +    if (!check_fit_tl(offset, 14)) {
> +        uint32_t hi, lo, op;
>  
> -/* Logical ADD */
> -#define ARITH_ADD  (INSN_OP(0x02) | INSN_EXT6(0x28))
> -#define ARITH_AND  (INSN_OP(0x02) | INSN_EXT6(0x08))
> -#define ARITH_OR   (INSN_OP(0x02) | INSN_EXT6(0x09))
> -#define ARITH_XOR  (INSN_OP(0x02) | INSN_EXT6(0x0a))
> -#define ARITH_SUB  (INSN_OP(0x02) | INSN_EXT6(0x10))
> +        hi = offset >> 11;
> +        lo = offset & 0x7ff;
>  
> -#define SHD        (INSN_OP(0x34) | INSN_EXT3SH(2))
> -#define VSHD       (INSN_OP(0x34) | INSN_EXT3SH(0))
> -#define DEP        (INSN_OP(0x35) | INSN_EXT3SH(3))
> -#define ZDEP       (INSN_OP(0x35) | INSN_EXT3SH(2))
> -#define ZVDEP      (INSN_OP(0x35) | INSN_EXT3SH(0))
> -#define EXTRU      (INSN_OP(0x34) | INSN_EXT3SH(6))
> -#define EXTRS      (INSN_OP(0x34) | INSN_EXT3SH(7))
> -#define VEXTRS     (INSN_OP(0x34) | INSN_EXT3SH(5))
> +        if (addr == TCG_REG_R0) {
> +            op = INSN_LDIL | INSN_R2(TCG_REG_R1);
> +        } else {
> +            op = INSN_ADDIL | INSN_R2(addr);
> +        }
> +        tcg_out32(s, op | reassemble_21(hi));
>  
> -#define SUBI       (INSN_OP(0x25))
> -#define MTCTL      (INSN_OP(0x00) | INSN_EXT8B(0xc2))
> +        addr = TCG_REG_R1;
> +        offset = lo;
> +    }
>  
> -#define BL         (INSN_OP(0x3a) | INSN_EXT3BR(0))
> -#define BLE_SR4    (INSN_OP(0x39) | (1 << 13))
> -#define BV         (INSN_OP(0x3a) | INSN_EXT3BR(6))
> -#define BV_N       (INSN_OP(0x3a) | INSN_EXT3BR(6) | 2)
> -#define LDIL       (INSN_OP(0x08))
> -#define LDO        (INSN_OP(0x0d))
> +    if (ret != addr || offset != 0 || op != INSN_LDO) {
> +        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) | INSN_IM14(offset));
> +    }
> +}
>  
> -#define LDB        (INSN_OP(0x10))
> -#define LDH        (INSN_OP(0x11))
> -#define LDW        (INSN_OP(0x12))
> -#define LDWM       (INSN_OP(0x13))
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
> +                              int arg1, tcg_target_long arg2)
> +{
> +    tcg_out_ldst(s, ret, arg1, arg2, INSN_LDW);
> +}
> +
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
> +                              int arg1, tcg_target_long arg2)
> +{
> +    tcg_out_ldst(s, ret, arg1, arg2, INSN_STW);
> +}
> +
> +static void tcg_out_ldst_index(TCGContext *s, int data,
> +                               int base, int index, int op)
> +{
> +    tcg_out32(s, op | INSN_T(data) | INSN_R1(index) | INSN_R2(base));
> +}
> +
> +static inline void tcg_out_addi2(TCGContext *s, int ret, int arg1,
> +                                 tcg_target_long val)
> +{
> +    tcg_out_ldst(s, ret, arg1, val, INSN_LDO);
> +}
>  
> -#define STB        (INSN_OP(0x18))
> -#define STH        (INSN_OP(0x19))
> -#define STW        (INSN_OP(0x1a))
> -#define STWM       (INSN_OP(0x1b))
> +/* This function is required by tcg.c.  */
> +static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
> +{
> +    tcg_out_addi2(s, reg, reg, val);
> +}
>  
> -#define COMBT      (INSN_OP(0x20))
> -#define COMBF      (INSN_OP(0x22))
> -static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
> +{
> +    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
> +}
>  
> -static int lowsignext(uint32_t val, int start, int length)
> +static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
> +                                  tcg_target_long val, int op)
>  {
> -    return (((val << 1) & ~(~0 << length)) |
> -            ((val >> (length - 1)) & 1)) << start;
> +    assert(check_fit_tl(val, 11));
> +    tcg_out32(s, op | INSN_R1(t) | INSN_R2(r1) | INSN_IM11(val));
>  }
>  
> -static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
> +static inline void tcg_out_nop(TCGContext *s)
>  {
> -    /* PA1.1 defines COPY as OR r,0,t */
> -    tcg_out32(s, ARITH_OR | INSN_T(ret) | INSN_R1(arg) | INSN_R2(TCG_REG_R0));
> +    tcg_out_arith(s, TCG_REG_R0, TCG_REG_R0, TCG_REG_R0, INSN_OR);
> +}
>  
> -    /* PA2.0 defines COPY as LDO 0(r),t
> -     * but hppa-dis.c is unaware of this definition */
> -    /* tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(arg) | reassemble_14(0)); */
> +static inline void tcg_out_mtctl_sar(TCGContext *s, int arg)
> +{
> +    tcg_out32(s, INSN_MTCTL | INSN_R2(11) | INSN_R1(arg));
> +}
> +
> +/* Extract LEN bits at position OFS from ARG and place in RET.
> +   Note that here the bit ordering is reversed from the PA-RISC
> +   standard, such that the right-most bit is 0.  */
> +static inline void tcg_out_extr(TCGContext *s, int ret, int arg,
> +                                unsigned ofs, unsigned len, int sign)
> +{
> +    assert(ofs < 32 && len <= 32 - ofs);
> +    tcg_out32(s, (sign ? INSN_EXTRS : INSN_EXTRU)
> +              | INSN_R1(ret) | INSN_R2(arg)
> +              | INSN_SHDEP_P(31 - ofs) | INSN_DEP_LEN(len));
>  }
>  
> -static inline void tcg_out_movi(TCGContext *s, TCGType type,
> -                                int ret, tcg_target_long arg)
> +/* Likewise with OFS interpreted little-endian.  */
> +static inline void tcg_out_dep(TCGContext *s, int ret, int arg,
> +                               unsigned ofs, unsigned len)
>  {
> -    if (arg == (arg & 0x1fff)) {
> -        tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(TCG_REG_R0) |
> -                     reassemble_14(arg));
> +    assert(ofs < 32 && len <= 32 - ofs);
> +    tcg_out32(s, INSN_DEP | INSN_R2(ret) | INSN_R1(arg)
> +              | INSN_SHDEP_CP(31 - ofs) | INSN_DEP_LEN(len));
> +}
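
To make the reversed bit ordering concrete, here is a C model of what
these two emitters compute, with OFS counted from the least significant
bit as in the comments (standalone; my reading of the PA-RISC manuals):

    #include <assert.h>

    static unsigned extr_u(unsigned arg, unsigned ofs, unsigned len)
    {
        unsigned mask = (len == 32) ? ~0u : (1u << len) - 1;
        return (arg >> ofs) & mask;
    }

    static unsigned dep(unsigned ret, unsigned arg, unsigned ofs,
                        unsigned len)
    {
        unsigned mask = ((len == 32) ? ~0u : (1u << len) - 1) << ofs;
        return (ret & ~mask) | ((arg << ofs) & mask);
    }

    int main(void)
    {
        assert(extr_u(0xaabbccdd, 8, 16) == 0xbbcc);
        assert(dep(0x11223344, 0xff, 16, 8) == 0x11ff3344);
        return 0;
    }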
> +
> +static inline void tcg_out_shd(TCGContext *s, int ret, int hi, int lo,
> +                               unsigned count)
> +{
> +    assert(count < 32);
> +    tcg_out32(s, INSN_SHD | INSN_R1(hi) | INSN_R2(lo) | INSN_T(ret)
> +              | INSN_SHDEP_CP(count));
> +}
> +
> +static void tcg_out_vshd(TCGContext *s, int ret, int hi, int lo, int creg)
> +{
> +    tcg_out_mtctl_sar(s, creg);
> +    tcg_out32(s, INSN_VSHD | INSN_T(ret) | INSN_R1(hi) | INSN_R2(lo));
> +}
> +
> +static void tcg_out_ori(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> +{
> +    if (m == 0) {
> +        tcg_out_mov(s, ret, arg);
> +    } else if (m == -1) {
> +        tcg_out_movi(s, TCG_TYPE_I32, ret, -1);

Those cases are already eliminated in tcg/tcg-op.h. This code looks
redundant.

> +    } else if (or_mask_p(m)) {
> +        int bs0, bs1;
> +
> +        for (bs0 = 0; bs0 < 32; bs0++) {
> +            if ((m & (1u << bs0)) != 0) {
> +                break;
> +            }
> +        }
> +        for (bs1 = bs0; bs1 < 32; bs1++) {
> +            if ((m & (1u << bs1)) == 0) {
> +                break;
> +            }
> +        }
> +        assert(bs1 == 32 || (1ul << bs1) > m);
> +
> +        tcg_out_mov(s, ret, arg);
> +        tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(-1)
> +                  | INSN_SHDEP_CP(31 - bs0) | INSN_DEP_LEN(bs1 - bs0));
> +    } else {
> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_OR);

Do we really want a movi here? It would be better to let the tcg code
load the constant itself, so that if the same constant is used twice, it
is only loaded once.

> +    }
> +}
> +
> +static void tcg_out_andi(TCGContext *s, int ret, int arg, tcg_target_ulong m)
> +{
> +    if (m == 0) {
> +        tcg_out_mov(s, ret, TCG_REG_R0);
> +    } else if (m == -1) {
> +        tcg_out_mov(s, ret, arg);

Same.

> +    } else if (and_mask_p(m)) {
> +        int ls0, ls1, ms0;
> +
> +        for (ls0 = 0; ls0 < 32; ls0++) {
> +            if ((m & (1u << ls0)) == 0) {
> +                break;
> +            }
> +        }
> +        for (ls1 = ls0; ls1 < 32; ls1++) {
> +            if ((m & (1u << ls1)) != 0) {
> +                break;
> +            }
> +        }
> +        for (ms0 = ls1; ms0 < 32; ms0++) {
> +            if ((m & (1u << ms0)) == 0) {
> +                break;
> +            }
> +        }
> +        assert (ms0 == 32);
> +
> +        if (ls1 == 32) {
> +            tcg_out_extr(s, ret, arg, 0, ls0, 0);
> +        } else {
> +            tcg_out_mov(s, ret, arg);
> +            tcg_out32(s, INSN_DEPI | INSN_R2(ret) | INSN_IM5(0)
> +                      | INSN_SHDEP_CP(31 - ls0) | INSN_DEP_LEN(ls1 - ls0));
> +        }
>      } else {
> -        tcg_out32(s, LDIL | INSN_R2(ret) |
> -                     reassemble_21(lrsel((uint32_t)arg, 0)));
> -        if (arg & 0x7ff)
> -            tcg_out32(s, LDO | INSN_R1(ret) | INSN_R2(ret) |
> -                         reassemble_14(rrsel((uint32_t)arg, 0)));
> +        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R1, m);
> +        tcg_out_arith(s, ret, arg, TCG_REG_R1, INSN_AND);

Same.

>      }
>  }
>  
> -static inline void tcg_out_ld_raw(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
>  {
> -    tcg_out32(s, LDIL | INSN_R2(ret) |
> -                 reassemble_21(lrsel((uint32_t)arg, 0)));
> -    tcg_out32(s, LDW | INSN_R1(ret) | INSN_R2(ret) |
> -                 reassemble_14(rrsel((uint32_t)arg, 0)));
> +    tcg_out_extr(s, ret, arg, 0, 8, 1);
>  }
>  
> -static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
> -                                  tcg_target_long arg)
> +static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
>  {
> -    tcg_out_ld_raw(s, ret, arg);
> +    tcg_out_extr(s, ret, arg, 0, 16, 1);
>  }
>  
> -static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset,
> -                                int op)
> +static void tcg_out_shli(TCGContext *s, int ret, int arg, int count)
>  {
> -    if (offset == (offset & 0xfff))
> -        tcg_out32(s, op | INSN_R1(ret) | INSN_R2(addr) |
> -                 reassemble_14(offset));
> -    else {
> -        fprintf(stderr, "unimplemented %s with offset %d\n", __func__, offset);
> -        tcg_abort();
> -    }
> +    count &= 31;
> +    tcg_out32(s, INSN_ZDEP | INSN_R2(ret) | INSN_R1(arg)
> +              | INSN_SHDEP_CP(31 - count) | INSN_DEP_LEN(32 - count));
>  }
>  
> -static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
> -                              int arg1, tcg_target_long arg2)
> +static void tcg_out_shl(TCGContext *s, int ret, int arg, int creg)
>  {
> -    fprintf(stderr, "unimplemented %s\n", __func__);
> -    tcg_abort();
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
> +    tcg_out_mtctl_sar(s, TCG_REG_R20);
> +    tcg_out32(s, INSN_ZVDEP | INSN_R2(ret) | INSN_R1(arg) | INSN_DEP_LEN(32));
>  }
>  
> -static inline void tcg_out_st(TCGContext *s, TCGType type, int ret,
> -                              int arg1, tcg_target_long arg2)
> +static void tcg_out_shri(TCGContext *s, int ret, int arg, int count)
>  {
> -    fprintf(stderr, "unimplemented %s\n", __func__);
> -    tcg_abort();
> +    count &= 31;
> +    tcg_out_extr(s, ret, arg, count, 32 - count, 0);
>  }
>  
> -static inline void tcg_out_arith(TCGContext *s, int t, int r1, int r2, int op)
> +static void tcg_out_shr(TCGContext *s, int ret, int arg, int creg)
>  {
> -    tcg_out32(s, op | INSN_T(t) | INSN_R1(r1) | INSN_R2(r2));
> +    tcg_out_vshd(s, ret, TCG_REG_R0, arg, creg);
>  }
>  
> -static inline void tcg_out_arithi(TCGContext *s, int t, int r1,
> -                                  tcg_target_long val, int op)
> +static void tcg_out_sari(TCGContext *s, int ret, int arg, int count)
>  {
> -    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, val);
> -    tcg_out_arith(s, t, r1, TCG_REG_R20, op);
> +    count &= 31;
> +    tcg_out_extr(s, ret, arg, count, 32 - count, 1);
>  }
>  
> -static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
> +static void tcg_out_sar(TCGContext *s, int ret, int arg, int creg)
>  {
> -    tcg_out_arithi(s, reg, reg, val, ARITH_ADD);
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 31, INSN_SUBI);
> +    tcg_out_mtctl_sar(s, TCG_REG_R20);
> +    tcg_out32(s, INSN_VEXTRS | INSN_R1(ret) | INSN_R2(arg) | INSN_DEP_LEN(32));
>  }
>  
> -static inline void tcg_out_nop(TCGContext *s)
> +static void tcg_out_rotli(TCGContext *s, int ret, int arg, int count)
>  {
> -    tcg_out32(s, ARITH_OR | INSN_T(TCG_REG_R0) | INSN_R1(TCG_REG_R0) |
> -                 INSN_R2(TCG_REG_R0));
> +    count &= 31;
> +    tcg_out_shd(s, ret, arg, arg, 32 - count);
>  }
>  
> -static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg) {
> -    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
> -                 INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> +static void tcg_out_rotl(TCGContext *s, int ret, int arg, int creg)
> +{
> +    tcg_out_arithi(s, TCG_REG_R20, creg, 32, INSN_SUBI);
> +    tcg_out_vshd(s, ret, arg, arg, TCG_REG_R20);
>  }
>  
> -static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg) {
> -    tcg_out32(s, EXTRS | INSN_R1(ret) | INSN_R2(arg) |
> -                 INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> +static void tcg_out_rotri(TCGContext *s, int ret, int arg, int count)
> +{
> +    count &= 31;
> +    tcg_out_shd(s, ret, arg, arg, count);
>  }
>  
> -static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg) {
> -    if(ret != arg)
> -        tcg_out_mov(s, ret, arg);
> -    tcg_out32(s, DEP | INSN_R2(ret) | INSN_R1(ret) |
> -                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
> -    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(TCG_REG_R0) |
> -                 INSN_R2(ret) | INSN_SHDEP_CP(8));
> +static void tcg_out_rotr(TCGContext *s, int ret, int arg, int creg)
> +{
> +    tcg_out_vshd(s, ret, arg, arg, creg);
>  }
>  
> -static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp) {
> -    tcg_out32(s, SHD | INSN_T(temp) | INSN_R1(arg) |
> -                 INSN_R2(arg) | INSN_SHDEP_CP(16));
> -    tcg_out32(s, DEP | INSN_R2(temp) | INSN_R1(temp) |
> -                 INSN_SHDEP_CP(15) | INSN_DEP_LEN(8));
> -    tcg_out32(s, SHD | INSN_T(ret) | INSN_R1(arg) |
> -                 INSN_R2(temp) | INSN_SHDEP_CP(8));
> +static void tcg_out_bswap16(TCGContext *s, int ret, int arg, int sign)
> +{
> +    if (ret != arg) {
> +        tcg_out_mov(s, ret, arg);             /* arg =  xxAB */
> +    }
> +    tcg_out_dep(s, ret, ret, 16, 8);          /* ret =  xBAB */
> +    tcg_out_extr(s, ret, ret, 8, 16, sign);   /* ret =  ..BA */
>  }
>  
> -static inline void tcg_out_call(TCGContext *s, void *func)
> +static void tcg_out_bswap32(TCGContext *s, int ret, int arg, int temp)
>  {
> -    uint32_t val = (uint32_t)__canonicalize_funcptr_for_compare(func);
> -    tcg_out32(s, LDIL | INSN_R2(TCG_REG_R20) |
> -                 reassemble_21(lrsel(val, 0)));
> -    tcg_out32(s, BLE_SR4 | INSN_R2(TCG_REG_R20) |
> -                 reassemble_17(rrsel(val, 0) >> 2));
> -    tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +                                          /* arg =  ABCD */
> +    tcg_out_rotri(s, temp, arg, 16);      /* temp = CDAB */
> +    tcg_out_dep(s, temp, temp, 16, 8);    /* temp = CBAB */
> +    tcg_out_shd(s, ret, arg, temp, 8);    /* ret =  DCBA */
>  }
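
The byte-shuffle comments check out; for the record, a C model of the
two sequences above (standalone, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t bswap16_model(uint32_t x)            /*      xxAB */
    {
        x = (x & ~0x00ff0000u) | ((x & 0xffu) << 16);    /* dep: xBAB */
        return (x >> 8) & 0xffff;                        /* extr: .BA */
    }

    static uint32_t bswap32_model(uint32_t x)            /*      ABCD */
    {
        uint32_t t = (x >> 16) | (x << 16);              /* rotr: CDAB */
        t = (t & ~0x00ff0000u) | ((t & 0xffu) << 16);    /* dep:  CBAB */
        return (x << 24) | (t >> 8);                     /* shd:  DCBA */
    }

    int main(void)
    {
        assert(bswap16_model(0x1234) == 0x3412);
        assert(bswap32_model(0x11223344) == 0x44332211);
        return 0;
    }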
>  
> -#if defined(CONFIG_SOFTMMU)
> +static void tcg_out_call(TCGContext *s, void *func)
> +{
> +    tcg_target_long val, hi, lo, disp;
> +
> +    val = (uint32_t)__canonicalize_funcptr_for_compare(func);
> +    disp = (val - ((tcg_target_long)s->code_ptr + 8)) >> 2;
> +
> +    if (check_fit_tl(disp, 17)) {
> +        tcg_out32(s, INSN_BL_N | INSN_R2(TCG_REG_RP) | reassemble_17(disp));
> +    } else {
> +        hi = val >> 11;
> +        lo = val & 0x7ff;
> +
> +        tcg_out32(s, INSN_LDIL | INSN_R2(TCG_REG_R20) | reassemble_21(hi));
> +        tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R20)
> +                  | reassemble_17(lo >> 2));
> +        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +    }
> +}
>  
> +static void tcg_out_xmpyu(TCGContext *s, int retl, int reth,
> +                          int arg1, int arg2)
> +{
> +    /* Store both words into the stack for copy to the FPU.  */
> +    tcg_out_ldst(s, arg1, TCG_REG_SP, STACK_TEMP_OFS, INSN_STW);
> +    tcg_out_ldst(s, arg2, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_STW);
> +
> +    /* Load both words into the FPU at the same time.  We get away
> +       with this because we can address the left and right half of the
> +       FPU registers individually once loaded.  */
> +    /* fldds stack_temp(sp),fr22 */
> +    tcg_out32(s, INSN_FLDDS | INSN_R2(TCG_REG_SP)
> +              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
> +
> +    /* xmpyu fr22r,fr22,fr22 */
> +    tcg_out32(s, 0x3ad64796);
> +
> +    /* Store the 64-bit result back into the stack.  */
> +    /* fstds stack_temp(sp),fr22 */
> +    tcg_out32(s, INSN_FSTDS | INSN_R2(TCG_REG_SP)
> +              | INSN_IM5(STACK_TEMP_OFS) | INSN_T(22));
> +
> +    /* Load the pieces of the result that the caller requested.  */
> +    if (reth) {
> +        tcg_out_ldst(s, reth, TCG_REG_SP, STACK_TEMP_OFS, INSN_LDW);
> +    }
> +    if (retl) {
> +        tcg_out_ldst(s, retl, TCG_REG_SP, STACK_TEMP_OFS + 4, INSN_LDW);
> +    }
> +}
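
In other words the FPU round trip computes a 32x32 -> 64 unsigned
multiply, with the high word at the lower stack address because the
host is big-endian. The equivalent C, for reference:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t a = 0xdeadbeef, b = 0x12345678;
        uint64_t prod = (uint64_t)a * b;   /* xmpyu fr22R,fr22L,fr22 */
        uint32_t reth = prod >> 32;        /* word at STACK_TEMP_OFS */
        uint32_t retl = (uint32_t)prod;    /* word at STACK_TEMP_OFS+4 */
        assert((((uint64_t)reth << 32) | retl) == prod);
        return 0;
    }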
> +
> +static void tcg_out_branch(TCGContext *s, int label_index, int nul)
> +{
> +    TCGLabel *l = &s->labels[label_index];
> +    uint32_t op = nul ? INSN_BL_N : INSN_BL;
> +
> +    if (l->has_value) {
> +        tcg_target_long val = l->u.value;
> +
> +        val -= (tcg_target_long)s->code_ptr + 8;
> +        val >>= 2;
> +        assert(check_fit_tl(val, 17));
> +
> +        tcg_out32(s, op | reassemble_17(val));
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
> +        tcg_out32(s, op);

This breaks partial retranslation. The bits corresponding to the offset
should be preserved.
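
One possible fix (just a sketch, untested): have the emitter keep
whatever offset bits are already in the buffer instead of zeros, e.g.:

    tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL17F, label_index, 0);
    /* preserve the old offset field for partial retranslation */
    tcg_out32(s, op | (*(uint32_t *)s->code_ptr & 0x1f1ffd));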

> +    }
> +}
> +
> +static const uint8_t tcg_cond_to_cmp_cond[10] =
> +{
> +    [TCG_COND_EQ] = COND_EQ,
> +    [TCG_COND_NE] = COND_EQ | COND_FALSE,
> +    [TCG_COND_LT] = COND_LT,
> +    [TCG_COND_GE] = COND_LT | COND_FALSE,
> +    [TCG_COND_LE] = COND_LE,
> +    [TCG_COND_GT] = COND_LE | COND_FALSE,
> +    [TCG_COND_LTU] = COND_LTU,
> +    [TCG_COND_GEU] = COND_LTU | COND_FALSE,
> +    [TCG_COND_LEU] = COND_LEU,
> +    [TCG_COND_GTU] = COND_LEU | COND_FALSE,
> +};
> +
> +static void tcg_out_brcond(TCGContext *s, int cond, TCGArg c1,
> +                           TCGArg c2, int c2const, int label_index)
> +{
> +    TCGLabel *l = &s->labels[label_index];
> +    int op, pacond;
> +
> +    /* Note that COMIB operates as if the immediate is the first
> +       operand.  We model brcond with the immediate in the second
> +       to better match what targets are likely to give us.  For
> +       consistency, model COMB with reversed operands as well.  */
> +    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
> +
> +    if (c2const) {
> +        op = (pacond & COND_FALSE ? INSN_COMIBF : INSN_COMIBT);
> +        op |= INSN_IM5(c2);
> +    } else {
> +        op = (pacond & COND_FALSE ? INSN_COMBF : INSN_COMBT);
> +        op |= INSN_R1(c2);
> +    }
> +    op |= INSN_R2(c1);
> +    op |= INSN_COND(pacond & 7);
> +
> +    if (l->has_value) {
> +        tcg_target_long val = l->u.value;
> +
> +        val -= (tcg_target_long)s->code_ptr + 8;
> +        val >>= 2;
> +        assert(check_fit_tl(val, 12));
> +
> +        /* ??? Assume that all branches to defined labels are backward.
> +           Which means that if the nul bit is set, the delay slot is
> +           executed if the branch is taken, and not executed in fallthru.  */
> +        tcg_out32(s, op | reassemble_12(val));
> +        tcg_out_nop(s);
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr, R_PARISC_PCREL12F, label_index, 0);
> +        /* ??? Assume that all branches to undefined labels are forward.
> +           Which means that if the nul bit is set, the delay slot is
> +           not executed if the branch is taken, which is what we want.  */
> +        tcg_out32(s, op | 2);

Same problem about partial retranslation here.

> +    }
> +}
> +
> +static void tcg_out_comclr(TCGContext *s, int cond, TCGArg ret,
> +                           TCGArg c1, TCGArg c2, int c2const)
> +{
> +    int op, pacond;
> +
> +    /* Note that COMICLR operates as if the immediate is the first
> +       operand.  We model setcond with the immediate in the second
> +       to better match what targets are likely to give us.  For
> +       consistency, model COMCLR with reversed operands as well.  */
> +    pacond = tcg_cond_to_cmp_cond[tcg_swap_cond(cond)];
> +
> +    if (c2const) {
> +        op = INSN_COMICLR | INSN_R2(c1) | INSN_R1(ret) | INSN_IM11(c2);
> +    } else {
> +        op = INSN_COMCLR | INSN_R2(c1) | INSN_R1(c2) | INSN_T(ret);
> +    }
> +    op |= INSN_COND(pacond & 7);
> +    op |= pacond & COND_FALSE ? 1 << 12 : 0;
> +
> +    tcg_out32(s, op);
> +}
> +
> +static void tcg_out_brcond2(TCGContext *s, int cond, TCGArg al, TCGArg ah,
> +                            TCGArg bl, int blconst, TCGArg bh, int bhconst,
> +                            int label_index)
> +{
> +    switch (cond) {
> +    case TCG_COND_EQ:
> +    case TCG_COND_NE:
> +        tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, al, bl, blconst);
> +        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
> +        break;
> +
> +    default:
> +        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
> +        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_brcond(s, tcg_unsigned_cond(cond),
> +                       al, bl, blconst, label_index);
> +        break;
> +    }
> +}
> +
> +static void tcg_out_setcond(TCGContext *s, int cond, TCGArg ret,
> +                            TCGArg c1, TCGArg c2, int c2const)
> +{
> +    tcg_out_comclr(s, tcg_invert_cond(cond), ret, c1, c2, c2const);
> +    tcg_out_movi(s, TCG_TYPE_I32, ret, 1);
> +}
> +
> +static void tcg_out_setcond2(TCGContext *s, int cond, TCGArg ret,
> +                             TCGArg al, TCGArg ah, TCGArg bl, int blconst,
> +                             TCGArg bh, int bhconst)
> +{
> +    int scratch = TCG_REG_R20;
> +
> +    if (ret != al && ret != ah
> +        && (blconst || ret != bl)
> +        && (bhconst || ret != bh)) {
> +        scratch = ret;
> +    }
> +
> +    switch (cond) {
> +    case TCG_COND_EQ:
> +    case TCG_COND_NE:
> +        tcg_out_setcond(s, cond, scratch, al, bl, blconst);
> +        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, cond == TCG_COND_NE);
> +        break;
> +
> +    default:
> +        tcg_out_setcond(s, tcg_unsigned_cond(cond), scratch, al, bl, blconst);
> +        tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, 0);
> +        tcg_out_comclr(s, cond, TCG_REG_R0, ah, bh, bhconst);
> +        tcg_out_movi(s, TCG_TYPE_I32, scratch, 1);
> +        break;
> +    }
> +
> +    tcg_out_mov(s, ret, scratch);
> +}
> +
> +#if defined(CONFIG_SOFTMMU)
>  #include "../../softmmu_defs.h"
>  
>  static void *qemu_ld_helpers[4] = {
> @@ -363,30 +848,77 @@ static void *qemu_st_helpers[4] = {
>      __stl_mmu,
>      __stq_mmu,
>  };
> +
> +/* Load and compare a TLB entry, and branch if TLB miss.  OFFSET is set to
> +   the offset of the first ADDR_READ or ADDR_WRITE member of the appropriate
> +   TLB for the memory index.  The return value is the offset from ENV 
> +   contained in R1 afterward (to be used when loading ADDEND); if the
> +   return value is 0, R1 is not used.  */
> +
> +static int tcg_out_tlb_read(TCGContext *s, int r0, int r1, int addrlo,
> +                            int addrhi, int s_bits, int lab_miss, int offset)
> +{
> +    int ret;
> +
> +    /* Extracting the index into the TLB.  The "normal C operation" is
> +          r1 = addr_reg >> TARGET_PAGE_BITS;
> +          r1 &= CPU_TLB_SIZE - 1;
> +          r1 <<= CPU_TLB_ENTRY_BITS;
> +       What this does is extract CPU_TLB_BITS beginning at TARGET_PAGE_BITS
> +       and place them at CPU_TLB_ENTRY_BITS.  We can combine the first two
> +       operations with an EXTRU.  Unfortunately, the current value of
> +       CPU_TLB_ENTRY_BITS is > 3, so we can't merge that shift with the
> +       add that follows.  */
> +    tcg_out_extr(s, r1, addrlo, TARGET_PAGE_BITS, CPU_TLB_BITS, 0);
> +    tcg_out_andi(s, r0, addrlo, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
> +    tcg_out_shli(s, r1, r1, CPU_TLB_ENTRY_BITS);
> +    tcg_out_arith(s, r1, r1, TCG_AREG0, INSN_ADDL);
> +
> +    /* Make sure that both the addr_{read,write} and addend can be
> +       read with a 14-bit offset from the same base register.  */
> +    if (check_fit_tl(offset + CPU_TLB_SIZE, 14)) {
> +        ret = 0;
> +    } else {
> +        ret = (offset + 0x400) & ~0x7ff;
> +        offset = ret - offset;
> +        tcg_out_addi2(s, TCG_REG_R1, r1, ret);
> +        r1 = TCG_REG_R1;
> +    }
> +
> +    /* Load the entry from the computed slot.  */
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R23, r1, offset);
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset + 4);
> +    } else {
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset);
> +    }
> +
> +    /* If not equal, jump to lab_miss. */
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_brcond2(s, TCG_COND_NE, TCG_REG_R20, TCG_REG_R23,
> +                        r0, 0, addrhi, 0, lab_miss);
> +    } else {
> +        tcg_out_brcond(s, TCG_COND_NE, TCG_REG_R20, r0, 0, lab_miss);
> +    }
> +
> +    return ret;
> +}
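
For readers: the EXTRU+shift pair computes exactly the "normal C
operation" from the comment. A standalone model, with illustrative
values (TARGET_PAGE_BITS, CPU_TLB_BITS and CPU_TLB_ENTRY_BITS are
build-time constants; the numbers below are just an example):

    #include <assert.h>
    #include <stdint.h>

    enum { TARGET_PAGE_BITS = 12, CPU_TLB_BITS = 8,
           CPU_TLB_SIZE = 1 << CPU_TLB_BITS, CPU_TLB_ENTRY_BITS = 4 };

    static uint32_t tlb_entry_offset(uint32_t addr)
    {
        uint32_t idx = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
        return idx << CPU_TLB_ENTRY_BITS;
    }

    int main(void)
    {
        assert(tlb_entry_offset(0x12345678) == 0x450);  /* 0x45 << 4 */
        return 0;
    }
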
>  #endif
>  
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>  {
> -    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
> +    int addr_reg, addr_reg2;
> +    int data_reg, data_reg2;
> +    int r0, r1, mem_index, s_bits, bswap;
> +    tcg_target_long offset;
>  #if defined(CONFIG_SOFTMMU)
> -    uint32_t *label1_ptr, *label2_ptr;
> -#endif
> -#if TARGET_LONG_BITS == 64
> -#if defined(CONFIG_SOFTMMU)
> -    uint32_t *label3_ptr;
> -#endif
> -    int addr_reg2;
> +    int lab1, lab2, argreg;
>  #endif
>  
>      data_reg = *args++;
> -    if (opc == 3)
> -        data_reg2 = *args++;
> -    else
> -        data_reg2 = 0; /* suppress warning */
> +    data_reg2 = (opc == 3 ? *args++ : TCG_REG_R0);

I am not sure TCG_REG_R0 is really correct here, and I find it confusing.
While its value is zero, the assignment is only there to make GCC happy;
the value won't be used afterwards.

>      addr_reg = *args++;
> -#if TARGET_LONG_BITS == 64
> -    addr_reg2 = *args++;
> -#endif
> +    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);

Same here.

>      mem_index = *args;
>      s_bits = opc & 3;
>  
> @@ -394,96 +926,22 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      r1 = TCG_REG_R25;
>  
>  #if defined(CONFIG_SOFTMMU)
> -    tcg_out_mov(s, r1, addr_reg);
> +    lab1 = gen_new_label();
> +    lab2 = gen_new_label();

Do you really want to use labels here? load/store are the most common
instructions, and I am not really sure about the resulting performance.

> -    tcg_out_mov(s, r0, addr_reg);
> +    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
> +                              offsetof(CPUState,
> +                                       tlb_table[mem_index][0].addr_read));
>  
> -    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
> -                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> +    /* TLB Hit.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
> +               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
>  
> -    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
> -                   ARITH_AND);
> -
> -    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
> -                   ARITH_AND);
> -
> -    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
> -    tcg_out_arithi(s, r1, r1,
> -                   offsetof(CPUState, tlb_table[mem_index][0].addr_read),
> -                   ARITH_ADD);
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
> -
> -#if TARGET_LONG_BITS == 32
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -#else
> -    /* if not equal, jump to label3 */
> -    label3_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
> -
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_nop(s); /* delay slot */
> -
> -    /* label3: */
> -    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
> -#endif
> -
> -#if TARGET_LONG_BITS == 32
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R25, mem_index);
> -#else
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
> -#endif
> -
> -    tcg_out_call(s, qemu_ld_helpers[s_bits]);
> -
> -    switch(opc) {
> -        case 0 | 4:
> -            tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 1 | 4:
> -            tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 0:
> -        case 1:
> -        case 2:
> -        default:
> -            tcg_out_mov(s, data_reg, TCG_REG_RET0);
> -            break;
> -        case 3:
> -            tcg_abort();
> -            tcg_out_mov(s, data_reg, TCG_REG_RET0);
> -            tcg_out_mov(s, data_reg2, TCG_REG_RET1);
> -            break;
> -    }
> -
> -    /* jump to label2 */
> -    label2_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
> -
> -    /* label1: */
> -    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
> -
> -    tcg_out_arithi(s, TCG_REG_R20, r1,
> -                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, 
> addr_read),
> -                   ARITH_ADD);
> -    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
> -    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
> +    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
> +    offset = TCG_REG_R0;
>  #else
>      r0 = addr_reg;
> +    offset = GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_R0;
>  #endif
>  
>  #ifdef TARGET_WORDS_BIGENDIAN
> @@ -492,190 +950,151 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      bswap = 1;
>  #endif
>      switch (opc) {
> -        case 0:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDB);
> -            break;
> -        case 0 | 4:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDB);
> -            tcg_out_ext8s(s, data_reg, data_reg);
> -            break;
> -        case 1:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDH);
> -            if (bswap)
> -                tcg_out_bswap16(s, data_reg, data_reg);
> -            break;
> -        case 1 | 4:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDH);
> -            if (bswap)
> -                tcg_out_bswap16(s, data_reg, data_reg);
> +    case 0:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
> +        break;
> +    case 0 | 4:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDBX);
> +        tcg_out_ext8s(s, data_reg, data_reg);
> +        break;
> +    case 1:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
> +        if (bswap) {
> +            tcg_out_bswap16(s, data_reg, data_reg, 0);
> +        }
> +        break;
> +    case 1 | 4:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDHX);
> +        if (bswap) {
> +            tcg_out_bswap16(s, data_reg, data_reg, 1);
> +        } else {
>              tcg_out_ext16s(s, data_reg, data_reg);
> -            break;
> -        case 2:
> -            tcg_out_ldst(s, data_reg, r0, 0, LDW);
> -            if (bswap)
> -                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> -            break;
> -        case 3:
> -            tcg_abort();
> -            if (!bswap) {
> -                tcg_out_ldst(s, data_reg, r0, 0, LDW);
> -                tcg_out_ldst(s, data_reg2, r0, 4, LDW);
> +        }
> +        break;
> +    case 2:
> +        tcg_out_ldst_index(s, data_reg, r0, offset, INSN_LDWX);
> +        if (bswap) {
> +            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> +        }
> +        break;
> +    case 3:
> +        if (bswap) {
> +            int t = data_reg2;
> +            data_reg2 = data_reg;
> +            data_reg = t;
> +        }
> +        if (offset == TCG_REG_R0) {
> +            /* Make sure not to clobber the base register.  */
> +            if (data_reg2 == r0) {
> +                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
> +                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
>              } else {
> -                tcg_out_ldst(s, data_reg, r0, 4, LDW);
> -                tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> -                tcg_out_ldst(s, data_reg2, r0, 0, LDW);
> -                tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
> +                tcg_out_ldst(s, data_reg2, r0, 0, INSN_LDW);
> +                tcg_out_ldst(s, data_reg, r0, 4, INSN_LDW);
>              }
> -            break;
> -        default:
> -            tcg_abort();
> +        } else {
> +            tcg_out_addi2(s, TCG_REG_R20, r0, 4);
> +            tcg_out_ldst_index(s, data_reg2, r0, offset, INSN_LDWX);
> +            tcg_out_ldst_index(s, data_reg, TCG_REG_R20, offset, INSN_LDWX);
> +        }
> +        if (bswap) {
> +            tcg_out_bswap32(s, data_reg, data_reg, TCG_REG_R20);
> +            tcg_out_bswap32(s, data_reg2, data_reg2, TCG_REG_R20);
> +        }
> +        break;
> +    default:
> +        tcg_abort();
>      }
>  
>  #if defined(CONFIG_SOFTMMU)
> +    tcg_out_branch(s, lab2, 1);
> +
> +    /* TLB Miss.  */
> +    /* label1: */
> +    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
> +
> +    argreg = TCG_REG_R26;
> +    tcg_out_mov(s, argreg--, addr_reg);
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_mov(s, argreg--, addr_reg2);
> +    }
> +    tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +
> +    tcg_out_call(s, qemu_ld_helpers[s_bits]);
> +
> +    switch (opc) {
> +    case 0:
> +        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xff);
> +        break;
> +    case 0 | 4:
> +        tcg_out_ext8s(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 1:
> +        tcg_out_andi(s, data_reg, TCG_REG_RET0, 0xffff);
> +        break;
> +    case 1 | 4:
> +        tcg_out_ext16s(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 2:
> +    case 2 | 4:
> +        tcg_out_mov(s, data_reg, TCG_REG_RET0);
> +        break;
> +    case 3:
> +        tcg_out_mov(s, data_reg, TCG_REG_RET0);
> +        tcg_out_mov(s, data_reg2, TCG_REG_RET1);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +
>      /* label2: */
> -    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
> +    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
>  #endif
>  }
>  
>  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>  {
> -    int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
> -#if defined(CONFIG_SOFTMMU)
> -    uint32_t *label1_ptr, *label2_ptr;
> -#endif
> -#if TARGET_LONG_BITS == 64
> +    int addr_reg, addr_reg2;
> +    int data_reg, data_reg2;
> +    int r0, r1, mem_index, s_bits, bswap;
>  #if defined(CONFIG_SOFTMMU)
> -    uint32_t *label3_ptr;
> -#endif
> -    int addr_reg2;
> +    tcg_target_long offset;
> +    int lab1, lab2, argreg;
>  #endif
>  
>      data_reg = *args++;
> -    if (opc == 3)
> -        data_reg2 = *args++;
> -    else
> -        data_reg2 = 0; /* suppress warning */
> +    data_reg2 = (opc == 3 ? *args++ : 0);
>      addr_reg = *args++;
> -#if TARGET_LONG_BITS == 64
> -    addr_reg2 = *args++;
> -#endif
> +    addr_reg2 = (TARGET_LONG_BITS == 64 ? *args++ : 0);

Here it makes more sense ;-)

>      mem_index = *args;
> -
>      s_bits = opc;
>  
>      r0 = TCG_REG_R26;
>      r1 = TCG_REG_R25;
>  
>  #if defined(CONFIG_SOFTMMU)
> -    tcg_out_mov(s, r1, addr_reg);
> -
> -    tcg_out_mov(s, r0, addr_reg);
> -
> -    tcg_out32(s, SHD | INSN_T(r1) | INSN_R1(TCG_REG_R0) | INSN_R2(r1) |
> -                 INSN_SHDEP_CP(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> -
> -    tcg_out_arithi(s, r0, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
> -                   ARITH_AND);
> -
> -    tcg_out_arithi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS,
> -                   ARITH_AND);
> +    lab1 = gen_new_label();
> +    lab2 = gen_new_label();

Same here.

> -    tcg_out_arith(s, r1, r1, TCG_AREG0, ARITH_ADD);
> -    tcg_out_arithi(s, r1, r1,
> -                   offsetof(CPUState, tlb_table[mem_index][0].addr_write),
> -                   ARITH_ADD);
> +    offset = tcg_out_tlb_read(s, r0, r1, addr_reg, addr_reg2, s_bits, lab1,
> +                              offsetof(CPUState,
> +                                       tlb_table[mem_index][0].addr_write));
>  
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 0, LDW);
> +    /* TLB Hit.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : r1),
> +               offsetof(CPUState, tlb_table[mem_index][0].addend) - offset);
>  
> -#if TARGET_LONG_BITS == 32
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> +    tcg_out_arith(s, r0, addr_reg, TCG_REG_R20, INSN_ADDL);
>  #else
> -    /* if not equal, jump to label3 */
> -    label3_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBF | INSN_R1(TCG_REG_R20) | INSN_R2(r0) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_mov(s, r0, addr_reg); /* delay slot */
> -
> -    tcg_out_ldst(s, TCG_REG_R20, r1, 4, LDW);
> -
> -    /* if equal, jump to label1 */
> -    label1_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, COMBT | INSN_R1(TCG_REG_R20) | INSN_R2(addr_reg2) |
> -                 INSN_COND(COND_EQUAL));
> -    tcg_out_nop(s); /* delay slot */
> -
> -    /* label3: */
> -    *label3_ptr |= reassemble_12((uint32_t *)s->code_ptr - label3_ptr - 2);
> -#endif
> -
> -    tcg_out_mov(s, TCG_REG_R26, addr_reg);
> -#if TARGET_LONG_BITS == 64
> -    tcg_out_mov(s, TCG_REG_R25, addr_reg2);
> -    if (opc == 3) {
> -        tcg_abort();
> -        tcg_out_mov(s, TCG_REG_R24, data_reg);
> -        tcg_out_mov(s, TCG_REG_R23, data_reg2);
> -        /* TODO: push mem_index */
> -        tcg_abort();
> +    /* There are no indexed stores, so if GUEST_BASE is set
> +       we must do the add explicitly.  Careful to avoid R20,
> +       which is used for the bswaps to follow.  */
> +    if (GUEST_BASE == 0) {
> +        r0 = addr_reg;
>      } else {
> -        switch(opc) {
> -        case 0:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> -            break;
> -        case 1:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R24) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> -            break;
> -        case 2:
> -            tcg_out_mov(s, TCG_REG_R24, data_reg);
> -            break;
> -        }
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
> +        tcg_out_arith(s, TCG_REG_R31, addr_reg, TCG_GUEST_BASE_REG, INSN_ADDL);
> +        r0 = TCG_REG_R31;
>      }
> -#else
> -    if (opc == 3) {
> -        tcg_abort();
> -        tcg_out_mov(s, TCG_REG_R25, data_reg);
> -        tcg_out_mov(s, TCG_REG_R24, data_reg2);
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R23, mem_index);
> -    } else {
> -        switch(opc) {
> -        case 0:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(8));
> -            break;
> -        case 1:
> -            tcg_out32(s, EXTRU | INSN_R1(TCG_REG_R25) | INSN_R2(data_reg) |
> -                         INSN_SHDEP_P(31) | INSN_DEP_LEN(16));
> -            break;
> -        case 2:
> -            tcg_out_mov(s, TCG_REG_R25, data_reg);
> -            break;
> -        }
> -        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R24, mem_index);
> -    }
> -#endif
> -    tcg_out_call(s, qemu_st_helpers[s_bits]);
> -
> -    /* jump to label2 */
> -    label2_ptr = (uint32_t *)s->code_ptr;
> -    tcg_out32(s, BL | INSN_R2(TCG_REG_R0) | 2);
> -
> -    /* label1: */
> -    *label1_ptr |= reassemble_12((uint32_t *)s->code_ptr - label1_ptr - 2);
> -
> -    tcg_out_arithi(s, TCG_REG_R20, r1,
> -                   offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, 
> addr_write),
> -                   ARITH_ADD);
> -    tcg_out_ldst(s, TCG_REG_R20, TCG_REG_R20, 0, LDW);
> -    tcg_out_arith(s, r0, r0, TCG_REG_R20, ARITH_ADD);
> -#else
> -    r0 = addr_reg;
>  #endif
>  
>  #ifdef TARGET_WORDS_BIGENDIAN
> @@ -685,170 +1104,345 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>  #endif
>      switch (opc) {
>      case 0:
> -        tcg_out_ldst(s, data_reg, r0, 0, STB);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STB);
>          break;
>      case 1:
>          if (bswap) {
> -            tcg_out_bswap16(s, TCG_REG_R20, data_reg);
> +            tcg_out_bswap16(s, TCG_REG_R20, data_reg, 0);
>              data_reg = TCG_REG_R20;
>          }
> -        tcg_out_ldst(s, data_reg, r0, 0, STH);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STH);
>          break;
>      case 2:
>          if (bswap) {
>              tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
>              data_reg = TCG_REG_R20;
>          }
> -        tcg_out_ldst(s, data_reg, r0, 0, STW);
> +        tcg_out_ldst(s, data_reg, r0, 0, INSN_STW);
>          break;
>      case 3:
> -        tcg_abort();
> -        if (!bswap) {
> -            tcg_out_ldst(s, data_reg, r0, 0, STW);
> -            tcg_out_ldst(s, data_reg2, r0, 4, STW);
> -        } else {
> +        if (bswap) {
>              tcg_out_bswap32(s, TCG_REG_R20, data_reg, TCG_REG_R20);
> -            tcg_out_ldst(s, TCG_REG_R20, r0, 4, STW);
> -            tcg_out_bswap32(s, TCG_REG_R20, data_reg2, TCG_REG_R20);
> -            tcg_out_ldst(s, TCG_REG_R20, r0, 0, STW);
> +            tcg_out_bswap32(s, TCG_REG_R23, data_reg2, TCG_REG_R23);
> +            data_reg2 = TCG_REG_R20;
> +            data_reg = TCG_REG_R23;
>          }
> +        tcg_out_ldst(s, data_reg2, r0, 0, INSN_STW);
> +        tcg_out_ldst(s, data_reg, r0, 4, INSN_STW);
>          break;
>      default:
>          tcg_abort();
>      }
>  
>  #if defined(CONFIG_SOFTMMU)
> +    tcg_out_branch(s, lab2, 1);
> +
> +    /* TLB Miss.  */
> +    /* label1: */
> +    tcg_out_label(s, lab1, (tcg_target_long)s->code_ptr);
> +
> +    argreg = TCG_REG_R26;
> +    tcg_out_mov(s, argreg--, addr_reg);
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_mov(s, argreg--, addr_reg2);
> +    }
> +
> +    switch(opc) {
> +    case 0:
> +        tcg_out_andi(s, argreg--, data_reg, 0xff);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 1:
> +        tcg_out_andi(s, argreg--, data_reg, 0xffff);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 2:
> +        tcg_out_mov(s, argreg--, data_reg);
> +        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        break;
> +    case 3:
> +        /* Because of the alignment required by the 64-bit data argument,
> +           we will always use R23/R24.  Also, we will always run out of
> +           argument registers for storing mem_index, so that will have 
> +           to go on the stack.  */
> +        if (mem_index == 0) {
> +            argreg = TCG_REG_R0;
> +        } else {
> +            argreg = TCG_REG_R20;
> +            tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
> +        }
> +        tcg_out_mov(s, TCG_REG_R23, data_reg2);
> +        tcg_out_mov(s, TCG_REG_R24, data_reg);
> +        tcg_out_st(s, TCG_TYPE_I32, argreg, TCG_REG_SP,
> +                   TCG_TARGET_CALL_STACK_OFFSET - 4);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +
> +    tcg_out_call(s, qemu_st_helpers[s_bits]);
> +
>      /* label2: */
> -    *label2_ptr |= reassemble_17((uint32_t *)s->code_ptr - label2_ptr - 2);
> +    tcg_out_label(s, lab2, (tcg_target_long)s->code_ptr);
>  #endif
>  }
>  
> +static void tcg_out_exit_tb(TCGContext *s, TCGArg arg)
> +{
> +    if (!check_fit_tl(arg, 14)) {
> +        uint32_t hi, lo;
> +        hi = arg & ~0x7ff;
> +        lo = arg & 0x7ff;
> +        if (lo) {
> +            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, hi);
> +            tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
> +            tcg_out_addi(s, TCG_REG_RET0, lo);
> +            return;
> +        }
> +        arg = hi;
> +    }
> +    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_R18));
> +    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, arg);
> +}
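
As far as I can tell, the hi/lo split above exists because clearing
the low 11 bits lets the high part be loaded with a single ldil,
leaving exactly one instruction (the addi of lo) for the bv delay
slot.  A standalone sketch of the split, with the masks taken from
the hunk above (plain C to illustrate, not QEMU code):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      uint32_t arg = 0x12345678u;    /* arbitrary exit_tb value */
      uint32_t hi  = arg & ~0x7ffu;  /* materialized before the bv */
      uint32_t lo  = arg & 0x7ffu;   /* added in the delay slot */
      assert(hi + lo == arg);        /* RET0 ends up holding arg */
      return 0;
  }
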
> +
> +static void tcg_out_goto_tb(TCGContext *s, TCGArg arg)
> +{
> +    if (s->tb_jmp_offset) {
> +        /* direct jump method */
> +        fprintf(stderr, "goto_tb direct\n");
> +        tcg_abort();
> +    } else {
> +        /* indirect jump method */
> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, TCG_REG_R0,
> +                   (tcg_target_long)(s->tb_next + arg));
> +        tcg_out32(s, INSN_BV_N | INSN_R2(TCG_REG_R20));
> +    }
> +    s->tb_next_offset[arg] = s->code_ptr - s->code_buf;
> +}
> +
>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>                                const int *const_args)
>  {
> -    int c;
> -
>      switch (opc) {
>      case INDEX_op_exit_tb:
> -        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RET0, args[0]);
> -        tcg_out32(s, BV_N | INSN_R2(TCG_REG_R18));
> +        tcg_out_exit_tb(s, args[0]);
>          break;
>      case INDEX_op_goto_tb:
> -        if (s->tb_jmp_offset) {
> -            /* direct jump method */
> -            fprintf(stderr, "goto_tb direct\n");
> -            tcg_abort();
> -            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R20, args[0]);
> -            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
> -            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
> -        } else {
> -            /* indirect jump method */
> -            tcg_out_ld_ptr(s, TCG_REG_R20,
> -                           (tcg_target_long)(s->tb_next + args[0]));
> -            tcg_out32(s, BV_N | INSN_R2(TCG_REG_R20));
> -        }
> -        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
> +        tcg_out_goto_tb(s, args[0]);
>          break;
> +
>      case INDEX_op_call:
> -        tcg_out32(s, BLE_SR4 | INSN_R2(args[0]));
> -        tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +        if (const_args[0]) {
> +            tcg_out_call(s, (void *)args[0]);
> +        } else {
> +            tcg_out32(s, INSN_BLE_SR4 | INSN_R2(args[0]));
> +            tcg_out_mov(s, TCG_REG_RP, TCG_REG_R31);
> +        }
>          break;
> +
>      case INDEX_op_jmp:
>          fprintf(stderr, "unimplemented jmp\n");
>          tcg_abort();
>          break;
> +
>      case INDEX_op_br:
> -        fprintf(stderr, "unimplemented br\n");
> -        tcg_abort();
> +        tcg_out_branch(s, args[0], 1);
>          break;
> +
>      case INDEX_op_movi_i32:
>          tcg_out_movi(s, TCG_TYPE_I32, args[0], (uint32_t)args[1]);
>          break;
>  
>      case INDEX_op_ld8u_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
>          break;
>      case INDEX_op_ld8s_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDB);
>          tcg_out_ext8s(s, args[0], args[0]);
>          break;
>      case INDEX_op_ld16u_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
>          break;
>      case INDEX_op_ld16s_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDH);
>          tcg_out_ext16s(s, args[0], args[0]);
>          break;
>      case INDEX_op_ld_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], LDW);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_LDW);
>          break;
>  
>      case INDEX_op_st8_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STB);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STB);
>          break;
>      case INDEX_op_st16_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STH);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STH);
>          break;
>      case INDEX_op_st_i32:
> -        tcg_out_ldst(s, args[0], args[1], args[2], STW);
> +        tcg_out_ldst(s, args[0], args[1], args[2], INSN_STW);
> +        break;
> +
> +    case INDEX_op_add_i32:
> +        if (const_args[2]) {
> +            tcg_out_addi2(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_ADDL);
> +        }
>          break;
>  
>      case INDEX_op_sub_i32:
> -        c = ARITH_SUB;
> -        goto gen_arith;
> +        if (const_args[1]) {
> +            if (const_args[2]) {
> +                tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1] - args[2]);
> +            } else {
> +                /* Recall that SUBI is a reversed subtract.  */
> +                tcg_out_arithi(s, args[0], args[2], args[1], INSN_SUBI);
> +            }
> +        } else if (const_args[2]) {
> +            tcg_out_addi2(s, args[0], args[1], -args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_SUB);
> +        }
> +        break;
> +
>      case INDEX_op_and_i32:
> -        c = ARITH_AND;
> -        goto gen_arith;
> +        if (const_args[2]) {
> +            tcg_out_andi(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_AND);
> +        }
> +        break;
> +
>      case INDEX_op_or_i32:
> -        c = ARITH_OR;
> -        goto gen_arith;
> +        if (const_args[2]) {
> +            tcg_out_ori(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_OR);
> +        }
> +        break;
> +
>      case INDEX_op_xor_i32:
> -        c = ARITH_XOR;
> -        goto gen_arith;
> -    case INDEX_op_add_i32:
> -        c = ARITH_ADD;
> -        goto gen_arith;
> +        tcg_out_arith(s, args[0], args[1], args[2], INSN_XOR);
> +        break;
> +
> +    case INDEX_op_andc_i32:
> +        if (const_args[2]) {
> +            tcg_out_andi(s, args[0], args[1], ~args[2]);
> +        } else {
> +            tcg_out_arith(s, args[0], args[1], args[2], INSN_ANDCM);
> +        }
> +        break;
>  
>      case INDEX_op_shl_i32:
> -        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
> -                     lowsignext(0x1f, 0, 11));
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
> -        tcg_out32(s, ZVDEP | INSN_R2(args[0]) | INSN_R1(args[1]) |
> -                     INSN_DEP_LEN(32));
> +        if (const_args[2]) {
> +            tcg_out_shli(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_shl(s, args[0], args[1], args[2]);
> +        }
>          break;
> +
>      case INDEX_op_shr_i32:
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(args[2]));
> -        tcg_out32(s, VSHD | INSN_T(args[0]) | INSN_R1(TCG_REG_R0) |
> -                     INSN_R2(args[1]));
> +        if (const_args[2]) {
> +            tcg_out_shri(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_shr(s, args[0], args[1], args[2]);
> +        }
>          break;
> +
>      case INDEX_op_sar_i32:
> -        tcg_out32(s, SUBI | INSN_R1(TCG_REG_R20) | INSN_R2(args[2]) |
> -                     lowsignext(0x1f, 0, 11));
> -        tcg_out32(s, MTCTL | INSN_R2(11) | INSN_R1(TCG_REG_R20));
> -        tcg_out32(s, VEXTRS | INSN_R1(args[0]) | INSN_R2(args[1]) |
> -                     INSN_DEP_LEN(32));
> +        if (const_args[2]) {
> +            tcg_out_sari(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_sar(s, args[0], args[1], args[2]);
> +        }
> +        break;
> +
> +    case INDEX_op_rotl_i32:
> +        if (const_args[2]) {
> +            tcg_out_rotli(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_rotl(s, args[0], args[1], args[2]);
> +        }
> +        break;
> +
> +    case INDEX_op_rotr_i32:
> +        if (const_args[2]) {
> +            tcg_out_rotri(s, args[0], args[1], args[2]);
> +        } else {
> +            tcg_out_rotr(s, args[0], args[1], args[2]);
> +        }
>          break;
>  
>      case INDEX_op_mul_i32:
> -        fprintf(stderr, "unimplemented mul\n");
> -        tcg_abort();
> +        tcg_out_xmpyu(s, args[0], TCG_REG_R0, args[1], args[2]);
>          break;
>      case INDEX_op_mulu2_i32:
> -        fprintf(stderr, "unimplemented mulu2\n");
> -        tcg_abort();
> +        tcg_out_xmpyu(s, args[0], args[1], args[2], args[3]);
>          break;
> -    case INDEX_op_div2_i32:
> -        fprintf(stderr, "unimplemented div2\n");
> -        tcg_abort();
> +
> +    case INDEX_op_bswap16_i32:
> +        tcg_out_bswap16(s, args[0], args[1], 0);
>          break;
> -    case INDEX_op_divu2_i32:
> -        fprintf(stderr, "unimplemented divu2\n");
> -        tcg_abort();
> +    case INDEX_op_bswap32_i32:
> +        tcg_out_bswap32(s, args[0], args[1], TCG_REG_R20);
> +        break;
> +
> +    case INDEX_op_not_i32:
> +        tcg_out_arithi(s, args[0], args[1], -1, INSN_SUBI);
> +        break;
> +    case INDEX_op_ext8s_i32:
> +        tcg_out_ext8s(s, args[0], args[1]);
> +        break;
> +    case INDEX_op_ext16s_i32:
> +        tcg_out_ext16s(s, args[0], args[1]);
> +        break;
> +
> +    /* These three correspond exactly to the fallback implementation.
> +       But by including them we reduce the number of TCG ops that 
> +       need to be generated, and these opcodes are fairly common.  */

Are you sure it really makes a difference?
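
Since the comment says these correspond exactly to the fallback, the
saving can only be in the number of TCG ops the translator has to
process, not in the generated code.  As plain C identities (just an
illustration, not QEMU code):

  #include <assert.h>
  #include <stdint.h>

  /* What the three backend cases above compute.  */
  static uint32_t neg32(uint32_t a)  { return 0u - a; }      /* sub %r0,a   */
  static uint32_t ext8u(uint32_t a)  { return a & 0xffu; }   /* andi 0xff   */
  static uint32_t ext16u(uint32_t a) { return a & 0xffffu; } /* andi 0xffff */

  int main(void)
  {
      assert(neg32(5) == 0xfffffffbu);
      assert(ext8u(0x1234u) == 0x34u);
      assert(ext16u(0xdeadbeefu) == 0xbeefu);
      return 0;
  }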

> +    case INDEX_op_neg_i32:
> +        tcg_out_arith(s, args[0], TCG_REG_R0, args[1], INSN_SUB);
> +        break;
> +    case INDEX_op_ext8u_i32:
> +        tcg_out_andi(s, args[0], args[1], 0xff);
> +        break;
> +    case INDEX_op_ext16u_i32:
> +        tcg_out_andi(s, args[0], args[1], 0xffff);
>          break;
>  
>      case INDEX_op_brcond_i32:
> -        fprintf(stderr, "unimplemented brcond\n");
> -        tcg_abort();
> +        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1], args[3]);
> +        break;
> +    case INDEX_op_brcond2_i32:
> +        tcg_out_brcond2(s, args[4], args[0], args[1],
> +                        args[2], const_args[2],
> +                        args[3], const_args[3], args[5]);
> +        break;
> +
> +    case INDEX_op_setcond_i32:
> +        tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]);
> +        break;
> +    case INDEX_op_setcond2_i32:
> +        tcg_out_setcond2(s, args[5], args[0], args[1], args[2],
> +                         args[3], const_args[3], args[4], const_args[4]);
> +        break;
> +
> +    case INDEX_op_add2_i32:
> +        if (const_args[4]) {
> +            tcg_out_arithi(s, args[0], args[2], args[4], INSN_ADDI);
> +        } else {
> +            tcg_out_arith(s, args[0], args[2], args[4], INSN_ADD);
> +        }
> +        tcg_out_arith(s, args[1], args[3], args[5], INSN_ADDC);
> +        break;
> +
> +    case INDEX_op_sub2_i32:
> +        if (const_args[2]) {
> +            /* Recall that SUBI is a reversed subtract.  */
> +            tcg_out_arithi(s, args[0], args[4], args[2], INSN_SUBI);
> +        } else {
> +            tcg_out_arith(s, args[0], args[2], args[4], INSN_SUB);
> +        }
> +        tcg_out_arith(s, args[1], args[3], args[5], INSN_SUBB);
>          break;
>  
>      case INDEX_op_qemu_ld8u:
> @@ -866,6 +1460,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>      case INDEX_op_qemu_ld32:
>          tcg_out_qemu_ld(s, args, 2);
>          break;
> +    case INDEX_op_qemu_ld64:
> +        tcg_out_qemu_ld(s, args, 3);
> +        break;
>  
>      case INDEX_op_qemu_st8:
>          tcg_out_qemu_st(s, args, 0);
> @@ -876,47 +1473,70 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>      case INDEX_op_qemu_st32:
>          tcg_out_qemu_st(s, args, 2);
>          break;
> +    case INDEX_op_qemu_st64:
> +        tcg_out_qemu_st(s, args, 3);
> +        break;
>  
>      default:
>          fprintf(stderr, "unknown opcode 0x%x\n", opc);
>          tcg_abort();
>      }
> -    return;
> -
> -gen_arith:
> -    tcg_out_arith(s, args[0], args[1], args[2], c);
>  }
>  
>  static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_exit_tb, { } },
>      { INDEX_op_goto_tb, { } },
>  
> -    { INDEX_op_call, { "r" } },
> +    { INDEX_op_call, { "ri" } },
>      { INDEX_op_jmp, { "r" } },
>      { INDEX_op_br, { } },
>  
>      { INDEX_op_mov_i32, { "r", "r" } },
>      { INDEX_op_movi_i32, { "r" } },
> +
>      { INDEX_op_ld8u_i32, { "r", "r" } },
>      { INDEX_op_ld8s_i32, { "r", "r" } },
>      { INDEX_op_ld16u_i32, { "r", "r" } },
>      { INDEX_op_ld16s_i32, { "r", "r" } },
>      { INDEX_op_ld_i32, { "r", "r" } },
> -    { INDEX_op_st8_i32, { "r", "r" } },
> -    { INDEX_op_st16_i32, { "r", "r" } },
> -    { INDEX_op_st_i32, { "r", "r" } },
> +    { INDEX_op_st8_i32, { "rZ", "r" } },
> +    { INDEX_op_st16_i32, { "rZ", "r" } },
> +    { INDEX_op_st_i32, { "rZ", "r" } },
> +
> +    { INDEX_op_add_i32, { "r", "rZ", "ri" } },
> +    { INDEX_op_sub_i32, { "r", "rI", "ri" } },
> +    { INDEX_op_and_i32, { "r", "rZ", "ri" } },
> +    { INDEX_op_or_i32, { "r", "rZ", "ri" } },

Already commented for "and" and "or", but the same applies to add and
sub. Do we really need an "i" constraint here if the constant is going
to be loaded with a movi anyway?
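
Judging by the TCG_CT_CONST_S11/S5 definitions further down, "I" and
"J" accept signed 11-bit and 5-bit values; a hypothetical range check
along these lines (modeled on the check_fit_tl call used in
tcg_out_exit_tb, not code from the patch) is what decides whether a
constant can be encoded at all:

  #include <stdint.h>
  #include <stdio.h>

  /* Does val fit in a signed immediate field of the given width?  */
  static int fits_signed(int32_t val, int bits)
  {
      int32_t lim = 1 << (bits - 1);
      return val >= -lim && val < lim;
  }

  int main(void)
  {
      printf("1023 fits I? %d\n", fits_signed(1023, 11)); /* 1 */
      printf("1024 fits I? %d\n", fits_signed(1024, 11)); /* 0 */
      printf("-16 fits J?  %d\n", fits_signed(-16, 5));   /* 1 */
      printf("16 fits J?   %d\n", fits_signed(16, 5));    /* 0 */
      return 0;
  }

Everything outside those ranges ends up as a movi plus a register op
either way, so a tighter "I" constraint would let the register
allocator do that work instead of the backend.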

> +    { INDEX_op_xor_i32, { "r", "rZ", "rZ" } },
> +    { INDEX_op_andc_i32, { "r", "rZ", "ri" } },

same here.

> +
> +    { INDEX_op_mul_i32, { "r", "r", "r" } },
> +    { INDEX_op_mulu2_i32, { "r", "r", "r", "r" } },
>  
> -    { INDEX_op_add_i32, { "r", "r", "r" } },
> -    { INDEX_op_sub_i32, { "r", "r", "r" } },
> -    { INDEX_op_and_i32, { "r", "r", "r" } },
> -    { INDEX_op_or_i32, { "r", "r", "r" } },
> -    { INDEX_op_xor_i32, { "r", "r", "r" } },
> +    { INDEX_op_shl_i32, { "r", "r", "ri" } },
> +    { INDEX_op_shr_i32, { "r", "r", "ri" } },
> +    { INDEX_op_sar_i32, { "r", "r", "ri" } },
> +    { INDEX_op_rotl_i32, { "r", "r", "ri" } },
> +    { INDEX_op_rotr_i32, { "r", "r", "ri" } },
>  
> -    { INDEX_op_shl_i32, { "r", "r", "r" } },
> -    { INDEX_op_shr_i32, { "r", "r", "r" } },
> -    { INDEX_op_sar_i32, { "r", "r", "r" } },
> +    { INDEX_op_bswap16_i32, { "r", "r" } },
> +    { INDEX_op_bswap32_i32, { "r", "r" } },
> +    { INDEX_op_neg_i32, { "r", "r" } },
> +    { INDEX_op_not_i32, { "r", "r" } },
>  
> -    { INDEX_op_brcond_i32, { "r", "r" } },
> +    { INDEX_op_ext8s_i32, { "r", "r" } },
> +    { INDEX_op_ext8u_i32, { "r", "r" } },
> +    { INDEX_op_ext16s_i32, { "r", "r" } },
> +    { INDEX_op_ext16u_i32, { "r", "r" } },
> +
> +    { INDEX_op_brcond_i32, { "rZ", "rJ" } },
> +    { INDEX_op_brcond2_i32,  { "rZ", "rZ", "rJ", "rJ" } },
> +
> +    { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
> +    { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
> +
> +    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
> +    { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rZ", "rZ" } },
>  
>  #if TARGET_LONG_BITS == 32
>      { INDEX_op_qemu_ld8u, { "r", "L" } },
> @@ -926,10 +1546,10 @@ static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_qemu_ld32, { "r", "L" } },
>      { INDEX_op_qemu_ld64, { "r", "r", "L" } },
>  
> -    { INDEX_op_qemu_st8, { "L", "L" } },
> -    { INDEX_op_qemu_st16, { "L", "L" } },
> -    { INDEX_op_qemu_st32, { "L", "L" } },
> -    { INDEX_op_qemu_st64, { "L", "L", "L" } },
> +    { INDEX_op_qemu_st8, { "LZ", "L" } },
> +    { INDEX_op_qemu_st16, { "LZ", "L" } },
> +    { INDEX_op_qemu_st32, { "LZ", "L" } },
> +    { INDEX_op_qemu_st64, { "LZ", "LZ", "L" } },
>  #else
>      { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
>      { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
> @@ -938,25 +1558,98 @@ static const TCGTargetOpDef hppa_op_defs[] = {
>      { INDEX_op_qemu_ld32, { "r", "L", "L" } },
>      { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
>  
> -    { INDEX_op_qemu_st8, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st16, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st32, { "L", "L", "L" } },
> -    { INDEX_op_qemu_st64, { "L", "L", "L", "L" } },
> +    { INDEX_op_qemu_st8, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st16, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st32, { "LZ", "L", "L" } },
> +    { INDEX_op_qemu_st64, { "LZ", "LZ", "L", "L" } },
>  #endif
>      { -1 },
>  };
>  
> +static int tcg_target_callee_save_regs[] = {
> +    /* R2, the return address register, is saved specially
> +       in the caller's frame.  */
> +    /* R3, the frame pointer, is not currently modified.  */
> +    TCG_REG_R4,
> +    TCG_REG_R5,
> +    TCG_REG_R6,
> +    TCG_REG_R7,
> +    TCG_REG_R8,
> +    TCG_REG_R9,
> +    TCG_REG_R10,
> +    TCG_REG_R11,
> +    TCG_REG_R12,
> +    TCG_REG_R13,
> +    TCG_REG_R14,
> +    TCG_REG_R15,
> +    TCG_REG_R16,
> +    /* R17 is the global env, so no need to save.  */
> +    TCG_REG_R18
> +};
> +
> +void tcg_target_qemu_prologue(TCGContext *s)
> +{
> +    int frame_size, i;
> +
> +    /* Allocate space for the fixed frame marker.  */
> +    frame_size = -TCG_TARGET_CALL_STACK_OFFSET;
> +    frame_size += TCG_TARGET_STATIC_CALL_ARGS_SIZE;
> +
> +    /* Allocate space for the saved registers.  */
> +    frame_size += ARRAY_SIZE(tcg_target_callee_save_regs) * 4;
> +
> +    /* Align the allocated space.  */
> +    frame_size = ((frame_size + TCG_TARGET_STACK_ALIGN - 1)
> +                  & -TCG_TARGET_STACK_ALIGN);
> +
> +    /* The return address is stored in the caller's frame.  */
> +    tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -20);
> +
> +    /* Allocate stack frame, saving the first register at the same time.  */
> +    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
> +                 TCG_REG_SP, frame_size, INSN_STWM);
> +
> +    /* Save all callee saved registers.  */
> +    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
> +        tcg_out_st(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
> +                   TCG_REG_SP, -frame_size + i * 4);
> +    }
> +
> +    if (GUEST_BASE != 0) {
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
> +    }

The final GUEST_BASE value is computed after the prologue has been
generated. The value is modified in two cases:
- The user specifies a non-aligned base address.
- /proc/sys/vm/mmap_min_addr is different from 0, which has been the
  default configuration for more than one year.

When that happens, the guest crashes almost immediately.
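
A toy model of the ordering problem, with made-up names standing in
for the real linux-user code paths (none of this is QEMU code):

  #include <stdint.h>
  #include <stdio.h>

  static uintptr_t baked_base;   /* what the prologue's movi stored */

  static void emit_prologue(uintptr_t base) { baked_base = base; }

  static uintptr_t finalize_base(uintptr_t base, uintptr_t min_addr,
                                 uintptr_t align)
  {
      if (base < min_addr)                       /* mmap_min_addr policy */
          base = min_addr;
      return (base + align - 1) & ~(align - 1);  /* alignment fixup */
  }

  int main(void)
  {
      uintptr_t base = 0;
      emit_prologue(base);                       /* too early */
      base = finalize_base(base, 65536, 4096);   /* adjusted later */
      printf("prologue: %#lx, runtime: %#lx -> %s\n",
             (unsigned long)baked_base, (unsigned long)base,
             baked_base == base ? "ok" : "mismatch");
      return 0;
  }

So either the prologue has to be generated after guest_base is final,
or the register has to be reloaded once the value is known.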

> +    /* Jump to TB, and adjust R18 to be the return address.  */
> +    tcg_out32(s, INSN_BLE_SR4 | INSN_R2(TCG_REG_R26));
> +    tcg_out_mov(s, TCG_REG_R18, TCG_REG_R31);
> +
> +    /* Restore callee saved registers.  */
> +    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RP, TCG_REG_SP, -frame_size - 20);
> +    for (i = 1; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
> +        tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_callee_save_regs[i],
> +                   TCG_REG_SP, -frame_size + i * 4);
> +    }
> +
> +    /* Deallocate stack frame and return.  */
> +    tcg_out32(s, INSN_BV | INSN_R2(TCG_REG_RP));
> +    tcg_out_ldst(s, tcg_target_callee_save_regs[0],
> +                 TCG_REG_SP, -frame_size, INSN_LDWM);
> +}
> +
>  void tcg_target_init(TCGContext *s)
>  {
>      tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
> -    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
> -                     (1 << TCG_REG_R20) |
> -                     (1 << TCG_REG_R21) |
> -                     (1 << TCG_REG_R22) |
> -                     (1 << TCG_REG_R23) |
> -                     (1 << TCG_REG_R24) |
> -                     (1 << TCG_REG_R25) |
> -                     (1 << TCG_REG_R26));
> +
> +    tcg_regset_clear(tcg_target_call_clobber_regs);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R20);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R21);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R22);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R23);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R24);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R25);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R26);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET0);
> +    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RET1);
>  
>      tcg_regset_clear(s->reserved_regs);
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);  /* hardwired to zero */
> @@ -969,6 +1662,9 @@ void tcg_target_init(TCGContext *s)
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_DP);  /* data pointer */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);  /* stack pointer */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R31); /* ble link reg */
> +    if (GUEST_BASE != 0) {
> +        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
> +    }
>  
>      tcg_add_target_add_op_defs(hppa_op_defs);
>  }
> diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
> index e956e71..36b6949 100644
> --- a/tcg/hppa/tcg-target.h
> +++ b/tcg/hppa/tcg-target.h
> @@ -69,17 +69,33 @@ enum {
>      TCG_REG_R31,
>  };
>  
> +#define TCG_CT_CONST_0    0x0100
> +#define TCG_CT_CONST_S5   0x0200
> +#define TCG_CT_CONST_S11  0x0400
> +
>  /* used for function call generation */
>  #define TCG_REG_CALL_STACK TCG_REG_SP
> -#define TCG_TARGET_STACK_ALIGN 16
> +#define TCG_TARGET_STACK_ALIGN 64
> +#define TCG_TARGET_CALL_STACK_OFFSET -48
> +#define TCG_TARGET_STATIC_CALL_ARGS_SIZE 8*4
> +#define TCG_TARGET_CALL_ALIGN_ARGS 1
>  #define TCG_TARGET_STACK_GROWSUP
>  
>  /* optional instructions */
> -#define TCG_TARGET_HAS_div2_i32
> -//#define TCG_TARGET_HAS_ext8s_i32
> -//#define TCG_TARGET_HAS_ext16s_i32
> -//#define TCG_TARGET_HAS_bswap16_i32
> -//#define TCG_TARGET_HAS_bswap32_i32
> +// #define TCG_TARGET_HAS_div_i32
> +#define TCG_TARGET_HAS_rot_i32
> +#define TCG_TARGET_HAS_ext8s_i32
> +#define TCG_TARGET_HAS_ext16s_i32
> +#define TCG_TARGET_HAS_ext8u_i32
> +#define TCG_TARGET_HAS_ext16u_i32
> +#define TCG_TARGET_HAS_bswap16_i32
> +#define TCG_TARGET_HAS_bswap32_i32
> +#define TCG_TARGET_HAS_not_i32
> +#define TCG_TARGET_HAS_neg_i32
> +#define TCG_TARGET_HAS_andc_i32
> +// #define TCG_TARGET_HAS_orc_i32
> +
> +#define TCG_TARGET_HAS_GUEST_BASE
>  
>  /* Note: must be synced with dyngen-exec.h */
>  #define TCG_AREG0 TCG_REG_R17
> @@ -87,116 +103,12 @@ enum {
>  static inline void flush_icache_range(unsigned long start, unsigned long stop)
>  {
>      start &= ~31;
> -    while (start <= stop)
> -    {
> -        asm volatile ("fdc 0(%0)\n"
> -                      "sync\n"
> -                      "fic 0(%%sr4, %0)\n"
> -                      "sync\n"
> +    while (start <= stop) {
> +        asm volatile ("fdc 0(%0)\n\t"
> +                      "sync\n\t"
> +                      "fic 0(%%sr4, %0)\n\t"
> +                      "sync"
>                        : : "r"(start) : "memory");
>          start += 32;
>      }
>  }
> -
> -/* supplied by libgcc */
> -extern void *__canonicalize_funcptr_for_compare(void *);
> -
> -/* Field selection types defined by hppa */
> -#define rnd(x)                  (((x)+0x1000)&~0x1fff)
> -/* lsel: select left 21 bits */
> -#define lsel(v,a)               (((v)+(a))>>11)
> -/* rsel: select right 11 bits */
> -#define rsel(v,a)               (((v)+(a))&0x7ff)
> -/* lrsel with rounding of addend to nearest 8k */
> -#define lrsel(v,a)              (((v)+rnd(a))>>11)
> -/* rrsel with rounding of addend to nearest 8k */
> -#define rrsel(v,a)              ((((v)+rnd(a))&0x7ff)+((a)-rnd(a)))
> -
> -#define mask(x,sz)              ((x) & ~((1<<(sz))-1))
> -
> -static inline int reassemble_12(int as12)
> -{
> -    return (((as12 & 0x800) >> 11) |
> -            ((as12 & 0x400) >> 8) |
> -            ((as12 & 0x3ff) << 3));
> -}
> -
> -static inline int reassemble_14(int as14)
> -{
> -    return (((as14 & 0x1fff) << 1) |
> -            ((as14 & 0x2000) >> 13));
> -}
> -
> -static inline int reassemble_17(int as17)
> -{
> -    return (((as17 & 0x10000) >> 16) |
> -            ((as17 & 0x0f800) << 5) |
> -            ((as17 & 0x00400) >> 8) |
> -            ((as17 & 0x003ff) << 3));
> -}
> -
> -static inline int reassemble_21(int as21)
> -{
> -    return (((as21 & 0x100000) >> 20) |
> -            ((as21 & 0x0ffe00) >> 8) |
> -            ((as21 & 0x000180) << 7) |
> -            ((as21 & 0x00007c) << 14) |
> -            ((as21 & 0x000003) << 12));
> -}
> -
> -static inline void hppa_patch21l(uint32_t *insn, int val, int addend)
> -{
> -    val = lrsel(val, addend);
> -    *insn = mask(*insn, 21) | reassemble_21(val);
> -}
> -
> -static inline void hppa_patch14r(uint32_t *insn, int val, int addend)
> -{
> -    val = rrsel(val, addend);
> -    *insn = mask(*insn, 14) | reassemble_14(val);
> -}
> -
> -static inline void hppa_patch17r(uint32_t *insn, int val, int addend)
> -{
> -    val = rrsel(val, addend);
> -    *insn = (*insn & ~0x1f1ffd) | reassemble_17(val);
> -}
> -
> -
> -static inline void hppa_patch21l_dprel(uint32_t *insn, int val, int addend)
> -{
> -    register unsigned int dp asm("r27");
> -    hppa_patch21l(insn, val - dp, addend);
> -}
> -
> -static inline void hppa_patch14r_dprel(uint32_t *insn, int val, int addend)
> -{
> -    register unsigned int dp asm("r27");
> -    hppa_patch14r(insn, val - dp, addend);
> -}
> -
> -static inline void hppa_patch17f(uint32_t *insn, int val, int addend)
> -{
> -    int dot = (int)insn & ~0x3;
> -    int v = ((val + addend) - dot - 8) / 4;
> -    if (v > (1 << 16) || v < -(1 << 16)) {
> -        printf("cannot fit branch to offset %d [%08x->%08x]\n", v, dot, val);
> -        abort();
> -    }
> -    *insn = (*insn & ~0x1f1ffd) | reassemble_17(v);
> -}
> -
> -static inline void hppa_load_imm21l(uint32_t *insn, int val, int addend)
> -{
> -    /* Transform addil L'sym(%dp) to ldil L'val, %r1 */
> -    *insn = 0x20200000 | reassemble_21(lrsel(val, 0));
> -}
> -
> -static inline void hppa_load_imm14r(uint32_t *insn, int val, int addend)
> -{
> -    /* Transform ldw R'sym(%r1), %rN to ldo R'sym(%r1), %rN */
> -    hppa_patch14r(insn, val, addend);
> -    /* HACK */
> -    if (addend == 0)
> -        *insn = (*insn & ~0xfc000000) | (0x0d << 26);
> -}
> -- 
> 1.6.2.5
> 
> 
> 
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
address@hidden                 http://www.aurel32.net



