qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available


From: Aurelien Jarno
Subject: Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
Date: Sun, 22 Dec 2013 21:03:21 +0100
User-agent: Mutt/1.5.21 (2010-09-15)

On Fri, Dec 20, 2013 at 03:00:12PM -0800, Richard Henderson wrote:
> As present on Atom and Haswell processors.
> 
> Signed-off-by: Richard Henderson <address@hidden>
> ---
>  disas/i386.c          |   8 ++--
>  tcg/i386/tcg-target.c | 127 
> ++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 91 insertions(+), 44 deletions(-)
> 
> Here's to "stress testing" a Haswell laptop before Santa delivers it.  ;-)
> 
> 
> r~
> 
> 
> diff --git a/disas/i386.c b/disas/i386.c
> index 47f1f2e..beb33f0 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
>  
>    /* PREGRP87 */
>    {
> -    { "(bad)",       { XX } },
> -    { "(bad)",       { XX } },
> +    { "movbe",       { Gv, M } },
> +    { "movbe",       { Gw, M } },

This is not correct, this disassemble movbe with the REPZ prefix.

>      { "(bad)",       { XX } },

You want it there, that is for the DATA prefix.

>      { "crc32",       { Gdq, { CRC32_Fixup, b_mode } } },
>    },
>  
>    /* PREGRP88 */
>    {
> -    { "(bad)",       { XX } },
> -    { "(bad)",       { XX } },
> +    { "movbe",       { M, Gv } },
> +    { "movbe",       { M, Gw } },
>      { "(bad)",       { XX } },

Ditto.

>      { "crc32",       { Gdq, { CRC32_Fixup, v_mode } } },
>    },
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7ac8e45..e76f3f3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
>  # define TCG_REG_L1 TCG_REG_EDX
>  #endif
>  
> +#ifdef CONFIG_CPUID_H
> +#include <cpuid.h>
> +#endif
> +
>  /* For 32-bit, we are going to attempt to determine at runtime whether cmov
>     is available.  However, the host compiler must supply <cpuid.h>, as we're
>     not going to go so far as our own inline assembly.  */
>  #if TCG_TARGET_REG_BITS == 64
>  # define have_cmov 1
>  #elif defined(CONFIG_CPUID_H)
> -#include <cpuid.h>
>  static bool have_cmov;
>  #else
>  # define have_cmov 0
>  #endif
>  
> +/* There is no pre-processor definition for MOVBE to fall back on.  */
> +#ifdef CONFIG_CPUID_H

As you already remarked, you should check for bit_MOVBE, as it is
something which has been introduced in GCC 4.6.

> +static bool have_movbe;
> +#else
> +#define have_movbe 0

Very minor nitpick, but you probably want to indent the define, like
above for cmov.

> +#endif
> +
>  static uint8_t *tb_ret_addr;
>  
>  static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long 
> val,
>  # define P_REXB_RM   0
>  # define P_GS           0
>  #endif
> +#define P_EXT38         0x8000          /* 0x0f 0x38 opcode prefix */
>  
>  #define OPC_ARITH_EvIz       (0x81)
>  #define OPC_ARITH_EvIb       (0x83)
> @@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long 
> val,
>  #define OPC_MOVB_EvIz   (0xc6)
>  #define OPC_MOVL_EvIz        (0xc7)
>  #define OPC_MOVL_Iv     (0xb8)
> +#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
> +#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
>  #define OPC_MOVSBL   (0xbe | P_EXT)
>  #define OPC_MOVSWL   (0xbf | P_EXT)
>  #define OPC_MOVSLQ   (0x63 | P_REXW)
> @@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, 
> int rm, int x)
>  
>      if (opc & P_EXT) {
>          tcg_out8(s, 0x0f);
> +        if (opc & P_EXT38) {
> +            tcg_out8(s, 0x38);
> +        }
>      }
>      tcg_out8(s, opc);
>  }
> @@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
>      }
>      if (opc & P_EXT) {
>          tcg_out8(s, 0x0f);
> +        if (opc & P_EXT38) {
> +            tcg_out8(s, 0x38);
> +        }
>      }
>      tcg_out8(s, opc);
>  }
> @@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>                                     TCGReg base, intptr_t ofs, int seg,
>                                     TCGMemOp memop)
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_GvEv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_GyMy;
> +    }

Defining movop is clever, that said the name real_bswap doesn't sounds
very self describing. Moreover you still need to check for have_movbe
below...

>      switch (memop & MO_SSIZE) {
>      case MO_UB:
> @@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>          tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, 
> ofs);
>          break;
>      case MO_UW:
> -        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -        if (bswap) {
> -            tcg_out_rolw_8(s, datalo);
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);
> +        } else {
> +            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_rolw_8(s, datalo);
> +            }

What about keeping the bswap definition like before and do something
like:

        if (bswap && have_movbe) {
            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                 datalo, base, ofs);
            tcg_out_ext16u(s, datalo, datalo);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
            if (bswap) {
                tcg_out_rolw_8(s, datalo);
            }
        }


>          }
>          break;
>      case MO_SW:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -            tcg_out_rolw_8(s, datalo);
> -            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
> +        if (real_bswap) {
> +            if (have_movbe) {
> +                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                     datalo, base, ofs);
> +            } else {
> +                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +                tcg_out_rolw_8(s, datalo);
> +            }
> +            tcg_out_ext16s(s, datalo, datalo, P_REXW);
>          } else {
>              tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
>                                   datalo, base, ofs);
>          }
>          break;
>      case MO_UL:
> -        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          if (bswap) {
>              tcg_out_bswap32(s, datalo);
>          }

And here:
        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        if (bswap && !has_movbe) {
            tcg_out_bswap32(s, datalo);
        }

>          break;
>  #if TCG_TARGET_REG_BITS == 64
>      case MO_SL:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> -            tcg_out_bswap32(s, datalo);
> +        if (real_bswap) {
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_bswap32(s, datalo);
> +            }
>              tcg_out_ext32s(s, datalo, datalo);
>          } else {

And here:
        if (bswap) {
            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
            if (!has_movbe) {
                tcg_out_bswap32(s, datalo);
            }
            tcg_out_ext32s(s, datalo, datalo);
        } else {


>              tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
> @@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>  #endif
>      case MO_Q:
>          if (TCG_TARGET_REG_BITS == 64) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
> -                                 datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
>              if (bswap) {
>                  tcg_out_bswap64(s, datalo);
>              }
>          } else {
> -            if (bswap) {
> +            if (real_bswap) {
>                  int t = datalo;
>                  datalo = datahi;
>                  datahi = t;
>              }
>              if (base != datalo) {
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datalo, base, ofs);
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datahi, base, ofs + 4);
> +                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
>              } else {
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datahi, base, ofs + 4);
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datalo, base, ofs);
> +                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
> +                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>              }
>              if (bswap) {
>                  tcg_out_bswap32(s, datalo);
> @@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>                                     TCGReg base, intptr_t ofs, int seg,
>                                     TCGMemOp memop)
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> -
>      /* ??? Ideally we wouldn't need a scratch register.  For user-only,
>         we could perform the bswap twice to restore the original value
>         instead of moving to the scratch.  But as it is, the L constraint
>         means that TCG_REG_L0 is definitely free here.  */
>      const TCGReg scratch = TCG_REG_L0;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_EvGv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_MyGy;
> +    }

Ditto here

>      switch (memop & MO_SIZE) {
>      case MO_8:
> @@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>              tcg_out_rolw_8(s, scratch);
>              datalo = scratch;
>          }
> -        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
> -                             datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
>          break;
>      case MO_32:
>          if (bswap) {
> @@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>              tcg_out_bswap32(s, scratch);
>              datalo = scratch;
>          }
> -        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          break;
>      case MO_64:
>          if (TCG_TARGET_REG_BITS == 64) {
> @@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>                  tcg_out_bswap64(s, scratch);
>                  datalo = scratch;
>              }
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
> -                                 datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
>          } else if (bswap) {
>              tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
>              tcg_out_bswap32(s, scratch);
> @@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
> TCGReg datalo, TCGReg datahi,
>              tcg_out_bswap32(s, scratch);
>              tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, 
> ofs+4);
>          } else {
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, 
> ofs+4);
> +            if (real_bswap) {
> +                int t = datalo;
> +                datalo = datahi;
> +                datahi = t;
> +            }
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
>          }
>          break;
>      default:
> @@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>  
>  static void tcg_target_init(TCGContext *s)
>  {
> -    /* For 32-bit, 99% certainty that we're running on hardware that supports
> -       cmov, but we still need to check.  In case cmov is not available, 
> we'll
> -       use a small forward branch.  */
> +    unsigned a, b, c, d;
> +
> +    if (__get_cpuid(1, &a, &b, &c, &d)) {
>  #ifndef have_cmov
> -    {
> -        unsigned a, b, c, d;
> -        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
> -    }
> +        /* For 32-bit, 99% certainty that we're running on hardware that
> +           supports cmov, but we still need to check.  In case cmov is not
> +           available, we'll use a small forward branch.  */
> +        have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_movbe
> +        have_movbe = (c & bit_MOVBE) != 0;
> +#endif
> +    }

Your code doesn't check the result of __get_cpuid anymore to initialize
have_cmov or have_movbe.  While C standard mandates that static
variables are initialized to 0, it might be a good idea to do so
explicitly. 

>      if (TCG_TARGET_REG_BITS == 64) {
>          tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);

Otherwise looks good.


-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
address@hidden                 http://www.aurel32.net



reply via email to

[Prev in Thread] Current Thread [Next in Thread]