[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
From: |
Aurelien Jarno |
Subject: |
Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available |
Date: |
Sun, 22 Dec 2013 21:03:21 +0100 |
User-agent: |
Mutt/1.5.21 (2010-09-15) |
On Fri, Dec 20, 2013 at 03:00:12PM -0800, Richard Henderson wrote:
> As present on Atom and Haswell processors.
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
> disas/i386.c | 8 ++--
> tcg/i386/tcg-target.c | 127
> ++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 91 insertions(+), 44 deletions(-)
>
> Here's to "stress testing" a Haswell laptop before Santa delivers it. ;-)
>
>
> r~
>
>
> diff --git a/disas/i386.c b/disas/i386.c
> index 47f1f2e..beb33f0 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
>
> /* PREGRP87 */
> {
> - { "(bad)", { XX } },
> - { "(bad)", { XX } },
> + { "movbe", { Gv, M } },
> + { "movbe", { Gw, M } },
This is not correct, this disassemble movbe with the REPZ prefix.
> { "(bad)", { XX } },
You want it there, that is for the DATA prefix.
> { "crc32", { Gdq, { CRC32_Fixup, b_mode } } },
> },
>
> /* PREGRP88 */
> {
> - { "(bad)", { XX } },
> - { "(bad)", { XX } },
> + { "movbe", { M, Gv } },
> + { "movbe", { M, Gw } },
> { "(bad)", { XX } },
Ditto.
> { "crc32", { Gdq, { CRC32_Fixup, v_mode } } },
> },
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7ac8e45..e76f3f3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
> # define TCG_REG_L1 TCG_REG_EDX
> #endif
>
> +#ifdef CONFIG_CPUID_H
> +#include <cpuid.h>
> +#endif
> +
> /* For 32-bit, we are going to attempt to determine at runtime whether cmov
> is available. However, the host compiler must supply <cpuid.h>, as we're
> not going to go so far as our own inline assembly. */
> #if TCG_TARGET_REG_BITS == 64
> # define have_cmov 1
> #elif defined(CONFIG_CPUID_H)
> -#include <cpuid.h>
> static bool have_cmov;
> #else
> # define have_cmov 0
> #endif
>
> +/* There is no pre-processor definition for MOVBE to fall back on. */
> +#ifdef CONFIG_CPUID_H
As you already remarked, you should check for bit_MOVBE, as it is
something which has been introduced in GCC 4.6.
> +static bool have_movbe;
> +#else
> +#define have_movbe 0
Very minor nitpick, but you probably want to indent the define, like
above for cmov.
> +#endif
> +
> static uint8_t *tb_ret_addr;
>
> static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long
> val,
> # define P_REXB_RM 0
> # define P_GS 0
> #endif
> +#define P_EXT38 0x8000 /* 0x0f 0x38 opcode prefix */
>
> #define OPC_ARITH_EvIz (0x81)
> #define OPC_ARITH_EvIb (0x83)
> @@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long
> val,
> #define OPC_MOVB_EvIz (0xc6)
> #define OPC_MOVL_EvIz (0xc7)
> #define OPC_MOVL_Iv (0xb8)
> +#define OPC_MOVBE_GyMy (0xf0 | P_EXT | P_EXT38)
> +#define OPC_MOVBE_MyGy (0xf1 | P_EXT | P_EXT38)
> #define OPC_MOVSBL (0xbe | P_EXT)
> #define OPC_MOVSWL (0xbf | P_EXT)
> #define OPC_MOVSLQ (0x63 | P_REXW)
> @@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r,
> int rm, int x)
>
> if (opc & P_EXT) {
> tcg_out8(s, 0x0f);
> + if (opc & P_EXT38) {
> + tcg_out8(s, 0x38);
> + }
> }
> tcg_out8(s, opc);
> }
> @@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
> }
> if (opc & P_EXT) {
> tcg_out8(s, 0x0f);
> + if (opc & P_EXT38) {
> + tcg_out8(s, 0x38);
> + }
> }
> tcg_out8(s, opc);
> }
> @@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> TCGReg base, intptr_t ofs, int seg,
> TCGMemOp memop)
> {
> - const TCGMemOp bswap = memop & MO_BSWAP;
> + const TCGMemOp real_bswap = memop & MO_BSWAP;
> + TCGMemOp bswap = real_bswap;
> + int movop = OPC_MOVL_GvEv;
> +
> + if (real_bswap && have_movbe) {
> + bswap = 0;
> + movop = OPC_MOVBE_GyMy;
> + }
Defining movop is clever, that said the name real_bswap doesn't sounds
very self describing. Moreover you still need to check for have_movbe
below...
> switch (memop & MO_SSIZE) {
> case MO_UB:
> @@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base,
> ofs);
> break;
> case MO_UW:
> - tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> - if (bswap) {
> - tcg_out_rolw_8(s, datalo);
> + if (real_bswap && have_movbe) {
> + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> + datalo, base, ofs);
> + tcg_out_ext16u(s, datalo, datalo);
> + } else {
> + tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> + if (bswap) {
> + tcg_out_rolw_8(s, datalo);
> + }
What about keeping the bswap definition like before and do something
like:
if (bswap && have_movbe) {
tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
datalo, base, ofs);
tcg_out_ext16u(s, datalo, datalo);
} else {
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
if (bswap) {
tcg_out_rolw_8(s, datalo);
}
}
> }
> break;
> case MO_SW:
> - if (bswap) {
> - tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> - tcg_out_rolw_8(s, datalo);
> - tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
> + if (real_bswap) {
> + if (have_movbe) {
> + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> + datalo, base, ofs);
> + } else {
> + tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> + tcg_out_rolw_8(s, datalo);
> + }
> + tcg_out_ext16s(s, datalo, datalo, P_REXW);
> } else {
> tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
> datalo, base, ofs);
> }
> break;
> case MO_UL:
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> if (bswap) {
> tcg_out_bswap32(s, datalo);
> }
And here:
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
if (bswap && !has_movbe) {
tcg_out_bswap32(s, datalo);
}
> break;
> #if TCG_TARGET_REG_BITS == 64
> case MO_SL:
> - if (bswap) {
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> - tcg_out_bswap32(s, datalo);
> + if (real_bswap) {
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> + if (bswap) {
> + tcg_out_bswap32(s, datalo);
> + }
> tcg_out_ext32s(s, datalo, datalo);
> } else {
And here:
if (bswap) {
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
if (!has_movbe) {
tcg_out_bswap32(s, datalo);
}
tcg_out_ext32s(s, datalo, datalo);
} else {
> tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
> @@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> #endif
> case MO_Q:
> if (TCG_TARGET_REG_BITS == 64) {
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
> - datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
> if (bswap) {
> tcg_out_bswap64(s, datalo);
> }
> } else {
> - if (bswap) {
> + if (real_bswap) {
> int t = datalo;
> datalo = datahi;
> datahi = t;
> }
> if (base != datalo) {
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> - datalo, base, ofs);
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> - datahi, base, ofs + 4);
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
> } else {
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> - datahi, base, ofs + 4);
> - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> - datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> }
> if (bswap) {
> tcg_out_bswap32(s, datalo);
> @@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> TCGReg base, intptr_t ofs, int seg,
> TCGMemOp memop)
> {
> - const TCGMemOp bswap = memop & MO_BSWAP;
> -
> /* ??? Ideally we wouldn't need a scratch register. For user-only,
> we could perform the bswap twice to restore the original value
> instead of moving to the scratch. But as it is, the L constraint
> means that TCG_REG_L0 is definitely free here. */
> const TCGReg scratch = TCG_REG_L0;
> + const TCGMemOp real_bswap = memop & MO_BSWAP;
> + TCGMemOp bswap = real_bswap;
> + int movop = OPC_MOVL_EvGv;
> +
> + if (real_bswap && have_movbe) {
> + bswap = 0;
> + movop = OPC_MOVBE_MyGy;
> + }
Ditto here
> switch (memop & MO_SIZE) {
> case MO_8:
> @@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> tcg_out_rolw_8(s, scratch);
> datalo = scratch;
> }
> - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
> - datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
> break;
> case MO_32:
> if (bswap) {
> @@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> tcg_out_bswap32(s, scratch);
> datalo = scratch;
> }
> - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> break;
> case MO_64:
> if (TCG_TARGET_REG_BITS == 64) {
> @@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> tcg_out_bswap64(s, scratch);
> datalo = scratch;
> }
> - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
> - datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
> } else if (bswap) {
> tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
> tcg_out_bswap32(s, scratch);
> @@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
> tcg_out_bswap32(s, scratch);
> tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base,
> ofs+4);
> } else {
> - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base,
> ofs+4);
> + if (real_bswap) {
> + int t = datalo;
> + datalo = datahi;
> + datahi = t;
> + }
> + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
> }
> break;
> default:
> @@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
> static void tcg_target_init(TCGContext *s)
> {
> - /* For 32-bit, 99% certainty that we're running on hardware that supports
> - cmov, but we still need to check. In case cmov is not available,
> we'll
> - use a small forward branch. */
> + unsigned a, b, c, d;
> +
> + if (__get_cpuid(1, &a, &b, &c, &d)) {
> #ifndef have_cmov
> - {
> - unsigned a, b, c, d;
> - have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
> - }
> + /* For 32-bit, 99% certainty that we're running on hardware that
> + supports cmov, but we still need to check. In case cmov is not
> + available, we'll use a small forward branch. */
> + have_cmov = (d & bit_CMOV) != 0;
> #endif
> +#ifndef have_movbe
> + have_movbe = (c & bit_MOVBE) != 0;
> +#endif
> + }
Your code doesn't check the result of __get_cpuid anymore to initialize
have_cmov or have_movbe. While C standard mandates that static
variables are initialized to 0, it might be a good idea to do so
explicitly.
> if (TCG_TARGET_REG_BITS == 64) {
> tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
Otherwise looks good.
--
Aurelien Jarno GPG: 1024D/F1BCDB73
address@hidden http://www.aurel32.net