From: Yeongkyoon Lee
Subject: [Qemu-devel] [RFC][PATCH v2 3/4] tcg: add optimized TCG qemu_ld/st generation
Date: Thu, 05 Jul 2012 22:23:38 +0900
Add optimized TCG qemu_ld/st generation that emits the TLB-miss handling code
at the end of the TB, after all other IRs have been generated.
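
For illustration, the resulting code layout is sketched below in plain C (a
simplified, self-contained model, not QEMU code; guest_ram, tlb_tag, slow_load
and qemu_ldb are names invented for this sketch): the TLB-hit fast path stays
inline, while the TLB-miss slow path is moved to the end of the block and
jumps back to the fast-path continuation (raddr) when it is done.

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t guest_ram[16];
    static uintptr_t tlb_tag = (uintptr_t)-1; /* stand-in for one softmmu TLB entry */

    static uint8_t slow_load(uintptr_t addr)  /* stand-in for an __ext_ldb_mmu() helper */
    {
        tlb_tag = addr;                       /* refill the "TLB" */
        return guest_ram[addr % sizeof(guest_ram)];
    }

    static uint8_t qemu_ldb(uintptr_t addr)
    {
        uint8_t val;

        if (tlb_tag != addr) {                /* the 32-bit "jne slow_path" */
            goto slow_path;
        }
        val = guest_ram[addr % sizeof(guest_ram)]; /* TLB hit: inline fast path */
    done:                                     /* "raddr", the jump-back target */
        return val;

    slow_path:                                /* placed at the end of the block */
        val = slow_load(addr);                /* call the MMU helper */
        goto done;                            /* "jmp raddr" */
    }

    int main(void)
    {
        guest_ram[3] = 42;
        printf("%d %d\n", qemu_ldb(3), qemu_ldb(3)); /* slow path, then fast path */
        return 0;
    }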
Signed-off-by: Yeongkyoon Lee <address@hidden>
---
tcg/i386/tcg-target.c | 328 +++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.c | 12 ++
tcg/tcg.h | 35 +++++
3 files changed, 375 insertions(+), 0 deletions(-)
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..3f2f640 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -984,6 +984,8 @@ static const void *qemu_st_helpers[4] = {
helper_stq_mmu,
};
#else
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
/* legacy helper signature: __ld_mmu(target_ulong addr, int
mmu_idx) */
static void *qemu_ld_helpers[4] = {
@@ -1001,6 +1003,35 @@ static void *qemu_st_helpers[4] = {
__stl_mmu,
__stq_mmu,
};
+#else
+/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int
+   mmu_idx, uintptr_t raddr) */
+static void *qemu_ld_helpers[4] = {
+ __ext_ldb_mmu,
+ __ext_ldw_mmu,
+ __ext_ldl_mmu,
+ __ext_ldq_mmu,
+};
+
+/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+   int mmu_idx, uintptr_t raddr) */
+static void *qemu_st_helpers[4] = {
+ __ext_stb_mmu,
+ __ext_stw_mmu,
+ __ext_stl_mmu,
+ __ext_stq_mmu,
+};
+
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr);
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
#endif
/* Perform the TLB load and compare.
@@ -1061,19 +1092,36 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
tcg_out_mov(s, type, r0, addrlo);
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ /* jne slow_path */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+ if (!label_ptr) {
+ tcg_abort();
+ }
+ label_ptr[0] = s->code_ptr;
+ s->code_ptr += 4;
+#else
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[0] = s->code_ptr;
s->code_ptr++;
+#endif
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r1), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ /* jne slow_path */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+ label_ptr[1] = s->code_ptr;
+ s->code_ptr += 4;
+#else
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[1] = s->code_ptr;
s->code_ptr++;
+#endif
}
/* TLB Hit. */
@@ -1171,11 +1219,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
#if TCG_TARGET_REG_BITS == 64
int arg_idx;
#else
int stack_adjust;
#endif
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
uint8_t *label_ptr[3];
#endif
@@ -1197,6 +1247,18 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* helper stub will be jumped back here */
+ add_qemu_ldst_label(s,
+ opc,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
+#else
/* jmp label2 */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
@@ -1292,6 +1354,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#else
{
int32_t offset = GUEST_BASE;
@@ -1385,7 +1448,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
int stack_adjust;
+#endif
uint8_t *label_ptr[3];
#endif
@@ -1407,6 +1472,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_qemu_st_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* helper stub will be jumped back here */
+ add_qemu_ldst_label(s,
+ opc | HL_ST_MASK,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
+#else
/* jmp label2 */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
@@ -1469,6 +1546,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#else
{
int32_t offset = GUEST_BASE;
@@ -1496,6 +1574,256 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
#endif
}
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* optimization to reduce jump overhead for qemu_ld/st IRs */
+
+/*
+ * The qemu_ld/st code generators call add_qemu_ldst_label, so that the
+ * slow case (TLB miss or I/O read/write) is handled at the end of the TB.
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr)
+{
+ int idx;
+ TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+ idx = s->nb_qemu_ldst_labels++;
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+ label->opc_ext = opc_ext;
+ label->datalo_reg = data_reg;
+ label->datahi_reg = data_reg2;
+ label->addrlo_reg = addrlo_reg;
+ label->addrhi_reg = addrhi_reg;
+ label->mem_index = mem_index;
+ label->raddr = raddr;
+ if (!label_ptr) {
+ tcg_abort();
+ }
+ label->label_ptr[0] = label_ptr[0];
+ label->label_ptr[1] = label_ptr[1];
+}
+
+/* generates the slow case of qemu_ld at the end of the TB */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
+#if TCG_TARGET_REG_BITS == 64
+ int arg_idx;
+#else
+ int stack_adjust;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ int data_reg = label->datalo_reg;
+ int data_reg2 = label->datahi_reg;
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
+       uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+ stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
+ if (TARGET_LONG_BITS == 64) {
+ tcg_out_push(s, addrhi_reg);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, addrlo_reg); /* guest addr */
+ stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+ tcg_out_push(s, TCG_AREG0);
+ stack_adjust += 4;
+#endif
+#else
+ /* The first argument is already loaded with addrlo. */
+ arg_idx = 1;
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+ mem_index);
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+ (uintptr_t)(raddr - 1));
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* XXX/FIXME: suboptimal */
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ tcg_target_call_iarg_regs[2]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+ tcg_target_call_iarg_regs[1]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+ tcg_target_call_iarg_regs[0]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+ TCG_AREG0);
+#endif
+#endif
+
+ tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+ if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+ /* Pop and discard. This is 2 bytes smaller than the add. */
+ tcg_out_pop(s, TCG_REG_ECX);
+ } else if (stack_adjust != 0) {
+ tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+ }
+#endif
+
+    switch (opc) {
+ case 0 | 4:
+ tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 1 | 4:
+ tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 0:
+ tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 1:
+ tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 2:
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ break;
+#if TCG_TARGET_REG_BITS == 64
+ case 2 | 4:
+ tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+ break;
+#endif
+ case 3:
+ if (TCG_TARGET_REG_BITS == 64) {
+ tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+ } else if (data_reg == TCG_REG_EDX) {
+ /* xchg %edx, %eax */
+ tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+ } else {
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+ }
+ break;
+ default:
+ tcg_abort();
+ }
+
+ /* jump back to original code */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/* generates the slow case of qemu_st at the end of the TB */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int stack_adjust;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
+ int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+ int data_reg2 = label->datahi_reg;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+       int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+ stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
+ if (opc == 3) {
+ tcg_out_push(s, data_reg2);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, data_reg); /* guest data */
+ stack_adjust += 4;
+ if (TARGET_LONG_BITS == 64) {
+ tcg_out_push(s, addrhi_reg);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, addrlo_reg); /* guest addr */
+ stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+ tcg_out_push(s, TCG_AREG0);
+ stack_adjust += 4;
+#endif
+#else
+ tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+ tcg_target_call_iarg_regs[1], data_reg);
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                 (uintptr_t)(raddr - 1));
+ stack_adjust = 0;
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* XXX/FIXME: suboptimal */
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ tcg_target_call_iarg_regs[2]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+ tcg_target_call_iarg_regs[1]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+ tcg_target_call_iarg_regs[0]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+ TCG_AREG0);
+#endif
+#endif
+
+ tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
+
+ if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+ /* Pop and discard. This is 2 bytes smaller than the add. */
+ tcg_out_pop(s, TCG_REG_ECX);
+ } else if (stack_adjust != 0) {
+ tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+ }
+
+ /* jump back to original code */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/* generates all of the slow cases of qemu_ld/st at the end of TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+ int i;
+ TCGLabelQemuLdst *label;
+
+ for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+ if (IS_QEMU_LD_LABEL(label)) {
+ tcg_out_qemu_ld_slow_path(s, label);
+ } else {
+ tcg_out_qemu_st_slow_path(s, label);
+ }
+ }
+}
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
{
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..8009069 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,14 @@ void tcg_func_start(TCGContext *s)
gen_opc_ptr = gen_opc_buf;
gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* initialize qemu_ld/st labels, which help to generate TLB-miss case
+       code at the end of the TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
+ if (!s->qemu_ldst_labels) {
+ tcg_abort();
+ }
+ s->nb_qemu_ldst_labels = 0;
+#endif
}
static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2177,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
}
the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* Generate MMU call helpers at the end of the block (currently only for
+       qemu_ld/st) */
+ tcg_out_qemu_ldst_slow_path(s);
+#endif
return -1;
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..b174cdb 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@ typedef tcg_target_ulong TCGArg;
are aliases for target_ulong and host pointer sized values respectively.
*/
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* Macros and structures for qemu_ld/st IR code optimization:
+   TCG_MAX_QEMU_LDST is chosen as half of OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST 320
+#define HL_LDST_SHIFT 4
+#define HL_LDST_MASK (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK HL_LDST_MASK
+#define HL_OPC_MASK (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L) (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L) ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+    int opc_ext;            /* | 27 bits (reserved) | 1 bit (ld/st flag) | 4 bits (opc) | */
+    int addrlo_reg;         /* reg index for the low word of the guest virtual address */
+    int addrhi_reg;         /* reg index for the high word of the guest virtual address */
+    int datalo_reg;         /* reg index for the low word to be loaded or stored */
+    int datahi_reg;         /* reg index for the high word to be loaded or stored */
+    int mem_index;          /* soft MMU memory index */
+    uint8_t *raddr;         /* return address (located at the end of the TB) */
+    uint8_t *label_ptr[2];  /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
#ifdef CONFIG_DEBUG_TCG
#define DEBUG_TCGV 1
#endif
@@ -389,6 +412,13 @@ struct TCGContext {
#ifdef CONFIG_DEBUG_TCG
int temps_in_use;
#endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* labels info for qemu_ld/st IRs
+       (the labels help to generate TLB-miss case code at the end of the TB) */
+ TCGLabelQemuLdst *qemu_ldst_labels;
+ int nb_qemu_ldst_labels;
+#endif
};
extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@ extern uint8_t code_gen_prologue[];
#endif
void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* qemu_ld/st slow-path generation at the end of the TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif
--
1.7.4.1