qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 18/20] tcg: Save insn data and use it in cpu_res


From: Peter Maydell
Subject: Re: [Qemu-devel] [PATCH 18/20] tcg: Save insn data and use it in cpu_restore_state_from_tb
Date: Thu, 10 Sep 2015 14:49:13 +0100

On 2 September 2015 at 06:52, Richard Henderson <address@hidden> wrote:
> We can now restore state without retranslation.
>
> Signed-off-by: Richard Henderson <address@hidden>
> ---
>  include/exec/exec-all.h |   1 +
>  tcg/tcg.c               |  11 ++++-
>  tcg/tcg.h               |   3 +-
>  translate-all.c         | 129 
> +++++++++++++++++++++++++++++++++---------------
>  4 files changed, 100 insertions(+), 44 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 8c347ba..315f20a 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -198,6 +198,7 @@ struct TranslationBlock {
>  #define CF_USE_ICOUNT  0x20000
>
>      void *tc_ptr;    /* pointer to the translated code */
> +    uint8_t *tc_search;  /* pointer to search data */
>      /* next matching tb for physical address. */
>      struct TranslationBlock *phys_hash_next;
>      /* original tb when cflags has CF_NOCACHE */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index d956f0b..3541d4c 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2300,7 +2300,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>                                        tcg_insn_unit *gen_code_buf,
>                                        long search_pc)
>  {
> -    int i, oi, oi_next;
> +    int i, oi, oi_next, num_insns;
>
>  #ifdef DEBUG_DISAS
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
> @@ -2344,6 +2344,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>
>      tcg_out_tb_init(s);
>
> +    num_insns = -1;
>      for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
>          TCGOp * const op = &s->gen_op_buf[oi];
>          TCGArg * const args = &s->gen_opparam_buf[op->args];
> @@ -2367,6 +2368,10 @@ static inline int tcg_gen_code_common(TCGContext *s,
>              tcg_reg_alloc_movi(s, args, dead_args, sync_args);
>              break;
>          case INDEX_op_insn_start:
> +            if (num_insns >= 0) {
> +                s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
> +            }
> +            num_insns++;
>              for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
>                  target_ulong a;
>  #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> @@ -2374,7 +2379,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>  #else
>                  a = args[i];
>  #endif
> -                s->gen_opc_data[i] = a;
> +                s->gen_insn_data[num_insns][i] = a;
>              }
>              break;
>          case INDEX_op_discard:
> @@ -2406,6 +2411,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
>          check_regs(s);
>  #endif
>      }
> +    tcg_debug_assert(num_insns >= 0);

This is claiming that every TB will have at least one insn_start,
right? I think that most targets will violate that in the breakpoint
case, because the "if we have a bp for this insn then generate a
debug insn and break out of the loop" code is before the call
to tcg_gen_insn_start().

We should probably assert that num_insns < TCG_MAX_INSNS while
we're here.

> +    s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 794b757..11cc107 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -581,7 +581,8 @@ struct TCGContext {
>      uint16_t gen_opc_icount[OPC_BUF_SIZE];
>      uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
>
> -    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
> +    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> +    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
>  };
>
>  extern TCGContext tcg_ctx;
> diff --git a/translate-all.c b/translate-all.c
> index 74be98a..a31f839 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -138,58 +138,65 @@ void cpu_gen_init(void)
>      tcg_context_init(&tcg_ctx);
>  }
>
> -/* The cpu state corresponding to 'searched_pc' is restored.
> - */
> +static target_long decode_sleb128(uint8_t **pp)
> +{
> +    uint8_t *p = *pp;
> +    target_long val = 0;
> +    int byte, shift = 0;
> +
> +    do {
> +        byte = *p++;
> +        val |= (target_ulong)(byte & 0x7f) << shift;
> +        shift += 7;
> +    } while (byte & 0x80);
> +    if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
> +        val |= -(target_ulong)1 << shift;
> +    }
> +
> +    *pp = p;
> +    return val;
> +}

Are the encode/decode sleb128 functions known-good ones
borrowed from somewhere else?

(PS: checkpatch complains about missing braces.)

> +
> +/* The cpu state corresponding to 'searched_pc' is restored.  */
>  static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>                                       uintptr_t searched_pc)
>  {
> +    target_ulong data[TARGET_INSN_START_WORDS] = { };
> +    uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
>      CPUArchState *env = cpu->env_ptr;
> -    TCGContext *s = &tcg_ctx;
> -    int j;
> -    uintptr_t tc_ptr;
> +    uint8_t *p = tb->tc_search;
> +    int i, j, num_insns = tb->icount;
>  #ifdef CONFIG_PROFILER
> -    int64_t ti;
> +    int64_t ti = profile_getclock();
>  #endif
>
> -#ifdef CONFIG_PROFILER
> -    ti = profile_getclock();
> -#endif
> -    tcg_func_start(s);
> +    if (searched_pc < host_pc) {
> +        return -1;
> +    }
>
> -    gen_intermediate_code_pc(env, tb);
> +    /* Reconstruct the stored insn data while looking for the point at
> +       which the end of the insn exceeds the searched_pc.  */
> +    for (i = 0; i < num_insns; ++i) {
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            data[j] += decode_sleb128(&p);
> +        }
> +        host_pc += decode_sleb128(&p);
> +        if (host_pc > searched_pc) {
> +            goto found;
> +        }
> +    }
> +    return -1;
>
> + found:
>      if (tb->cflags & CF_USE_ICOUNT) {
>          assert(use_icount);
>          /* Reset the cycle counter to the start of the block.  */
> -        cpu->icount_decr.u16.low += tb->icount;
> +        cpu->icount_decr.u16.low += num_insns;
>          /* Clear the IO flag.  */
>          cpu->can_do_io = 0;
>      }
> -
> -    /* find opc index corresponding to search_pc */
> -    tc_ptr = (uintptr_t)tb->tc_ptr;
> -    if (searched_pc < tc_ptr)
> -        return -1;
> -
> -    s->tb_next_offset = tb->tb_next_offset;
> -#ifdef USE_DIRECT_JUMP
> -    s->tb_jmp_offset = tb->tb_jmp_offset;
> -    s->tb_next = NULL;
> -#else
> -    s->tb_jmp_offset = NULL;
> -    s->tb_next = tb->tb_next;
> -#endif
> -    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
> -                               searched_pc - tc_ptr);
> -    if (j < 0)
> -        return -1;
> -    /* now find start of instruction before */
> -    while (s->gen_opc_instr_start[j] == 0) {
> -        j--;
> -    }
> -    cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
> -
> -    restore_state_to_opc(env, tb, s->gen_opc_data);
> +    cpu->icount_decr.u16.low -= i;
> +    restore_state_to_opc(env, tb, data);
>
>  #ifdef CONFIG_PROFILER
>      s->restore_time += profile_getclock() - ti;
> @@ -933,6 +940,44 @@ static void build_page_bitmap(PageDesc *p)
>      }
>  }
>
> +static uint8_t *encode_sleb128(uint8_t *p, target_long val)
> +{
> +    int more, byte;
> +
> +    do {
> +        byte = val & 0x7f;
> +        val >>= 7;
> +        more = !((val == 0 && (byte & 0x40) == 0)
> +                 || (val == -1 && (byte & 0x40) != 0));
> +        if (more)
> +          byte |= 0x80;
> +        *p++ = byte;
> +    } while (more);
> +
> +    return p;
> +}
> +
> +static int encode_search(TranslationBlock *tb, uint8_t *block)
> +{

I think this function would benefit from a brief comment
describing the compressed format we're creating here.

> +    uint8_t *p = block;
> +    int i, j, n;
> +
> +    tb->tc_search = block;
> +
> +    for (i = 0, n = tb->icount; i < n; ++i) {
> +        target_ulong prev;
> +
> +        for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> +            prev = (i == 0 ? 0 : tcg_ctx.gen_insn_data[i - 1][j]);
> +            p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
> +        }
> +        prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
> +        p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
> +    }
> +
> +    return p - block;
> +}
> +
>  TranslationBlock *tb_gen_code(CPUState *cpu,
>                                target_ulong pc, target_ulong cs_base,
>                                int flags, int cflags)
> @@ -942,7 +987,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      tb_page_addr_t phys_pc, phys_page2;
>      target_ulong virt_page2;
>      tcg_insn_unit *gen_code_buf;
> -    int gen_code_size;
> +    int gen_code_size, search_size;
>  #ifdef CONFIG_PROFILER
>      int64_t ti;
>  #endif
> @@ -998,11 +1043,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>
>      gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
> +    search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);

Now we're putting the encoded search info in the codegen buffer,
don't we need to adjust the calculation of code_gen_buffer_max_size
to avoid falling off the end if the last TB in the buffer has a very
large set of generated TCG code and also a big encoded search buffer?

It would also be nice to assert if we do fall off the end of the
buffer somehow.

>
>  #ifdef CONFIG_PROFILER
>      tcg_ctx.code_time += profile_getclock();
>      tcg_ctx.code_in_len += tb->size;
> -    tcg_ctx.code_out_len += gen_code_size;
> +    tcg_ctx.code_out_len += gen_code_size + search_size;
>  #endif

How much extra space does the encoded search typically take (as a
% of the gen_code_size, say)?

>
>  #ifdef DEBUG_DISAS
> @@ -1014,8 +1060,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      }
>  #endif
>
> -    tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
> -            gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
> +    tcg_ctx.code_gen_ptr = (void *)
> +        (((uintptr_t)gen_code_buf + gen_code_size + search_size
> +          + CODE_GEN_ALIGN - 1) & -CODE_GEN_ALIGN);

If we're messing with this line anyway we might as well use ROUND_UP:

    tcg_ctx.code_gen_ptr = (void *)
         ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
                  CODE_GEN_ALIGN);

>
>      /* check next page if needed */
>      virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
> --
> 2.4.3

thanks
-- PMM



reply via email to

[Prev in Thread] Current Thread [Next in Thread]