qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC 3/7] translate-all: use a binary search tree to track


From: Emilio G. Cota
Subject: [Qemu-devel] [RFC 3/7] translate-all: use a binary search tree to track TBs in TBContext
Date: Thu, 29 Jun 2017 16:28:25 -0400

This is a prerequisite for having threads generate code on separate
buffers, which will help scalability when booting multiple cores
under MTTCG.

Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer.

Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which results
in a small slowdown when booting debian-arm:

Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
        -machine type=virt -nographic -smp 1 -m 4096 \
        -netdev user,id=unet,hostfwd=tcp::2222-:22 \
        -device virtio-net-device,netdev=unet \
        -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
        -device virtio-blk-device,drive=myblock \
        -kernel img/arm/aarch32-current-linux-kernel-only.img \
        -append console=ttyAMA0 root=/dev/vda1 \
        -name arm,debug-threads=on -smp 1' (10 runs):

- Before:
      10289.389753      task-clock (msec)         #    0.952 CPUs utilized      
      ( +-  0.13% )
            18,238      context-switches          #    0.002 M/sec              
      ( +-  0.73% )
                 0      cpu-migrations            #    0.000 K/sec
            86,555      page-faults               #    0.008 M/sec              
      ( +-  0.49% )
    45,079,926,395      cycles                    #    4.381 GHz                
      ( +-  0.14% )
   <not supported>      stalled-cycles-frontend
   <not supported>      stalled-cycles-backend
    84,582,463,603      instructions              #    1.88  insns per cycle    
      ( +-  0.26% )
    14,964,335,400      branches                  # 1454.346 M/sec              
      ( +-  0.29% )
       288,324,215      branch-misses             #    1.93% of all branches    
      ( +-  0.34% )

      10.813687279 seconds time elapsed                                         
 ( +-  0.42% )

- After:
      10333.181473      task-clock (msec)         #    0.944 CPUs utilized      
      ( +-  0.27% )
            18,167      context-switches          #    0.002 M/sec              
      ( +-  0.20% )
                 0      cpu-migrations            #    0.000 K/sec
            83,354      page-faults               #    0.008 M/sec              
      ( +-  0.92% )
    45,247,697,926      cycles                    #    4.379 GHz                
      ( +-  0.23% )
   <not supported>      stalled-cycles-frontend
   <not supported>      stalled-cycles-backend
    84,537,657,945      instructions              #    1.87  insns per cycle    
      ( +-  0.18% )
    14,988,568,500      branches                  # 1450.528 M/sec              
      ( +-  0.21% )
       294,765,097      branch-misses             #    1.97% of all branches    
      ( +-  0.43% )

      10.946641611 seconds time elapsed                                         
 ( +-  0.79% )

Signed-off-by: Emilio G. Cota <address@hidden>
---
 include/exec/tb-context.h |   4 +-
 accel/tcg/translate-all.c | 181 ++++++++++++++++++++--------------------------
 2 files changed, 81 insertions(+), 104 deletions(-)

diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h
index 25c2afe..1fa8dcc 100644
--- a/include/exec/tb-context.h
+++ b/include/exec/tb-context.h
@@ -31,10 +31,8 @@ typedef struct TBContext TBContext;
 
 struct TBContext {
 
-    TranslationBlock **tbs;
+    GTree *tb_tree;
     struct qht htable;
-    size_t tbs_size;
-    int nb_tbs;
     /* any access to the tbs or the page table must use this lock */
     QemuMutex tb_lock;
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index da91482..a18fbf7 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -770,6 +770,21 @@ static inline void *alloc_code_gen_buffer(void)
 }
 #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
 
+/* @key is already in the tree so it's safe to use container_of on it */
+static gint tc_ptr_cmp(gconstpointer candidate, gconstpointer key)
+{
+    uintptr_t a = *(uintptr_t *)candidate;
+    const TranslationBlock *tb = container_of(key, TranslationBlock, tc_ptr);
+    uintptr_t b = (uintptr_t)tb->tc_ptr;
+
+    if (a >= b + tb->out_size) {
+        return 1;
+    } else if (a < b) {
+        return -1;
+    }
+    return 0;
+}
+
 static inline void code_gen_alloc(size_t tb_size)
 {
     tcg_ctx.code_gen_buffer_size = size_code_gen_buffer(tb_size);
@@ -778,15 +793,7 @@ static inline void code_gen_alloc(size_t tb_size)
         fprintf(stderr, "Could not allocate dynamic translator buffer\n");
         exit(1);
     }
-
-    /* size this conservatively -- realloc later if needed */
-    tcg_ctx.tb_ctx.tbs_size =
-        tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
-    if (unlikely(!tcg_ctx.tb_ctx.tbs_size)) {
-        tcg_ctx.tb_ctx.tbs_size = 64 * 1024;
-    }
-    tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx.tb_ctx.tbs_size);
-
+    tcg_ctx.tb_ctx.tb_tree = g_tree_new(tc_ptr_cmp);
     qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
 }
 
@@ -827,7 +834,6 @@ bool tcg_enabled(void)
 static TranslationBlock *tb_alloc(target_ulong pc)
 {
     TranslationBlock *tb;
-    TBContext *ctx;
 
     assert_tb_locked();
 
@@ -835,12 +841,6 @@ static TranslationBlock *tb_alloc(target_ulong pc)
     if (unlikely(tb == NULL)) {
         return NULL;
     }
-    ctx = &tcg_ctx.tb_ctx;
-    if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
-        ctx->tbs_size *= 2;
-        ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
-    }
-    ctx->tbs[ctx->nb_tbs++] = tb;
     return tb;
 }
 
@@ -849,16 +849,7 @@ void tb_free(TranslationBlock *tb)
 {
     assert_tb_locked();
 
-    /* In practice this is mostly used for single use temporary TB
-       Ignore the hard cases and just back up if this TB happens to
-       be the last one generated.  */
-    if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
-            tb == tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
-        size_t struct_size = ROUND_UP(sizeof(*tb), qemu_icache_linesize);
-
-        tcg_ctx.code_gen_ptr = tb->tc_ptr - struct_size;
-        tcg_ctx.tb_ctx.nb_tbs--;
-    }
+    g_tree_remove(tcg_ctx.tb_ctx.tb_tree, &tb->tc_ptr);
 }
 
 static inline void invalidate_page_bitmap(PageDesc *p)
@@ -906,6 +897,8 @@ static void page_flush_tb(void)
 /* flush all the translation blocks */
 static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
 {
+    int nb_tbs __attribute__((unused));
+
     tb_lock();
 
     /* If it is already been done on request of another CPU,
@@ -916,11 +909,12 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data 
tb_flush_count)
     }
 
 #if defined(DEBUG_TB_FLUSH)
+    nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree);
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
-           tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.tb_ctx.nb_tbs > 0 ?
+           nb_tbs, nb_tbs > 0 ?
            ((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)) /
-           tcg_ctx.tb_ctx.nb_tbs : 0);
+           nb_tbs : 0);
 #endif
     if ((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)
         > tcg_ctx.code_gen_buffer_size) {
@@ -935,7 +929,10 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data 
tb_flush_count)
         }
     }
 
-    tcg_ctx.tb_ctx.nb_tbs = 0;
+    /* Increment the refcount first so that destroy acts as a reset */
+    g_tree_ref(tcg_ctx.tb_ctx.tb_tree);
+    g_tree_destroy(tcg_ctx.tb_ctx.tb_tree);
+
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
@@ -1385,6 +1382,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
      * through the physical hash table and physical page list.
      */
     tb_link_page(tb, phys_pc, phys_page2);
+    g_tree_insert(tcg_ctx.tb_ctx.tb_tree, &tb->tc_ptr, tb);
     return tb;
 }
 
@@ -1653,37 +1651,14 @@ static bool tb_invalidate_phys_page(tb_page_addr_t 
addr, uintptr_t pc)
 }
 #endif
 
-/* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
-   tb[1].tc_ptr. Return NULL if not found */
+/*
+ * Find the TB 'tb' such that
+ * tb->tc_ptr <= tc_ptr < tb->tc_ptr + tb->out_size
+ * Return NULL if not found.
+ */
 static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
 {
-    int m_min, m_max, m;
-    uintptr_t v;
-    TranslationBlock *tb;
-
-    if (tcg_ctx.tb_ctx.nb_tbs <= 0) {
-        return NULL;
-    }
-    if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer ||
-        tc_ptr >= (uintptr_t)tcg_ctx.code_gen_ptr) {
-        return NULL;
-    }
-    /* binary search (cf Knuth) */
-    m_min = 0;
-    m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
-    while (m_min <= m_max) {
-        m = (m_min + m_max) >> 1;
-        tb = tcg_ctx.tb_ctx.tbs[m];
-        v = (uintptr_t)tb->tc_ptr;
-        if (v == tc_ptr) {
-            return tb;
-        } else if (tc_ptr < v) {
-            m_max = m - 1;
-        } else {
-            m_min = m + 1;
-        }
-    }
-    return tcg_ctx.tb_ctx.tbs[m_max];
+    return g_tree_lookup(tcg_ctx.tb_ctx.tb_tree, &tc_ptr);
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1866,63 +1841,67 @@ static void print_qht_statistics(FILE *f, 
fprintf_function cpu_fprintf,
     g_free(hgram);
 }
 
+struct tb_tree_stats {
+    int target_size;
+    int max_target_size;
+    int direct_jmp_count;
+    int direct_jmp2_count;
+    int cross_page;
+};
+
+static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
+{
+    const TranslationBlock *tb = value;
+    struct tb_tree_stats *tst = data;
+
+    tst->target_size += tb->size;
+    if (tb->size > tst->max_target_size) {
+        tst->max_target_size = tb->size;
+    }
+    if (tb->page_addr[1] != -1) {
+        tst->cross_page++;
+    }
+    if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
+        tst->direct_jmp_count++;
+        if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
+            tst->direct_jmp2_count++;
+        }
+    }
+    return false;
+}
+
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
 {
-    int i, target_code_size, max_target_code_size;
-    int direct_jmp_count, direct_jmp2_count, cross_page;
-    TranslationBlock *tb;
+    struct tb_tree_stats tst = {};
     struct qht_stats hst;
+    int nb_tbs;
 
     tb_lock();
 
-    target_code_size = 0;
-    max_target_code_size = 0;
-    cross_page = 0;
-    direct_jmp_count = 0;
-    direct_jmp2_count = 0;
-    for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
-        tb = tcg_ctx.tb_ctx.tbs[i];
-        target_code_size += tb->size;
-        if (tb->size > max_target_code_size) {
-            max_target_code_size = tb->size;
-        }
-        if (tb->page_addr[1] != -1) {
-            cross_page++;
-        }
-        if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
-            direct_jmp_count++;
-            if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
-                direct_jmp2_count++;
-            }
-        }
-    }
+    nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree);
+    g_tree_foreach(tcg_ctx.tb_ctx.tb_tree, tb_tree_stats_iter, &tst);
     /* XXX: avoid using doubles ? */
     cpu_fprintf(f, "Translation buffer state:\n");
     cpu_fprintf(f, "gen code size       %td/%zd\n",
                 tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer,
                 tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer);
-    cpu_fprintf(f, "TB count            %d\n", tcg_ctx.tb_ctx.nb_tbs);
+    cpu_fprintf(f, "TB count            %d\n", nb_tbs);
     cpu_fprintf(f, "TB avg target size  %d max=%d bytes\n",
-            tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
-                    tcg_ctx.tb_ctx.nb_tbs : 0,
-            max_target_code_size);
+                nb_tbs ? tst.target_size / nb_tbs : 0,
+                tst.max_target_size);
     cpu_fprintf(f, "TB avg host size    %td bytes (expansion ratio: %0.1f)\n",
-            tcg_ctx.tb_ctx.nb_tbs ? (tcg_ctx.code_gen_ptr -
-                                     tcg_ctx.code_gen_buffer) /
-                                     tcg_ctx.tb_ctx.nb_tbs : 0,
-                target_code_size ? (double) (tcg_ctx.code_gen_ptr -
-                                             tcg_ctx.code_gen_buffer) /
-                                             target_code_size : 0);
-    cpu_fprintf(f, "cross page TB count %d (%d%%)\n", cross_page,
-            tcg_ctx.tb_ctx.nb_tbs ? (cross_page * 100) /
-                                    tcg_ctx.tb_ctx.nb_tbs : 0);
+                nb_tbs ? (tcg_ctx.code_gen_ptr -
+                          tcg_ctx.code_gen_buffer) / nb_tbs : 0,
+                tst.target_size ? (double) (tcg_ctx.code_gen_ptr -
+                                            tcg_ctx.code_gen_buffer) /
+                                            tst.target_size : 0);
+    cpu_fprintf(f, "cross page TB count %d (%d%%)\n", tst.cross_page,
+            nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
     cpu_fprintf(f, "direct jump count   %d (%d%%) (2 jumps=%d %d%%)\n",
-                direct_jmp_count,
-                tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp_count * 100) /
-                        tcg_ctx.tb_ctx.nb_tbs : 0,
-                direct_jmp2_count,
-                tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
-                        tcg_ctx.tb_ctx.nb_tbs : 0);
+                tst.direct_jmp_count,
+                nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
+                tst.direct_jmp2_count,
+                nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
 
     qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst);
     print_qht_statistics(f, cpu_fprintf, hst);
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]