[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC 3/7] translate-all: use a binary search tree to track TBs in TBContext
From: Emilio G. Cota
Subject: [Qemu-devel] [RFC 3/7] translate-all: use a binary search tree to track TBs in TBContext
Date: Thu, 29 Jun 2017 16:28:25 -0400
This is a prerequisite for having threads generate code on separate
buffers, which will help scalability when booting multiple cores
under MTTCG.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which results
in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
10289.389753 task-clock (msec) # 0.952 CPUs utilized ( +- 0.13% )
18,238 context-switches # 0.002 M/sec ( +- 0.73% )
0 cpu-migrations # 0.000 K/sec
86,555 page-faults # 0.008 M/sec ( +- 0.49% )
45,079,926,395 cycles # 4.381 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
84,582,463,603 instructions # 1.88 insns per cycle ( +- 0.26% )
14,964,335,400 branches # 1454.346 M/sec ( +- 0.29% )
288,324,215 branch-misses # 1.93% of all branches ( +- 0.34% )
10.813687279 seconds time elapsed ( +- 0.42% )
- After:
10333.181473 task-clock (msec) # 0.944 CPUs utilized ( +- 0.27% )
18,167 context-switches # 0.002 M/sec ( +- 0.20% )
0 cpu-migrations # 0.000 K/sec
83,354 page-faults # 0.008 M/sec ( +- 0.92% )
45,247,697,926 cycles # 4.379 GHz ( +- 0.23% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
84,537,657,945 instructions # 1.87 insns per cycle ( +- 0.18% )
14,988,568,500 branches # 1450.528 M/sec ( +- 0.21% )
294,765,097 branch-misses # 1.97% of all branches ( +- 0.43% )
10.946641611 seconds time elapsed ( +- 0.79% )
Signed-off-by: Emilio G. Cota <address@hidden>
---
include/exec/tb-context.h | 4 +-
accel/tcg/translate-all.c | 181 ++++++++++++++++++++--------------------------
2 files changed, 81 insertions(+), 104 deletions(-)
diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h
index 25c2afe..1fa8dcc 100644
--- a/include/exec/tb-context.h
+++ b/include/exec/tb-context.h
@@ -31,10 +31,8 @@ typedef struct TBContext TBContext;
struct TBContext {
- TranslationBlock **tbs;
+ GTree *tb_tree;
struct qht htable;
- size_t tbs_size;
- int nb_tbs;
/* any access to the tbs or the page table must use this lock */
QemuMutex tb_lock;
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index da91482..a18fbf7 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -770,6 +770,21 @@ static inline void *alloc_code_gen_buffer(void)
}
#endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
+/* @key is already in the tree so it's safe to use container_of on it */
+static gint tc_ptr_cmp(gconstpointer candidate, gconstpointer key)
+{
+ uintptr_t a = *(uintptr_t *)candidate;
+ const TranslationBlock *tb = container_of(key, TranslationBlock, tc_ptr);
+ uintptr_t b = (uintptr_t)tb->tc_ptr;
+
+ if (a >= b + tb->out_size) {
+ return 1;
+ } else if (a < b) {
+ return -1;
+ }
+ return 0;
+}
+
static inline void code_gen_alloc(size_t tb_size)
{
tcg_ctx.code_gen_buffer_size = size_code_gen_buffer(tb_size);
@@ -778,15 +793,7 @@ static inline void code_gen_alloc(size_t tb_size)
fprintf(stderr, "Could not allocate dynamic translator buffer\n");
exit(1);
}
-
- /* size this conservatively -- realloc later if needed */
- tcg_ctx.tb_ctx.tbs_size =
- tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
- if (unlikely(!tcg_ctx.tb_ctx.tbs_size)) {
- tcg_ctx.tb_ctx.tbs_size = 64 * 1024;
- }
- tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx.tb_ctx.tbs_size);
-
+ tcg_ctx.tb_ctx.tb_tree = g_tree_new(tc_ptr_cmp);
qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
}
@@ -827,7 +834,6 @@ bool tcg_enabled(void)
static TranslationBlock *tb_alloc(target_ulong pc)
{
TranslationBlock *tb;
- TBContext *ctx;
assert_tb_locked();
@@ -835,12 +841,6 @@ static TranslationBlock *tb_alloc(target_ulong pc)
if (unlikely(tb == NULL)) {
return NULL;
}
- ctx = &tcg_ctx.tb_ctx;
- if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
- ctx->tbs_size *= 2;
- ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
- }
- ctx->tbs[ctx->nb_tbs++] = tb;
return tb;
}
@@ -849,16 +849,7 @@ void tb_free(TranslationBlock *tb)
{
assert_tb_locked();
- /* In practice this is mostly used for single use temporary TB
- Ignore the hard cases and just back up if this TB happens to
- be the last one generated. */
- if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
- tb == tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
- size_t struct_size = ROUND_UP(sizeof(*tb), qemu_icache_linesize);
-
- tcg_ctx.code_gen_ptr = tb->tc_ptr - struct_size;
- tcg_ctx.tb_ctx.nb_tbs--;
- }
+ g_tree_remove(tcg_ctx.tb_ctx.tb_tree, &tb->tc_ptr);
}
static inline void invalidate_page_bitmap(PageDesc *p)
@@ -906,6 +897,8 @@ static void page_flush_tb(void)
/* flush all the translation blocks */
static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
{
+ int nb_tbs __attribute__((unused));
+
tb_lock();
/* If it is already been done on request of another CPU,
@@ -916,11 +909,12 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
}
#if defined(DEBUG_TB_FLUSH)
+ nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree);
printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
(unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
- tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.tb_ctx.nb_tbs > 0 ?
+ nb_tbs, nb_tbs > 0 ?
((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)) /
- tcg_ctx.tb_ctx.nb_tbs : 0);
+ nb_tbs : 0);
#endif
if ((unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer)
> tcg_ctx.code_gen_buffer_size) {
@@ -935,7 +929,10 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
}
}
- tcg_ctx.tb_ctx.nb_tbs = 0;
+ /* Increment the refcount first so that destroy acts as a reset */
+ g_tree_ref(tcg_ctx.tb_ctx.tb_tree);
+ g_tree_destroy(tcg_ctx.tb_ctx.tb_tree);
+
qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
@@ -1385,6 +1382,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
* through the physical hash table and physical page list.
*/
tb_link_page(tb, phys_pc, phys_page2);
+ g_tree_insert(tcg_ctx.tb_ctx.tb_tree, &tb->tc_ptr, tb);
return tb;
}
@@ -1653,37 +1651,14 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
}
#endif
-/* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
- tb[1].tc_ptr. Return NULL if not found */
+/*
+ * Find the TB 'tb' such that
+ * tb->tc_ptr <= tc_ptr < tb->tc_ptr + tb->out_size
+ * Return NULL if not found.
+ */
static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
{
- int m_min, m_max, m;
- uintptr_t v;
- TranslationBlock *tb;
-
- if (tcg_ctx.tb_ctx.nb_tbs <= 0) {
- return NULL;
- }
- if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer ||
- tc_ptr >= (uintptr_t)tcg_ctx.code_gen_ptr) {
- return NULL;
- }
- /* binary search (cf Knuth) */
- m_min = 0;
- m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
- while (m_min <= m_max) {
- m = (m_min + m_max) >> 1;
- tb = tcg_ctx.tb_ctx.tbs[m];
- v = (uintptr_t)tb->tc_ptr;
- if (v == tc_ptr) {
- return tb;
- } else if (tc_ptr < v) {
- m_max = m - 1;
- } else {
- m_min = m + 1;
- }
- }
- return tcg_ctx.tb_ctx.tbs[m_max];
+ return g_tree_lookup(tcg_ctx.tb_ctx.tb_tree, &tc_ptr);
}
#if !defined(CONFIG_USER_ONLY)
@@ -1866,63 +1841,67 @@ static void print_qht_statistics(FILE *f, fprintf_function cpu_fprintf,
g_free(hgram);
}
+struct tb_tree_stats {
+ int target_size;
+ int max_target_size;
+ int direct_jmp_count;
+ int direct_jmp2_count;
+ int cross_page;
+};
+
+static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
+{
+ const TranslationBlock *tb = value;
+ struct tb_tree_stats *tst = data;
+
+ tst->target_size += tb->size;
+ if (tb->size > tst->max_target_size) {
+ tst->max_target_size = tb->size;
+ }
+ if (tb->page_addr[1] != -1) {
+ tst->cross_page++;
+ }
+ if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
+ tst->direct_jmp_count++;
+ if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
+ tst->direct_jmp2_count++;
+ }
+ }
+ return false;
+}
+
void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
{
- int i, target_code_size, max_target_code_size;
- int direct_jmp_count, direct_jmp2_count, cross_page;
- TranslationBlock *tb;
+ struct tb_tree_stats tst = {};
struct qht_stats hst;
+ int nb_tbs;
tb_lock();
- target_code_size = 0;
- max_target_code_size = 0;
- cross_page = 0;
- direct_jmp_count = 0;
- direct_jmp2_count = 0;
- for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
- tb = tcg_ctx.tb_ctx.tbs[i];
- target_code_size += tb->size;
- if (tb->size > max_target_code_size) {
- max_target_code_size = tb->size;
- }
- if (tb->page_addr[1] != -1) {
- cross_page++;
- }
- if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
- direct_jmp_count++;
- if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
- direct_jmp2_count++;
- }
- }
- }
+ nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree);
+ g_tree_foreach(tcg_ctx.tb_ctx.tb_tree, tb_tree_stats_iter, &tst);
/* XXX: avoid using doubles ? */
cpu_fprintf(f, "Translation buffer state:\n");
cpu_fprintf(f, "gen code size %td/%zd\n",
tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer,
tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer);
- cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs);
+ cpu_fprintf(f, "TB count %d\n", nb_tbs);
cpu_fprintf(f, "TB avg target size %d max=%d bytes\n",
- tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
- tcg_ctx.tb_ctx.nb_tbs : 0,
- max_target_code_size);
+ nb_tbs ? tst.target_size / nb_tbs : 0,
+ tst.max_target_size);
cpu_fprintf(f, "TB avg host size %td bytes (expansion ratio: %0.1f)\n",
- tcg_ctx.tb_ctx.nb_tbs ? (tcg_ctx.code_gen_ptr -
- tcg_ctx.code_gen_buffer) /
- tcg_ctx.tb_ctx.nb_tbs : 0,
- target_code_size ? (double) (tcg_ctx.code_gen_ptr -
- tcg_ctx.code_gen_buffer) /
- target_code_size : 0);
- cpu_fprintf(f, "cross page TB count %d (%d%%)\n", cross_page,
- tcg_ctx.tb_ctx.nb_tbs ? (cross_page * 100) /
- tcg_ctx.tb_ctx.nb_tbs : 0);
+ nb_tbs ? (tcg_ctx.code_gen_ptr -
+ tcg_ctx.code_gen_buffer) / nb_tbs : 0,
+ tst.target_size ? (double) (tcg_ctx.code_gen_ptr -
+ tcg_ctx.code_gen_buffer) /
+ tst.target_size : 0);
+ cpu_fprintf(f, "cross page TB count %d (%d%%)\n", tst.cross_page,
+ nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
cpu_fprintf(f, "direct jump count %d (%d%%) (2 jumps=%d %d%%)\n",
- direct_jmp_count,
- tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp_count * 100) /
- tcg_ctx.tb_ctx.nb_tbs : 0,
- direct_jmp2_count,
- tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
- tcg_ctx.tb_ctx.nb_tbs : 0);
+ tst.direct_jmp_count,
+ nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
+ tst.direct_jmp2_count,
+ nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst);
print_qht_statistics(f, cpu_fprintf, hst);
--
2.7.4